From 7051e794ceb6399429ab1b961a13e6876ea93943 Mon Sep 17 00:00:00 2001
From: Tom Stellard <tstellar@redhat.com>
Date: Wed, 27 Jan 2021 15:21:13 -0800
Subject: [PATCH 001/318] Drop the 'git' suffix from various version variables

---
 libcxx/CMakeLists.txt    | 2 +-
 libcxxabi/CMakeLists.txt | 2 +-
 libunwind/CMakeLists.txt | 2 +-
 llvm/CMakeLists.txt      | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/libcxx/CMakeLists.txt b/libcxx/CMakeLists.txt
index 4e7e8f978546..9bf1a02f0908 100644
--- a/libcxx/CMakeLists.txt
+++ b/libcxx/CMakeLists.txt
@@ -29,7 +29,7 @@ if (CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR OR LIBCXX_STANDALONE_BUIL
   project(libcxx CXX C)
 
   set(PACKAGE_NAME libcxx)
-  set(PACKAGE_VERSION 12.0.0git)
+  set(PACKAGE_VERSION 12.0.0)
   set(PACKAGE_STRING "${PACKAGE_NAME} ${PACKAGE_VERSION}")
   set(PACKAGE_BUGREPORT "llvm-bugs@lists.llvm.org")
 
diff --git a/libcxxabi/CMakeLists.txt b/libcxxabi/CMakeLists.txt
index b803347c2a8e..426c855288fc 100644
--- a/libcxxabi/CMakeLists.txt
+++ b/libcxxabi/CMakeLists.txt
@@ -28,7 +28,7 @@ if (CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR OR LIBCXXABI_STANDALONE_B
   project(libcxxabi CXX C)
 
   set(PACKAGE_NAME libcxxabi)
-  set(PACKAGE_VERSION 11.0.0git)
+  set(PACKAGE_VERSION 11.0.0)
   set(PACKAGE_STRING "${PACKAGE_NAME} ${PACKAGE_VERSION}")
   set(PACKAGE_BUGREPORT "llvm-bugs@lists.llvm.org")
 
diff --git a/libunwind/CMakeLists.txt b/libunwind/CMakeLists.txt
index 8ae32fbccf4e..48cb8e004e08 100644
--- a/libunwind/CMakeLists.txt
+++ b/libunwind/CMakeLists.txt
@@ -24,7 +24,7 @@ if (CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR OR LIBUNWIND_STANDALONE_B
   project(libunwind LANGUAGES C CXX ASM)
 
   set(PACKAGE_NAME libunwind)
-  set(PACKAGE_VERSION 12.0.0git)
+  set(PACKAGE_VERSION 12.0.0)
   set(PACKAGE_STRING "${PACKAGE_NAME} ${PACKAGE_VERSION}")
   set(PACKAGE_BUGREPORT "llvm-bugs@lists.llvm.org")
 
diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt
index 454ec561af9a..277d0fe54d7b 100644
--- a/llvm/CMakeLists.txt
+++ b/llvm/CMakeLists.txt
@@ -14,7 +14,7 @@ if(NOT DEFINED LLVM_VERSION_PATCH)
   set(LLVM_VERSION_PATCH 0)
 endif()
 if(NOT DEFINED LLVM_VERSION_SUFFIX)
-  set(LLVM_VERSION_SUFFIX git)
+  set(LLVM_VERSION_SUFFIX "")
 endif()
 
 if (NOT PACKAGE_VERSION)

From f2a45d31b9c11f2b3e12f161391fe845025b5177 Mon Sep 17 00:00:00 2001
From: Tom Stellard <tstellar@redhat.com>
Date: Wed, 27 Jan 2021 15:17:48 -0800
Subject: [PATCH 002/318] Import workflows from release/11.x branch

---
 .github/workflows/clang-tests.yml  |  43 +++++++++++
 .github/workflows/libclc-tests.yml |  53 +++++++++++++
 .github/workflows/lld-tests.yml    |  43 +++++++++++
 .github/workflows/lldb-tests.yml   |  48 ++++++++++++
 .github/workflows/llvm-tests.yml   | 116 +++++++++++++++++++++++++++++
 5 files changed, 303 insertions(+)
 create mode 100644 .github/workflows/clang-tests.yml
 create mode 100644 .github/workflows/libclc-tests.yml
 create mode 100644 .github/workflows/lld-tests.yml
 create mode 100644 .github/workflows/lldb-tests.yml
 create mode 100644 .github/workflows/llvm-tests.yml

diff --git a/.github/workflows/clang-tests.yml b/.github/workflows/clang-tests.yml
new file mode 100644
index 000000000000..f8ca65e10726
--- /dev/null
+++ b/.github/workflows/clang-tests.yml
@@ -0,0 +1,43 @@
+name: Clang Tests
+
+on:
+  push:
+    branches:
+      - 'release/**'
+    paths:
+      - 'clang/**'
+      - 'llvm/**'
+      - '.github/workflows/clang-tests.yml'
+  pull_request:
+    paths:
+      - 'clang/**'
+      - 'llvm/**'
+      - '.github/workflows/clang-tests.yml'
+
+jobs:
+  build_clang:
+    name: clang check-all
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        os:
+          - ubuntu-latest
+          - windows-latest
+          - macOS-latest
+    steps:
+    - name: Setup Windows
+      if: startsWith(matrix.os, 'windows')
+      uses: llvm/actions/setup-windows@master
+      with:
+        arch: amd64
+    - name: Install Ninja
+      uses: llvm/actions/install-ninja@master
+    - uses: actions/checkout@v1
+      with:
+        fetch-depth: 1
+    - name: Test clang
+      uses: llvm/actions/build-test-llvm-project@master
+      with:
+        cmake_args: -G Ninja  -DLLVM_ENABLE_PROJECTS="clang" -DCMAKE_BUILD_TYPE=Release
+        build_target: check-clang
diff --git a/.github/workflows/libclc-tests.yml b/.github/workflows/libclc-tests.yml
new file mode 100644
index 000000000000..4e8639b1c89a
--- /dev/null
+++ b/.github/workflows/libclc-tests.yml
@@ -0,0 +1,53 @@
+name: libclc Tests
+
+on:
+  push:
+    branches:
+      - 'release/**'
+    paths:
+      - 'clang/**'
+      - 'llvm/**'
+      - 'libclc/**'
+      - '.github/workflows/libclc-tests.yml'
+  pull_request:
+    paths:
+      - 'clang/**'
+      - 'llvm/**'
+      - 'libclc/**'
+      - '.github/workflows/libclc-tests.yml'
+
+jobs:
+  build_libclc:
+    name: libclc test
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        os:
+          - ubuntu-latest
+          # Disable build on windows, because I can't figure out where llvm-config is.
+          #- windows-latest
+          - macOS-latest
+    steps:
+    - name: Setup Windows
+      if: startsWith(matrix.os, 'windows')
+      uses: llvm/actions/setup-windows@master
+      with:
+        arch: amd64
+    - name: Install Ninja
+      uses: llvm/actions/install-ninja@master
+    - uses: actions/checkout@v1
+      with:
+        fetch-depth: 1
+    - name: Build clang
+      uses: llvm/actions/build-test-llvm-project@master
+      with:
+        cmake_args: -G Ninja  -DLLVM_ENABLE_PROJECTS="clang" -DCMAKE_BUILD_TYPE=Release
+        build_target: ""
+    - name: Build and test libclc
+      run: |
+        mkdir libclc-build
+        cd libclc-build
+        cmake -G Ninja ../libclc -DLLVM_CONFIG=../build/bin/llvm-config
+        ninja
+        ninja test
diff --git a/.github/workflows/lld-tests.yml b/.github/workflows/lld-tests.yml
new file mode 100644
index 000000000000..9b4cbe95f231
--- /dev/null
+++ b/.github/workflows/lld-tests.yml
@@ -0,0 +1,43 @@
+name: LLD Tests
+
+on:
+  push:
+    branches:
+      - 'release/**'
+    paths:
+      - 'lld/**'
+      - 'llvm/**'
+      - '.github/workflows/lld-tests.yml'
+  pull_request:
+    paths:
+      - 'lld/**'
+      - 'llvm/**'
+      - '.github/workflows/lld-tests.yml'
+
+jobs:
+  build_lld:
+    name: lld check-all
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        os:
+          - ubuntu-latest
+          - windows-latest
+          - macOS-latest
+    steps:
+    - name: Setup Windows
+      if: startsWith(matrix.os, 'windows')
+      uses: llvm/actions/setup-windows@master
+      with:
+        arch: amd64
+    - name: Install Ninja
+      uses: llvm/actions/install-ninja@master
+    - uses: actions/checkout@v1
+      with:
+        fetch-depth: 1
+    - name: Test lld
+      uses: llvm/actions/build-test-llvm-project@master
+      with:
+        cmake_args: -G Ninja  -DLLVM_ENABLE_PROJECTS="lld" -DCMAKE_BUILD_TYPE=Release
+        build_target: check-lld
diff --git a/.github/workflows/lldb-tests.yml b/.github/workflows/lldb-tests.yml
new file mode 100644
index 000000000000..229e6deece6e
--- /dev/null
+++ b/.github/workflows/lldb-tests.yml
@@ -0,0 +1,48 @@
+name: lldb Tests
+
+on:
+  push:
+    branches:
+      - 'release/**'
+    paths:
+      - 'clang/**'
+      - 'llvm/**'
+      - 'lldb/**'
+      - '.github/workflows/lldb-tests.yml'
+  pull_request:
+    paths:
+      - 'clang/**'
+      - 'llvm/**'
+      - 'lldb/**'
+      - '.github/workflows/lldb-tests.yml'
+
+jobs:
+  build_lldb:
+    name: lldb build
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        os:
+          - ubuntu-latest
+          - windows-latest
+          # macOS build disabled due to: llvm.org/PR46190
+          #- macOS-latest
+    steps:
+    - name: Setup Windows
+      if: startsWith(matrix.os, 'windows')
+      uses: llvm/actions/setup-windows@master
+      with:
+        arch: amd64
+    - name: Install Ninja
+      uses: llvm/actions/install-ninja@master
+    - uses: actions/checkout@v1
+      with:
+        fetch-depth: 1
+    - name: Build lldb
+      uses: llvm/actions/build-test-llvm-project@master
+      with:
+        # Mac OS requries that libcxx is enabled for lldb tests, so we need  to disable them.
+        cmake_args: -G Ninja  -DLLVM_ENABLE_PROJECTS="clang;lldb" -DCMAKE_BUILD_TYPE=Release -DLLDB_INCLUDE_TESTS=OFF
+        # check-lldb is not consistent, so we only build lldb.
+        build_target: ""
diff --git a/.github/workflows/llvm-tests.yml b/.github/workflows/llvm-tests.yml
new file mode 100644
index 000000000000..67f318ad849f
--- /dev/null
+++ b/.github/workflows/llvm-tests.yml
@@ -0,0 +1,116 @@
+name: LLVM Tests
+
+env:
+  release_major: 12
+
+on:
+  push:
+    branches:
+      - 'release/**'
+    paths:
+      - 'llvm/**'
+      - '.github/workflows/llvm-tests.yml'
+  pull_request:
+    paths:
+      - 'llvm/**'
+      - '.github/workflows/llvm-tests.yml'
+
+jobs:
+  build_llvm:
+    name: llvm check-all
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        os:
+          - ubuntu-latest
+          - windows-latest
+          - macOS-latest
+    steps:
+    - name: Setup Windows
+      if: startsWith(matrix.os, 'windows')
+      uses: llvm/actions/setup-windows@master
+      with:
+        arch: amd64
+    - name: Install Ninja
+      uses: llvm/actions/install-ninja@master
+    - uses: actions/checkout@v1
+      with:
+        fetch-depth: 1
+    - name: Test llvm
+      uses: llvm/actions/build-test-llvm-project@master
+      with:
+        cmake_args: -G Ninja -DCMAKE_BUILD_TYPE=Release
+
+  abi-dump:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        name:
+          - build-baseline
+          - build-latest
+        include:
+          - name: build-baseline
+            # FIXME: Referencing the env context does not work here
+            # ref: llvmorg-${{ env.release_major }}.0.0
+            ref: llvmorg-12.0.0
+            repo: llvm/llvm-project
+          - name: build-latest
+            ref: ${{ github.sha }}
+            repo: ${{ github.repository }}
+    steps:
+    - name: Install Ninja
+      uses: llvm/actions/install-ninja@master
+    - name: Install abi-compliance-checker
+      run: |
+        sudo apt-get install abi-dumper autoconf pkg-config
+    - name: Install universal-ctags
+      run: |
+        git clone https://github.com/universal-ctags/ctags.git
+        cd ctags
+        ./autogen.sh
+        ./configure
+        sudo make install
+    - name: Download source code
+      uses: llvm/actions/get-llvm-project-src@master
+      with:
+        ref: ${{ matrix.ref }}
+        repo: ${{ matrix.repo }}
+    - name: Configure
+      run: |
+        mkdir build
+        cd build
+        cmake -G Ninja -DCMAKE_BUILD_TYPE=Debug -DLLVM_TARGETS_TO_BUILD="" -DLLVM_BUILD_LLVM_DYLIB=ON -DCMAKE_C_FLAGS_DEBUG="-g -Og" -DCMAKE_CXX_FLAGS_DEBUG="-g -Og" ../llvm
+    - name: Build
+      run: ninja -C build libLLVM-${{ env.release_major }}.so
+    - name: Dump ABI
+      run: abi-dumper -lver ${{ matrix.ref }} -skip-cxx -public-headers llvm/include -o ${{ matrix.ref }}.abi.tar.gz build/lib/libLLVM-${{ env.release_major }}.so
+    - name: Upload ABI file
+      uses: actions/upload-artifact@v1
+      with:
+        name: ${{ matrix.name }}
+        path: ${{ matrix.ref }}.abi.tar.gz
+
+  abi-compare:
+    runs-on: ubuntu-latest
+    needs:
+      - abi-dump
+    steps:
+      - name: Download baseline
+        uses: actions/download-artifact@v1
+        with:
+          name: build-baseline
+      - name: Download latest
+        uses: actions/download-artifact@v1
+        with:
+          name: build-latest
+      - name: Install abi-compliance-checker
+        run: sudo apt-get install abi-compliance-checker
+      - name: Compare ABI
+        run: abi-compliance-checker -l libLLVM-${{ env.release_major}}.so -old build-baseline/*.tar.gz -new build-latest/*.tar.gz
+      - name: Upload ABI Comparison
+        if: always()
+        uses: actions/upload-artifact@v1
+        with:
+          name: compat-report-${{ github.sha }}
+          path: compat_reports/

From d64226e8fab8fc7b4d947223c61036a60eb6a871 Mon Sep 17 00:00:00 2001
From: Sam McCall <sam.mccall@gmail.com>
Date: Wed, 27 Jan 2021 15:32:05 +0100
Subject: [PATCH 003/318] [clangd] Work around GCC bug 66735

(cherry picked from commit 12de8e1399fecf691639ba430b3824acb1311e70)
---
 clang-tools-extra/clangd/ParsedAST.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/clang-tools-extra/clangd/ParsedAST.cpp b/clang-tools-extra/clangd/ParsedAST.cpp
index 403d3fe3e64f..1020282f5ee8 100644
--- a/clang-tools-extra/clangd/ParsedAST.cpp
+++ b/clang-tools-extra/clangd/ParsedAST.cpp
@@ -316,8 +316,8 @@ ParsedAST::build(llvm::StringRef Filename, const ParseInputs &Inputs,
       Check->registerMatchers(&CTFinder);
     }
 
-    ASTDiags.setLevelAdjuster([&, &Cfg(Config::current())](
-                                  DiagnosticsEngine::Level DiagLevel,
+    const Config& Cfg = Config::current();
+    ASTDiags.setLevelAdjuster([&](DiagnosticsEngine::Level DiagLevel,
                                   const clang::Diagnostic &Info) {
       if (Cfg.Diagnostics.SuppressAll ||
           isBuiltinDiagnosticSuppressed(Info.getID(), Cfg.Diagnostics.Suppress))

From ea99c885a63de9af673a5e5cd51f44fb70c83c1b Mon Sep 17 00:00:00 2001
From: Richard Smith <richard@metafoo.co.uk>
Date: Wed, 27 Jan 2021 12:24:30 -0800
Subject: [PATCH 004/318] Permit __VA_OPT__ in all language modes and allow it
 to be detected with #ifdef.

These changes are intended to give code a path to move away from the GNU
,##__VA_ARGS__ extension, which is non-conforming in some situations and
which we'd like to disable in our conforming mode in those cases.

(cherry picked from commit 0436ec2128c9775ba13b0308937238fc79673fdd)
---
 clang/include/clang/Lex/Preprocessor.h        | 19 ++++++++++++
 .../include/clang/Lex/VariadicMacroSupport.h  | 10 ++----
 clang/lib/Lex/PPDirectives.cpp                |  5 +++
 clang/lib/Lex/PPExpressions.cpp               |  5 +++
 clang/lib/Lex/PPMacroExpansion.cpp            |  6 +++-
 clang/lib/Lex/Preprocessor.cpp                | 19 +++++-------
 clang/test/Preprocessor/macro_vaopt_check.cpp | 31 ++++++++++++++++++-
 .../test/Preprocessor/macro_vaopt_expand.cpp  |  4 ++-
 8 files changed, 78 insertions(+), 21 deletions(-)

diff --git a/clang/include/clang/Lex/Preprocessor.h b/clang/include/clang/Lex/Preprocessor.h
index 68139cb24b31..ba8bdaa23c4c 100644
--- a/clang/include/clang/Lex/Preprocessor.h
+++ b/clang/include/clang/Lex/Preprocessor.h
@@ -447,6 +447,25 @@ class Preprocessor {
           ElseLoc(ElseLoc) {}
   };
 
+  class IfdefMacroNameScopeRAII {
+    Preprocessor &PP;
+    bool VAOPTWasPoisoned;
+
+  public:
+    IfdefMacroNameScopeRAII(Preprocessor &PP)
+        : PP(PP), VAOPTWasPoisoned(PP.Ident__VA_OPT__->isPoisoned()) {
+      PP.Ident__VA_OPT__->setIsPoisoned(false);
+    }
+    IfdefMacroNameScopeRAII(const IfdefMacroNameScopeRAII&) = delete;
+    IfdefMacroNameScopeRAII &operator=(const IfdefMacroNameScopeRAII&) = delete;
+    ~IfdefMacroNameScopeRAII() { Exit(); }
+
+    void Exit() {
+      if (VAOPTWasPoisoned)
+        PP.Ident__VA_OPT__->setIsPoisoned(true);
+    }
+  };
+
 private:
   friend class ASTReader;
   friend class MacroArgs;
diff --git a/clang/include/clang/Lex/VariadicMacroSupport.h b/clang/include/clang/Lex/VariadicMacroSupport.h
index 989e0ac703c9..119f02201fc6 100644
--- a/clang/include/clang/Lex/VariadicMacroSupport.h
+++ b/clang/include/clang/Lex/VariadicMacroSupport.h
@@ -39,17 +39,14 @@ namespace clang {
       assert(Ident__VA_ARGS__->isPoisoned() && "__VA_ARGS__ should be poisoned "
                                               "outside an ISO C/C++ variadic "
                                               "macro definition!");
-      assert(
-          !Ident__VA_OPT__ ||
-          (Ident__VA_OPT__->isPoisoned() && "__VA_OPT__ should be poisoned!"));
+      assert(Ident__VA_OPT__->isPoisoned() && "__VA_OPT__ should be poisoned!");
     }
 
     /// Client code should call this function just before the Preprocessor is
     /// about to Lex tokens from the definition of a variadic (ISO C/C++) macro.
     void enterScope() {
       Ident__VA_ARGS__->setIsPoisoned(false);
-      if (Ident__VA_OPT__)
-        Ident__VA_OPT__->setIsPoisoned(false);
+      Ident__VA_OPT__->setIsPoisoned(false);
     }
 
     /// Client code should call this function as soon as the Preprocessor has
@@ -58,8 +55,7 @@ namespace clang {
     /// (might be explicitly called, and then reinvoked via the destructor).
     void exitScope() {
       Ident__VA_ARGS__->setIsPoisoned(true);
-      if (Ident__VA_OPT__)
-        Ident__VA_OPT__->setIsPoisoned(true);
+      Ident__VA_OPT__->setIsPoisoned(true);
     }
 
     ~VariadicMacroScopeGuard() { exitScope(); }
diff --git a/clang/lib/Lex/PPDirectives.cpp b/clang/lib/Lex/PPDirectives.cpp
index d6b03d85913d..e2aa93455ea5 100644
--- a/clang/lib/Lex/PPDirectives.cpp
+++ b/clang/lib/Lex/PPDirectives.cpp
@@ -2928,9 +2928,14 @@ void Preprocessor::HandleIfdefDirective(Token &Result,
   ++NumIf;
   Token DirectiveTok = Result;
 
+  // __VA_OPT__ is allowed as the operand of #if[n]def.
+  IfdefMacroNameScopeRAII IfdefMacroNameScope(*this);
+
   Token MacroNameTok;
   ReadMacroName(MacroNameTok);
 
+  IfdefMacroNameScope.Exit();
+
   // Error reading macro name?  If so, diagnostic already issued.
   if (MacroNameTok.is(tok::eod)) {
     // Skip code until we get to #endif.  This helps with recovery by not
diff --git a/clang/lib/Lex/PPExpressions.cpp b/clang/lib/Lex/PPExpressions.cpp
index 8c120c13d7d2..952fb8f121dc 100644
--- a/clang/lib/Lex/PPExpressions.cpp
+++ b/clang/lib/Lex/PPExpressions.cpp
@@ -104,6 +104,9 @@ static bool EvaluateDefined(PPValue &Result, Token &PeekTok, DefinedTracker &DT,
   SourceLocation beginLoc(PeekTok.getLocation());
   Result.setBegin(beginLoc);
 
+  // __VA_OPT__ is allowed as the operand of 'defined'.
+  Preprocessor::IfdefMacroNameScopeRAII IfdefMacroNameScope(PP);
+
   // Get the next token, don't expand it.
   PP.LexUnexpandedNonComment(PeekTok);
 
@@ -122,6 +125,8 @@ static bool EvaluateDefined(PPValue &Result, Token &PeekTok, DefinedTracker &DT,
     PP.LexUnexpandedNonComment(PeekTok);
   }
 
+  IfdefMacroNameScope.Exit();
+
   // If we don't have a pp-identifier now, this is an error.
   if (PP.CheckMacroName(PeekTok, MU_Other))
     return true;
diff --git a/clang/lib/Lex/PPMacroExpansion.cpp b/clang/lib/Lex/PPMacroExpansion.cpp
index 43d31d6c5732..f6ca04defeb9 100644
--- a/clang/lib/Lex/PPMacroExpansion.cpp
+++ b/clang/lib/Lex/PPMacroExpansion.cpp
@@ -323,13 +323,16 @@ void Preprocessor::dumpMacroInfo(const IdentifierInfo *II) {
 
 /// RegisterBuiltinMacro - Register the specified identifier in the identifier
 /// table and mark it as a builtin macro to be expanded.
-static IdentifierInfo *RegisterBuiltinMacro(Preprocessor &PP, const char *Name){
+static IdentifierInfo *RegisterBuiltinMacro(Preprocessor &PP, const char *Name,
+                                            bool Disabled = false) {
   // Get the identifier.
   IdentifierInfo *Id = PP.getIdentifierInfo(Name);
 
   // Mark it as being a macro that is builtin.
   MacroInfo *MI = PP.AllocateMacroInfo(SourceLocation());
   MI->setIsBuiltinMacro();
+  if (Disabled)
+    MI->DisableMacro();
   PP.appendDefMacroDirective(Id, MI);
   return Id;
 }
@@ -343,6 +346,7 @@ void Preprocessor::RegisterBuiltinMacros() {
   Ident__TIME__ = RegisterBuiltinMacro(*this, "__TIME__");
   Ident__COUNTER__ = RegisterBuiltinMacro(*this, "__COUNTER__");
   Ident_Pragma  = RegisterBuiltinMacro(*this, "_Pragma");
+  Ident__VA_OPT__ = RegisterBuiltinMacro(*this, "__VA_OPT__", true);
 
   // C++ Standing Document Extensions.
   if (getLangOpts().CPlusPlus)
diff --git a/clang/lib/Lex/Preprocessor.cpp b/clang/lib/Lex/Preprocessor.cpp
index 94f1ce91f884..9baba204b324 100644
--- a/clang/lib/Lex/Preprocessor.cpp
+++ b/clang/lib/Lex/Preprocessor.cpp
@@ -115,23 +115,20 @@ Preprocessor::Preprocessor(std::shared_ptr<PreprocessorOptions> PPOpts,
 
   BuiltinInfo = std::make_unique<Builtin::Context>();
 
-  // "Poison" __VA_ARGS__, __VA_OPT__ which can only appear in the expansion of
-  // a macro. They get unpoisoned where it is allowed.
-  (Ident__VA_ARGS__ = getIdentifierInfo("__VA_ARGS__"))->setIsPoisoned();
-  SetPoisonReason(Ident__VA_ARGS__,diag::ext_pp_bad_vaargs_use);
-  if (getLangOpts().CPlusPlus20) {
-    (Ident__VA_OPT__ = getIdentifierInfo("__VA_OPT__"))->setIsPoisoned();
-    SetPoisonReason(Ident__VA_OPT__,diag::ext_pp_bad_vaopt_use);
-  } else {
-    Ident__VA_OPT__ = nullptr;
-  }
-
   // Initialize the pragma handlers.
   RegisterBuiltinPragmas();
 
   // Initialize builtin macros like __LINE__ and friends.
   RegisterBuiltinMacros();
 
+  // "Poison" __VA_ARGS__, __VA_OPT__ which can only appear in the expansion of
+  // a macro. They get unpoisoned where it is allowed. Note that we model
+  // __VA_OPT__ as a builtin macro to allow #ifdef and friends to detect it.
+  (Ident__VA_ARGS__ = getIdentifierInfo("__VA_ARGS__"))->setIsPoisoned();
+  SetPoisonReason(Ident__VA_ARGS__, diag::ext_pp_bad_vaargs_use);
+  Ident__VA_OPT__->setIsPoisoned();
+  SetPoisonReason(Ident__VA_OPT__, diag::ext_pp_bad_vaopt_use);
+
   if(LangOpts.Borland) {
     Ident__exception_info        = getIdentifierInfo("_exception_info");
     Ident___exception_info       = getIdentifierInfo("__exception_info");
diff --git a/clang/test/Preprocessor/macro_vaopt_check.cpp b/clang/test/Preprocessor/macro_vaopt_check.cpp
index fb52e9946af3..84f3b85871dd 100644
--- a/clang/test/Preprocessor/macro_vaopt_check.cpp
+++ b/clang/test/Preprocessor/macro_vaopt_check.cpp
@@ -1,4 +1,20 @@
-// RUN: %clang_cc1 %s -Eonly -verify -Wno-all -pedantic -std=c++2a
+// RUN: %clang_cc1 %s -Eonly -verify -Wno-all -pedantic -std=c++20
+// RUN: %clang_cc1 %s -Eonly -verify -Wno-all -pedantic -std=c++11
+// RUN: %clang_cc1 -x c %s -Eonly -verify -Wno-all -pedantic -std=c99
+
+// Check that support for __VA_OPT__ can be detected by #ifdef.
+#ifndef __VA_OPT__
+#error should be defined
+#endif
+
+#ifdef __VA_OPT__
+#else
+#error should be defined
+#endif
+
+#if !defined(__VA_OPT__)
+#error should be defined
+#endif
 
 //expected-error@+1{{missing '('}}
 #define V1(...) __VA_OPT__  
@@ -62,3 +78,16 @@
 #define V1(...) __VA_OPT__  ((())
 #undef V1
 
+// __VA_OPT__ can't appear anywhere else.
+#if __VA_OPT__ // expected-warning {{__VA_OPT__ can only appear in the expansion of a variadic macro}}
+#endif
+
+#define BAD __VA_OPT__ // expected-warning {{__VA_OPT__ can only appear in the expansion of a variadic macro}}
+
+// Check defined(__VA_OPT__) doesn't leave __VA_OPT__ poisoned.
+#define Z(...) (0 __VA_OPT__(|| 1))
+#if defined(__VA_OPT__) && Z(hello)
+// OK
+#else
+#error bad
+#endif
diff --git a/clang/test/Preprocessor/macro_vaopt_expand.cpp b/clang/test/Preprocessor/macro_vaopt_expand.cpp
index 7ec4f6128cfa..5eb0facb83f7 100644
--- a/clang/test/Preprocessor/macro_vaopt_expand.cpp
+++ b/clang/test/Preprocessor/macro_vaopt_expand.cpp
@@ -1,4 +1,6 @@
-// RUN: %clang_cc1 -E %s -pedantic -std=c++2a | FileCheck -strict-whitespace %s
+// RUN: %clang_cc1 -E %s -pedantic -std=c++20 | FileCheck -strict-whitespace %s
+// RUN: %clang_cc1 -E %s -pedantic -std=c++11 | FileCheck -strict-whitespace %s
+// RUN: %clang_cc1 -E -x c %s -pedantic -std=c99 | FileCheck -strict-whitespace %s
 
 #define LPAREN ( 
 #define RPAREN ) 

From 9ea2a107ca4055a3a4960cb6dffb84b7f43bd8ea Mon Sep 17 00:00:00 2001
From: Richard Smith <richard@metafoo.co.uk>
Date: Wed, 27 Jan 2021 13:14:02 -0800
Subject: [PATCH 005/318] Don't allow __VA_OPT__ to be detected by #ifdef.

More study has discovered this to not actually be useful: because
current C++20 implementations reject `#ifdef __VA_OPT__`, this can't
really be used as a feature-test mechanism. And it's not too hard to
detect __VA_OPT__ without this, for example:

  #define THIRD_ARG(a, b, c, ...) c
  #define HAS_VA_OPT(...) THIRD_ARG(__VA_OPT__(,), 1, 0, )
  #if HAS_VA_OPT(?)

Partially reverts 0436ec2128c9775ba13b0308937238fc79673fdd.

(cherry picked from commit 5dfa37a76153f2a18ac7fe30721cc1332b672ea2)
---
 clang/include/clang/Lex/Preprocessor.h        | 19 --------------
 clang/lib/Lex/PPDirectives.cpp                |  5 ----
 clang/lib/Lex/PPExpressions.cpp               |  5 ----
 clang/lib/Lex/PPMacroExpansion.cpp            |  6 +----
 clang/lib/Lex/Preprocessor.cpp                | 15 ++++++-----
 clang/test/Preprocessor/macro_vaopt_check.cpp | 25 +++----------------
 6 files changed, 11 insertions(+), 64 deletions(-)

diff --git a/clang/include/clang/Lex/Preprocessor.h b/clang/include/clang/Lex/Preprocessor.h
index ba8bdaa23c4c..68139cb24b31 100644
--- a/clang/include/clang/Lex/Preprocessor.h
+++ b/clang/include/clang/Lex/Preprocessor.h
@@ -447,25 +447,6 @@ class Preprocessor {
           ElseLoc(ElseLoc) {}
   };
 
-  class IfdefMacroNameScopeRAII {
-    Preprocessor &PP;
-    bool VAOPTWasPoisoned;
-
-  public:
-    IfdefMacroNameScopeRAII(Preprocessor &PP)
-        : PP(PP), VAOPTWasPoisoned(PP.Ident__VA_OPT__->isPoisoned()) {
-      PP.Ident__VA_OPT__->setIsPoisoned(false);
-    }
-    IfdefMacroNameScopeRAII(const IfdefMacroNameScopeRAII&) = delete;
-    IfdefMacroNameScopeRAII &operator=(const IfdefMacroNameScopeRAII&) = delete;
-    ~IfdefMacroNameScopeRAII() { Exit(); }
-
-    void Exit() {
-      if (VAOPTWasPoisoned)
-        PP.Ident__VA_OPT__->setIsPoisoned(true);
-    }
-  };
-
 private:
   friend class ASTReader;
   friend class MacroArgs;
diff --git a/clang/lib/Lex/PPDirectives.cpp b/clang/lib/Lex/PPDirectives.cpp
index e2aa93455ea5..d6b03d85913d 100644
--- a/clang/lib/Lex/PPDirectives.cpp
+++ b/clang/lib/Lex/PPDirectives.cpp
@@ -2928,14 +2928,9 @@ void Preprocessor::HandleIfdefDirective(Token &Result,
   ++NumIf;
   Token DirectiveTok = Result;
 
-  // __VA_OPT__ is allowed as the operand of #if[n]def.
-  IfdefMacroNameScopeRAII IfdefMacroNameScope(*this);
-
   Token MacroNameTok;
   ReadMacroName(MacroNameTok);
 
-  IfdefMacroNameScope.Exit();
-
   // Error reading macro name?  If so, diagnostic already issued.
   if (MacroNameTok.is(tok::eod)) {
     // Skip code until we get to #endif.  This helps with recovery by not
diff --git a/clang/lib/Lex/PPExpressions.cpp b/clang/lib/Lex/PPExpressions.cpp
index 952fb8f121dc..8c120c13d7d2 100644
--- a/clang/lib/Lex/PPExpressions.cpp
+++ b/clang/lib/Lex/PPExpressions.cpp
@@ -104,9 +104,6 @@ static bool EvaluateDefined(PPValue &Result, Token &PeekTok, DefinedTracker &DT,
   SourceLocation beginLoc(PeekTok.getLocation());
   Result.setBegin(beginLoc);
 
-  // __VA_OPT__ is allowed as the operand of 'defined'.
-  Preprocessor::IfdefMacroNameScopeRAII IfdefMacroNameScope(PP);
-
   // Get the next token, don't expand it.
   PP.LexUnexpandedNonComment(PeekTok);
 
@@ -125,8 +122,6 @@ static bool EvaluateDefined(PPValue &Result, Token &PeekTok, DefinedTracker &DT,
     PP.LexUnexpandedNonComment(PeekTok);
   }
 
-  IfdefMacroNameScope.Exit();
-
   // If we don't have a pp-identifier now, this is an error.
   if (PP.CheckMacroName(PeekTok, MU_Other))
     return true;
diff --git a/clang/lib/Lex/PPMacroExpansion.cpp b/clang/lib/Lex/PPMacroExpansion.cpp
index f6ca04defeb9..43d31d6c5732 100644
--- a/clang/lib/Lex/PPMacroExpansion.cpp
+++ b/clang/lib/Lex/PPMacroExpansion.cpp
@@ -323,16 +323,13 @@ void Preprocessor::dumpMacroInfo(const IdentifierInfo *II) {
 
 /// RegisterBuiltinMacro - Register the specified identifier in the identifier
 /// table and mark it as a builtin macro to be expanded.
-static IdentifierInfo *RegisterBuiltinMacro(Preprocessor &PP, const char *Name,
-                                            bool Disabled = false) {
+static IdentifierInfo *RegisterBuiltinMacro(Preprocessor &PP, const char *Name){
   // Get the identifier.
   IdentifierInfo *Id = PP.getIdentifierInfo(Name);
 
   // Mark it as being a macro that is builtin.
   MacroInfo *MI = PP.AllocateMacroInfo(SourceLocation());
   MI->setIsBuiltinMacro();
-  if (Disabled)
-    MI->DisableMacro();
   PP.appendDefMacroDirective(Id, MI);
   return Id;
 }
@@ -346,7 +343,6 @@ void Preprocessor::RegisterBuiltinMacros() {
   Ident__TIME__ = RegisterBuiltinMacro(*this, "__TIME__");
   Ident__COUNTER__ = RegisterBuiltinMacro(*this, "__COUNTER__");
   Ident_Pragma  = RegisterBuiltinMacro(*this, "_Pragma");
-  Ident__VA_OPT__ = RegisterBuiltinMacro(*this, "__VA_OPT__", true);
 
   // C++ Standing Document Extensions.
   if (getLangOpts().CPlusPlus)
diff --git a/clang/lib/Lex/Preprocessor.cpp b/clang/lib/Lex/Preprocessor.cpp
index 9baba204b324..177786d90390 100644
--- a/clang/lib/Lex/Preprocessor.cpp
+++ b/clang/lib/Lex/Preprocessor.cpp
@@ -115,20 +115,19 @@ Preprocessor::Preprocessor(std::shared_ptr<PreprocessorOptions> PPOpts,
 
   BuiltinInfo = std::make_unique<Builtin::Context>();
 
+  // "Poison" __VA_ARGS__, __VA_OPT__ which can only appear in the expansion of
+  // a macro. They get unpoisoned where it is allowed.
+  (Ident__VA_ARGS__ = getIdentifierInfo("__VA_ARGS__"))->setIsPoisoned();
+  SetPoisonReason(Ident__VA_ARGS__,diag::ext_pp_bad_vaargs_use);
+  (Ident__VA_OPT__ = getIdentifierInfo("__VA_OPT__"))->setIsPoisoned();
+  SetPoisonReason(Ident__VA_OPT__,diag::ext_pp_bad_vaopt_use);
+
   // Initialize the pragma handlers.
   RegisterBuiltinPragmas();
 
   // Initialize builtin macros like __LINE__ and friends.
   RegisterBuiltinMacros();
 
-  // "Poison" __VA_ARGS__, __VA_OPT__ which can only appear in the expansion of
-  // a macro. They get unpoisoned where it is allowed. Note that we model
-  // __VA_OPT__ as a builtin macro to allow #ifdef and friends to detect it.
-  (Ident__VA_ARGS__ = getIdentifierInfo("__VA_ARGS__"))->setIsPoisoned();
-  SetPoisonReason(Ident__VA_ARGS__, diag::ext_pp_bad_vaargs_use);
-  Ident__VA_OPT__->setIsPoisoned();
-  SetPoisonReason(Ident__VA_OPT__, diag::ext_pp_bad_vaopt_use);
-
   if(LangOpts.Borland) {
     Ident__exception_info        = getIdentifierInfo("_exception_info");
     Ident___exception_info       = getIdentifierInfo("__exception_info");
diff --git a/clang/test/Preprocessor/macro_vaopt_check.cpp b/clang/test/Preprocessor/macro_vaopt_check.cpp
index 84f3b85871dd..c5c0ac518bc0 100644
--- a/clang/test/Preprocessor/macro_vaopt_check.cpp
+++ b/clang/test/Preprocessor/macro_vaopt_check.cpp
@@ -2,20 +2,6 @@
 // RUN: %clang_cc1 %s -Eonly -verify -Wno-all -pedantic -std=c++11
 // RUN: %clang_cc1 -x c %s -Eonly -verify -Wno-all -pedantic -std=c99
 
-// Check that support for __VA_OPT__ can be detected by #ifdef.
-#ifndef __VA_OPT__
-#error should be defined
-#endif
-
-#ifdef __VA_OPT__
-#else
-#error should be defined
-#endif
-
-#if !defined(__VA_OPT__)
-#error should be defined
-#endif
-
 //expected-error@+1{{missing '('}}
 #define V1(...) __VA_OPT__  
 #undef V1
@@ -82,12 +68,7 @@
 #if __VA_OPT__ // expected-warning {{__VA_OPT__ can only appear in the expansion of a variadic macro}}
 #endif
 
-#define BAD __VA_OPT__ // expected-warning {{__VA_OPT__ can only appear in the expansion of a variadic macro}}
-
-// Check defined(__VA_OPT__) doesn't leave __VA_OPT__ poisoned.
-#define Z(...) (0 __VA_OPT__(|| 1))
-#if defined(__VA_OPT__) && Z(hello)
-// OK
-#else
-#error bad
+#ifdef __VA_OPT__ // expected-warning {{__VA_OPT__ can only appear in the expansion of a variadic macro}}
 #endif
+
+#define BAD __VA_OPT__ // expected-warning {{__VA_OPT__ can only appear in the expansion of a variadic macro}}

From 9df2b64fc5fa911ca59b3f646806ca3fd6787c2d Mon Sep 17 00:00:00 2001
From: Richard Smith <richard@metafoo.co.uk>
Date: Wed, 27 Jan 2021 16:07:51 -0800
Subject: [PATCH 006/318] [cxx_status] Mark P0732R2 as only 'partial', not
 'Clang 12', as some of the changes were reverted.

(cherry picked from commit 727fc31a9898dfb89610ca1bc05ff86204a77177)
---
 clang/www/cxx_status.html | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clang/www/cxx_status.html b/clang/www/cxx_status.html
index 685f32dbe0d3..fc3340ec9d96 100755
--- a/clang/www/cxx_status.html
+++ b/clang/www/cxx_status.html
@@ -1005,7 +1005,7 @@ <h2 id="cxx20">C++20 implementation status</h2>
     <tr>
       <td rowspan="2">Class types as non-type template parameters</td>
       <td><a href="https://wg21.link/p0732r2">P0732R2</a></td>
-      <td rowspan="2" class="unreleased" align="center">Clang 12</td>
+      <td rowspan="2" class="partial" align="center">Partial</td>
     </tr>
       <tr> <!-- from Belfast -->
         <td><a href="https://wg21.link/p1907r1">P1907R1</a></td>

From 8d22f25d155113f9cfdf3952dc49088c820f2a77 Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Wed, 27 Jan 2021 16:28:04 -0800
Subject: [PATCH 007/318] [llvm-c] Move LLVMX86_AMXTypeKind &
 LLVMPoisonValueValueKind to the bottom to avoid value changes compared with
 LLVM<=11

Fixes PR48905

(cherry picked from commit 6612c2bb68becda5504099b48082c844503c6d4c)
---
 llvm/include/llvm-c/Core.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/llvm/include/llvm-c/Core.h b/llvm/include/llvm-c/Core.h
index 8274213aa839..a78df16ca404 100644
--- a/llvm/include/llvm-c/Core.h
+++ b/llvm/include/llvm-c/Core.h
@@ -160,10 +160,10 @@ typedef enum {
   LLVMVectorTypeKind,    /**< Fixed width SIMD vector type */
   LLVMMetadataTypeKind,  /**< Metadata */
   LLVMX86_MMXTypeKind,   /**< X86 MMX */
-  LLVMX86_AMXTypeKind,   /**< X86 AMX */
   LLVMTokenTypeKind,     /**< Tokens */
   LLVMScalableVectorTypeKind, /**< Scalable SIMD vector type */
-  LLVMBFloatTypeKind     /**< 16 bit brain floating point type */
+  LLVMBFloatTypeKind,    /**< 16 bit brain floating point type */
+  LLVMX86_AMXTypeKind    /**< X86 AMX */
 } LLVMTypeKind;
 
 typedef enum {
@@ -270,7 +270,6 @@ typedef enum {
   LLVMConstantVectorValueKind,
 
   LLVMUndefValueValueKind,
-  LLVMPoisonValueValueKind,
   LLVMConstantAggregateZeroValueKind,
   LLVMConstantDataArrayValueKind,
   LLVMConstantDataVectorValueKind,
@@ -283,6 +282,7 @@ typedef enum {
   LLVMInlineAsmValueKind,
 
   LLVMInstructionValueKind,
+  LLVMPoisonValueValueKind
 } LLVMValueKind;
 
 typedef enum {

From 8364f5369eeeb2da8db2bae7716c549930d8df93 Mon Sep 17 00:00:00 2001
From: Reid Kleckner <rnk@google.com>
Date: Wed, 27 Jan 2021 10:59:28 -0800
Subject: [PATCH 008/318] Revert "Suppress non-conforming GNU paste extension
 in all standard-conforming modes"

This reverts commit f4537935dcdbf390c863591cf556e76c3abab9c1.
This reverts commit b43c26d036dcbf7a6881f39e4434cf059364022a.

This GNU and MSVC extension turns out to be very popular. Most projects
are not using C++20, so cannot use the new __VA_OPT__ feature to be
standards conformant. The other workaround, using -std=gnu*, enables too
many language extensions and isn't viable.

Until there is a way for users to get the behavior provided by the
`, ## __VA_ARGS__` extension in the -std=c++17 and earlier language
modes, we need to revert this.

(cherry picked from commit 61a66e4b5ec18e9e73c2f6334f6b7f7dd4bca77e)
---
 clang/lib/Lex/TokenLexer.cpp                      | 10 +++++-----
 clang/test/Preprocessor/macro_fn_comma_swallow2.c |  5 -----
 2 files changed, 5 insertions(+), 10 deletions(-)

diff --git a/clang/lib/Lex/TokenLexer.cpp b/clang/lib/Lex/TokenLexer.cpp
index 97cb2cf0bb8c..da5681aaf478 100644
--- a/clang/lib/Lex/TokenLexer.cpp
+++ b/clang/lib/Lex/TokenLexer.cpp
@@ -148,12 +148,12 @@ bool TokenLexer::MaybeRemoveCommaBeforeVaArgs(
     return false;
 
   // GCC removes the comma in the expansion of " ... , ## __VA_ARGS__ " if
-  // __VA_ARGS__ is empty, but not in strict mode where there are no
-  // named arguments, where it remains.  With GNU extensions, it is removed
-  // regardless of named arguments.
+  // __VA_ARGS__ is empty, but not in strict C99 mode where there are no
+  // named arguments, where it remains.  In all other modes, including C99
+  // with GNU extensions, it is removed regardless of named arguments.
   // Microsoft also appears to support this extension, unofficially.
-  if (!PP.getLangOpts().GNUMode && !PP.getLangOpts().MSVCCompat &&
-      Macro->getNumParams() < 2)
+  if (PP.getLangOpts().C99 && !PP.getLangOpts().GNUMode
+        && Macro->getNumParams() < 2)
     return false;
 
   // Is a comma available to be removed?
diff --git a/clang/test/Preprocessor/macro_fn_comma_swallow2.c b/clang/test/Preprocessor/macro_fn_comma_swallow2.c
index 4e4960ca7f18..93ab2b83664a 100644
--- a/clang/test/Preprocessor/macro_fn_comma_swallow2.c
+++ b/clang/test/Preprocessor/macro_fn_comma_swallow2.c
@@ -1,16 +1,11 @@
 // Test the __VA_ARGS__ comma swallowing extensions of various compiler dialects.
 
 // RUN: %clang_cc1 -E %s | FileCheck -check-prefix=GCC -strict-whitespace %s
-// RUN: %clang_cc1 -E -std=c90 %s | FileCheck -check-prefix=C99 -strict-whitespace %s
 // RUN: %clang_cc1 -E -std=c99 %s | FileCheck -check-prefix=C99 -strict-whitespace %s
 // RUN: %clang_cc1 -E -std=c11 %s | FileCheck -check-prefix=C99 -strict-whitespace %s
 // RUN: %clang_cc1 -E -x c++ %s | FileCheck -check-prefix=GCC -strict-whitespace %s
-// RUN: %clang_cc1 -E -x c++ -std=c++03 %s | FileCheck -check-prefix=C99 -strict-whitespace %s
-// RUN: %clang_cc1 -E -x c++ -std=c++11 %s | FileCheck -check-prefix=C99 -strict-whitespace %s
 // RUN: %clang_cc1 -E -std=gnu99 %s | FileCheck -check-prefix=GCC -strict-whitespace %s
 // RUN: %clang_cc1 -E -fms-compatibility %s | FileCheck -check-prefix=MS -strict-whitespace %s
-// RUN: %clang_cc1 -E -x c++ -fms-compatibility %s | FileCheck -check-prefix=MS -strict-whitespace %s
-// RUN: %clang_cc1 -E -x c++ -std=c++11 -fms-compatibility %s | FileCheck -check-prefix=MS -strict-whitespace %s
 // RUN: %clang_cc1 -E -DNAMED %s | FileCheck -check-prefix=GCC -strict-whitespace %s
 // RUN: %clang_cc1 -E -std=c99 -DNAMED %s | FileCheck -check-prefix=C99 -strict-whitespace %s
 

From b0085d205b3063c332a080599830ef0500cb6924 Mon Sep 17 00:00:00 2001
From: James Y Knight <jyknight@google.com>
Date: Mon, 7 Dec 2020 10:26:49 -0500
Subject: [PATCH 009/318] Itanium Mangling: Mangle `__alignof__` differently
 than `alignof`.

The two operations have acted differently since Clang 8, but were
unfortunately mangled the same. The new mangling uses new "vendor
extended expression" syntax proposed in
https://github.com/itanium-cxx-abi/cxx-abi/issues/112

GCC had the same mangling problem, https://gcc.gnu.org/PR88115, and
will hopefully be switching to the same mangling as implemented here.

Additionally, fix the mangling of `__uuidof` to use the new extension
syntax, instead of its previous nonstandard special-case.

Adjusts the demangler accordingly.

Differential Revision: https://reviews.llvm.org/D93922

(cherry picked from commit 9c7aeaebb3ac1b94200b59b111742cb6b8f090c2)
---
 clang/lib/AST/ItaniumMangle.cpp               | 103 ++++++++++++------
 clang/test/CodeGenCXX/mangle-alignof.cpp      |  25 +++++
 .../CodeGenCXX/microsoft-uuidof-mangling.cpp  |  44 +++++---
 libcxxabi/src/demangle/ItaniumDemangle.h      |  68 ++++++------
 libcxxabi/test/test_demangle.pass.cpp         |  14 ++-
 llvm/include/llvm/Demangle/ItaniumDemangle.h  |  68 ++++++------
 6 files changed, 211 insertions(+), 111 deletions(-)
 create mode 100644 clang/test/CodeGenCXX/mangle-alignof.cpp

diff --git a/clang/lib/AST/ItaniumMangle.cpp b/clang/lib/AST/ItaniumMangle.cpp
index 6c8d5687c64a..668733a4be34 100644
--- a/clang/lib/AST/ItaniumMangle.cpp
+++ b/clang/lib/AST/ItaniumMangle.cpp
@@ -558,6 +558,7 @@ class CXXNameMangler {
                           unsigned NumTemplateArgs);
   void mangleTemplateArgs(TemplateName TN, const TemplateArgumentList &AL);
   void mangleTemplateArg(TemplateArgument A, bool NeedExactType);
+  void mangleTemplateArgExpr(const Expr *E);
   void mangleValueInTemplateArg(QualType T, const APValue &V, bool TopLevel,
                                 bool NeedExactType = false);
 
@@ -3528,8 +3529,8 @@ void CXXNameMangler::mangleType(const DependentSizedMatrixType *T) {
   Out << "u" << VendorQualifier.size() << VendorQualifier;
 
   Out << "I";
-  mangleTemplateArg(T->getRowExpr(), false);
-  mangleTemplateArg(T->getColumnExpr(), false);
+  mangleTemplateArgExpr(T->getRowExpr());
+  mangleTemplateArgExpr(T->getColumnExpr());
   mangleType(T->getElementType());
   Out << "E";
 }
@@ -3916,6 +3917,7 @@ void CXXNameMangler::mangleExpression(const Expr *E, unsigned Arity) {
   //              ::= ds <expression> <expression>                   # expr.*expr
   //              ::= sZ <template-param>                            # size of a parameter pack
   //              ::= sZ <function-param>    # size of a function parameter pack
+  //              ::= u <source-name> <template-arg>* E # vendor extended expression
   //              ::= <expr-primary>
   // <expr-primary> ::= L <type> <value number> E    # integer literal
   //                ::= L <type <value float> E      # floating literal
@@ -4007,14 +4009,26 @@ void CXXNameMangler::mangleExpression(const Expr *E, unsigned Arity) {
 
   case Expr::CXXUuidofExprClass: {
     const CXXUuidofExpr *UE = cast<CXXUuidofExpr>(E);
-    if (UE->isTypeOperand()) {
-      QualType UuidT = UE->getTypeOperand(Context.getASTContext());
-      Out << "u8__uuidoft";
-      mangleType(UuidT);
+    // As of clang 12, uuidof uses the vendor extended expression
+    // mangling. Previously, it used a special-cased nonstandard extension.
+    if (Context.getASTContext().getLangOpts().getClangABICompat() >
+        LangOptions::ClangABI::Ver11) {
+      Out << "u8__uuidof";
+      if (UE->isTypeOperand())
+        mangleType(UE->getTypeOperand(Context.getASTContext()));
+      else
+        mangleTemplateArgExpr(UE->getExprOperand());
+      Out << 'E';
     } else {
-      Expr *UuidExp = UE->getExprOperand();
-      Out << "u8__uuidofz";
-      mangleExpression(UuidExp, Arity);
+      if (UE->isTypeOperand()) {
+        QualType UuidT = UE->getTypeOperand(Context.getASTContext());
+        Out << "u8__uuidoft";
+        mangleType(UuidT);
+      } else {
+        Expr *UuidExp = UE->getExprOperand();
+        Out << "u8__uuidofz";
+        mangleExpression(UuidExp, Arity);
+      }
     }
     break;
   }
@@ -4312,13 +4326,39 @@ void CXXNameMangler::mangleExpression(const Expr *E, unsigned Arity) {
       break;
     }
 
+    auto MangleAlignofSizeofArg = [&] {
+      if (SAE->isArgumentType()) {
+        Out << 't';
+        mangleType(SAE->getArgumentType());
+      } else {
+        Out << 'z';
+        mangleExpression(SAE->getArgumentExpr());
+      }
+    };
+
     switch(SAE->getKind()) {
     case UETT_SizeOf:
       Out << 's';
+      MangleAlignofSizeofArg();
       break;
     case UETT_PreferredAlignOf:
+      // As of clang 12, we mangle __alignof__ differently than alignof. (They
+      // have acted differently since Clang 8, but were previously mangled the
+      // same.)
+      if (Context.getASTContext().getLangOpts().getClangABICompat() >
+          LangOptions::ClangABI::Ver11) {
+        Out << "u11__alignof__";
+        if (SAE->isArgumentType())
+          mangleType(SAE->getArgumentType());
+        else
+          mangleTemplateArgExpr(SAE->getArgumentExpr());
+        Out << 'E';
+        break;
+      }
+      LLVM_FALLTHROUGH;
     case UETT_AlignOf:
       Out << 'a';
+      MangleAlignofSizeofArg();
       break;
     case UETT_VecStep: {
       DiagnosticsEngine &Diags = Context.getDiags();
@@ -4336,13 +4376,6 @@ void CXXNameMangler::mangleExpression(const Expr *E, unsigned Arity) {
       return;
     }
     }
-    if (SAE->isArgumentType()) {
-      Out << 't';
-      mangleType(SAE->getArgumentType());
-    } else {
-      Out << 'z';
-      mangleExpression(SAE->getArgumentExpr());
-    }
     break;
   }
 
@@ -4971,23 +5004,7 @@ void CXXNameMangler::mangleTemplateArg(TemplateArgument A, bool NeedExactType) {
     mangleType(A.getAsTemplateOrTemplatePattern());
     break;
   case TemplateArgument::Expression: {
-    // It's possible to end up with a DeclRefExpr here in certain
-    // dependent cases, in which case we should mangle as a
-    // declaration.
-    const Expr *E = A.getAsExpr()->IgnoreParenImpCasts();
-    if (const DeclRefExpr *DRE = dyn_cast<DeclRefExpr>(E)) {
-      const ValueDecl *D = DRE->getDecl();
-      if (isa<VarDecl>(D) || isa<FunctionDecl>(D)) {
-        Out << 'L';
-        mangle(D);
-        Out << 'E';
-        break;
-      }
-    }
-
-    Out << 'X';
-    mangleExpression(E);
-    Out << 'E';
+    mangleTemplateArgExpr(A.getAsExpr());
     break;
   }
   case TemplateArgument::Integral:
@@ -5044,6 +5061,26 @@ void CXXNameMangler::mangleTemplateArg(TemplateArgument A, bool NeedExactType) {
   }
 }
 
+void CXXNameMangler::mangleTemplateArgExpr(const Expr *E) {
+  // It's possible to end up with a DeclRefExpr here in certain
+  // dependent cases, in which case we should mangle as a
+  // declaration.
+  E = E->IgnoreParenImpCasts();
+  if (const DeclRefExpr *DRE = dyn_cast<DeclRefExpr>(E)) {
+    const ValueDecl *D = DRE->getDecl();
+    if (isa<VarDecl>(D) || isa<FunctionDecl>(D)) {
+      Out << 'L';
+      mangle(D);
+      Out << 'E';
+      return;
+    }
+  }
+
+  Out << 'X';
+  mangleExpression(E);
+  Out << 'E';
+}
+
 /// Determine whether a given value is equivalent to zero-initialization for
 /// the purpose of discarding a trailing portion of a 'tl' mangling.
 ///
diff --git a/clang/test/CodeGenCXX/mangle-alignof.cpp b/clang/test/CodeGenCXX/mangle-alignof.cpp
new file mode 100644
index 000000000000..0a65c7e87a2d
--- /dev/null
+++ b/clang/test/CodeGenCXX/mangle-alignof.cpp
@@ -0,0 +1,25 @@
+// RUN: %clang_cc1 -std=c++11 -Wno-gnu-alignof-expression -emit-llvm %s -o - -triple=%itanium_abi_triple | FileCheck %s --check-prefix=CHECK-NEW
+// RUN: %clang_cc1 -std=c++11 -Wno-gnu-alignof-expression -emit-llvm %s -o - -triple=%itanium_abi_triple -fclang-abi-compat=11 | FileCheck %s --check-prefix=CHECK-OLD
+
+// Verify the difference in mangling for alignof and __alignof__ in a new ABI
+// compat mode.
+
+template <class T> void f1(decltype(alignof(T))) {}
+template void f1<int>(__SIZE_TYPE__);
+// CHECK-OLD: void @_Z2f1IiEvDTatT_E
+// CHECK-NEW: void @_Z2f1IiEvDTatT_E
+
+template <class T> void f2(decltype(__alignof__(T))) {}
+template void f2<int>(__SIZE_TYPE__);
+// CHECK-OLD: void @_Z2f2IiEvDTatT_E
+// CHECK-NEW: void @_Z2f2IiEvDTu11__alignof__T_E
+
+template <class T> void f3(decltype(alignof(T(0)))) {}
+template void f3<int>(__SIZE_TYPE__);
+// CHECK-OLD: void @_Z2f3IiEvDTazcvT_Li0EE
+// CHECK-NEW: void @_Z2f3IiEvDTazcvT_Li0EE
+
+template <class T> void f4(decltype(__alignof__(T(0)))) {}
+template void f4<int>(__SIZE_TYPE__);
+// CHECK-OLD: void @_Z2f4IiEvDTazcvT_Li0EE
+// CHECK-NEW: void @_Z2f4IiEvDTu11__alignof__XcvT_Li0EEEE
diff --git a/clang/test/CodeGenCXX/microsoft-uuidof-mangling.cpp b/clang/test/CodeGenCXX/microsoft-uuidof-mangling.cpp
index ec26be292acc..321f65cacc71 100644
--- a/clang/test/CodeGenCXX/microsoft-uuidof-mangling.cpp
+++ b/clang/test/CodeGenCXX/microsoft-uuidof-mangling.cpp
@@ -1,4 +1,5 @@
-// RUN: %clang_cc1 -emit-llvm %s -o - -triple x86_64-unknown-unknown -fms-extensions | FileCheck %s
+// RUN: %clang_cc1 -emit-llvm %s -o - -triple x86_64-unknown-unknown -fms-extensions | FileCheck %s --check-prefixes=CHECK,CHECK-V12
+// RUN: %clang_cc1 -emit-llvm %s -o - -triple x86_64-unknown-unknown -fms-extensions -fclang-abi-compat=11 | FileCheck %s --check-prefixes=CHECK,CHECK-V11
 // rdar://17784718
 
 typedef struct _GUID
@@ -24,11 +25,16 @@ struct __declspec(uuid("EAFA1952-66F8-438B-8FBA-AF1BBAE42191")) TestStruct
 
 struct __declspec(uuid("EAFA1952-66F8-438B-8FBA-AF1BBAE42191")) OtherStruct {};
 
-template <class T> void test_uuidofType(void *arg[sizeof(__uuidof(T))] = 0) {}
+template <class T> void test_uuidofType(decltype(__uuidof(T)) arg) {}
 
-template <class T> void test_uuidofExpr(void *arg[sizeof(__uuidof(typename T::member))] = 0) {}
+template <class T> void test_uuidofExpr(decltype(__uuidof(T::member)) arg) {}
 
-struct HasMember { typedef TestStruct member; };
+struct HasMember {
+  TestStruct member;
+};
+
+// Ensure that mangling an "expr-primary" argument is handled properly.
+template <class T> void test_uuidofExpr2(decltype(T{}, __uuidof(HasMember::member)) arg) {}
 
 template<const GUID&> struct UUIDTestTwo { UUIDTestTwo(); };
 
@@ -39,19 +45,29 @@ int main(int argc, const char * argv[])
     // type had better not mention TestStruct or OtherStruct!
     UUIDTestTwo<__uuidof(TestStruct)> uuidof_test2;
     UUIDTestTwo<__uuidof(OtherStruct)> uuidof_test3;
-    test_uuidofType<TestStruct>();
-    test_uuidofExpr<HasMember>();
+    test_uuidofType<TestStruct>(GUID{});
+    test_uuidofExpr<HasMember>(GUID{});
+    test_uuidofExpr2<TestStruct>(GUID{});
     return 0;
 }
 
 // CHECK: define{{.*}} i32 @main
-// CHECK: call void @_ZN8UUIDTestI10TestStructL_Z42_GUID_eafa1952_66f8_438b_8fba_af1bbae42191EEC1Ev
-// CHECK: call void @_ZN11UUIDTestTwoIL_Z42_GUID_eafa1952_66f8_438b_8fba_af1bbae42191EEC1Ev
-// CHECK: call void @_ZN11UUIDTestTwoIL_Z42_GUID_eafa1952_66f8_438b_8fba_af1bbae42191EEC1Ev
-// CHECK: call void @_Z15test_uuidofTypeI10TestStructEvPPv(i8** null)
-// CHECK: call void @_Z15test_uuidofExprI9HasMemberEvPPv(i8** null)
-
+// CHECK: call void @_ZN8UUIDTestI10TestStructL_Z42_GUID_eafa1952_66f8_438b_8fba_af1bbae42191EEC1Ev(
+// CHECK: call void @_ZN11UUIDTestTwoIL_Z42_GUID_eafa1952_66f8_438b_8fba_af1bbae42191EEC1Ev(
+// CHECK: call void @_ZN11UUIDTestTwoIL_Z42_GUID_eafa1952_66f8_438b_8fba_af1bbae42191EEC1Ev(
+// CHECK-V11: call void @_Z15test_uuidofTypeI10TestStructEvDTu8__uuidoftT_E(
+// CHECK-V12: call void @_Z15test_uuidofTypeI10TestStructEvDTu8__uuidofT_EE(
+// CHECK-V11: call void @_Z15test_uuidofExprI9HasMemberEvDTu8__uuidofzsrT_6memberE(
+// CHECK-V12: call void @_Z15test_uuidofExprI9HasMemberEvDTu8__uuidofXsrT_6memberEEE(
+// CHECK-V11: call void @_Z16test_uuidofExpr2I10TestStructEvDTcmtlT_Eu8__uuidofzL_ZN9HasMember6memberEEE(
+// CHECK-V12: call void @_Z16test_uuidofExpr2I10TestStructEvDTcmtlT_Eu8__uuidofXL_ZN9HasMember6memberEEEEE(
+//    TODO: the above mangling is wrong -- the X/E shouldn't be emitted:       ^                     ^
 // CHECK: define linkonce_odr void @_ZN8UUIDTestI10TestStructL_Z42_GUID_eafa1952_66f8_438b_8fba_af1bbae42191EEC1Ev
-// CHECK: define linkonce_odr void @_Z15test_uuidofTypeI10TestStructEvPPv
-// CHECK: define linkonce_odr void @_Z15test_uuidofExprI9HasMemberEvPPv
+// CHECK-V11: define linkonce_odr void @_Z15test_uuidofTypeI10TestStructEvDTu8__uuidoftT_E(
+// CHECK-V12: define linkonce_odr void @_Z15test_uuidofTypeI10TestStructEvDTu8__uuidofT_EE(
+// CHECK-V11: define linkonce_odr void @_Z15test_uuidofExprI9HasMemberEvDTu8__uuidofzsrT_6memberE(
+// CHECK-V12: define linkonce_odr void @_Z15test_uuidofExprI9HasMemberEvDTu8__uuidofXsrT_6memberEEE(
+// CHECK-V11: define linkonce_odr void @_Z16test_uuidofExpr2I10TestStructEvDTcmtlT_Eu8__uuidofzL_ZN9HasMember6memberEEE(
+// CHECK-V12: define linkonce_odr void @_Z16test_uuidofExpr2I10TestStructEvDTcmtlT_Eu8__uuidofXL_ZN9HasMember6memberEEEEE(
+//    TODO: the above mangling is wrong -- the X/E shouldn't be emitted:                      ^                     ^
 // CHECK: define linkonce_odr void @_ZN8UUIDTestI10TestStructL_Z42_GUID_eafa1952_66f8_438b_8fba_af1bbae42191EEC2Ev
diff --git a/libcxxabi/src/demangle/ItaniumDemangle.h b/libcxxabi/src/demangle/ItaniumDemangle.h
index 6bfc02d15379..e5fca98f9271 100644
--- a/libcxxabi/src/demangle/ItaniumDemangle.h
+++ b/libcxxabi/src/demangle/ItaniumDemangle.h
@@ -96,7 +96,6 @@
     X(InitListExpr) \
     X(FoldExpr) \
     X(ThrowExpr) \
-    X(UUIDOfExpr) \
     X(BoolExpr) \
     X(StringLiteral) \
     X(LambdaExpr) \
@@ -2035,21 +2034,6 @@ class ThrowExpr : public Node {
   }
 };
 
-// MSVC __uuidof extension, generated by clang in -fms-extensions mode.
-class UUIDOfExpr : public Node {
-  Node *Operand;
-public:
-  UUIDOfExpr(Node *Operand_) : Node(KUUIDOfExpr), Operand(Operand_) {}
-
-  template<typename Fn> void match(Fn F) const { F(Operand); }
-
-  void printLeft(OutputStream &S) const override {
-    S << "__uuidof(";
-    Operand->print(S);
-    S << ")";
-  }
-};
-
 class BoolExpr : public Node {
   bool Value;
 
@@ -5013,6 +4997,43 @@ Node *AbstractManglingParser<Derived, Alloc>::parseExpr() {
     }
     }
     return nullptr;
+  case 'u': {
+    ++First;
+    Node *Name = getDerived().parseSourceName(/*NameState=*/nullptr);
+    if (!Name)
+      return nullptr;
+    // Special case legacy __uuidof mangling. The 't' and 'z' appear where the
+    // standard encoding expects a <template-arg>, and would be otherwise be
+    // interpreted as <type> node 'short' or 'ellipsis'. However, neither
+    // __uuidof(short) nor __uuidof(...) can actually appear, so there is no
+    // actual conflict here.
+    if (Name->getBaseName() == "__uuidof") {
+      if (numLeft() < 2)
+        return nullptr;
+      if (*First == 't') {
+        ++First;
+        Node *Ty = getDerived().parseType();
+        if (!Ty)
+          return nullptr;
+        return make<CallExpr>(Name, makeNodeArray(&Ty, &Ty + 1));
+      }
+      if (*First == 'z') {
+        ++First;
+        Node *Ex = getDerived().parseExpr();
+        if (!Ex)
+          return nullptr;
+        return make<CallExpr>(Name, makeNodeArray(&Ex, &Ex + 1));
+      }
+    }
+    size_t ExprsBegin = Names.size();
+    while (!consumeIf('E')) {
+      Node *E = getDerived().parseTemplateArg();
+      if (E == nullptr)
+        return E;
+      Names.push_back(E);
+    }
+    return make<CallExpr>(Name, popTrailingNodeArray(ExprsBegin));
+  }
   case '1':
   case '2':
   case '3':
@@ -5024,21 +5045,6 @@ Node *AbstractManglingParser<Derived, Alloc>::parseExpr() {
   case '9':
     return getDerived().parseUnresolvedName();
   }
-
-  if (consumeIf("u8__uuidoft")) {
-    Node *Ty = getDerived().parseType();
-    if (!Ty)
-      return nullptr;
-    return make<UUIDOfExpr>(Ty);
-  }
-
-  if (consumeIf("u8__uuidofz")) {
-    Node *Ex = getDerived().parseExpr();
-    if (!Ex)
-      return nullptr;
-    return make<UUIDOfExpr>(Ex);
-  }
-
   return nullptr;
 }
 
diff --git a/libcxxabi/test/test_demangle.pass.cpp b/libcxxabi/test/test_demangle.pass.cpp
index 3954fdba048e..512cc3928fdd 100644
--- a/libcxxabi/test/test_demangle.pass.cpp
+++ b/libcxxabi/test/test_demangle.pass.cpp
@@ -29776,8 +29776,18 @@ const char* cases[][2] =
     // Vendor extension types are substitution candidates.
     {"_Z1fu3fooS_", "f(foo, foo)"},
 
-    {"_ZN3FooIXu8__uuidofzdeL_Z3sucEEEC1Ev", "Foo<__uuidof(*(suc))>::Foo()"},
-    {"_ZN3FooIXu8__uuidoft13SomeUUIDClassEEC1Ev", "Foo<__uuidof(SomeUUIDClass)>::Foo()"},
+    // alignof with type and expression, and __alignof__ with the same.
+    {"_Z2f1IiEvDTatT_E", "void f1<int>(decltype(alignof (int)))"},
+    {"_Z2f3IiEvDTazcvT_Li0EE", "void f3<int>(decltype(alignof ((int)(0))))"},
+    {"_Z2f2IiEvDTu11__alignof__T_EE", "void f2<int>(decltype(__alignof__(int)))"},
+    {"_Z2f4IiEvDTu11__alignof__XcvT_Li0EEEE", "void f4<int>(decltype(__alignof__((int)(0))))"},
+
+    // Legacy nonstandard mangling for __uuidof.
+    {"_Z15test_uuidofTypeI10TestStructEvDTu8__uuidoftT_E", "void test_uuidofType<TestStruct>(decltype(__uuidof(TestStruct)))"},
+    {"_Z15test_uuidofExprI9HasMemberEvDTu8__uuidofXsrT_6memberEEE", "void test_uuidofExpr<HasMember>(decltype(__uuidof(HasMember::member)))"},
+    // Current __uuidof mangling using vendor extended expression.
+    {"_Z15test_uuidofTypeI10TestStructEvDTu8__uuidofT_EE", "void test_uuidofType<TestStruct>(decltype(__uuidof(TestStruct)))"},
+    {"_Z15test_uuidofExprI9HasMemberEvDTu8__uuidofXsrT_6memberEEE", "void test_uuidofExpr<HasMember>(decltype(__uuidof(HasMember::member)))"},
 
     // C++2a char8_t:
     {"_ZTSPDu", "typeinfo name for char8_t*"},
diff --git a/llvm/include/llvm/Demangle/ItaniumDemangle.h b/llvm/include/llvm/Demangle/ItaniumDemangle.h
index 6bfc02d15379..e5fca98f9271 100644
--- a/llvm/include/llvm/Demangle/ItaniumDemangle.h
+++ b/llvm/include/llvm/Demangle/ItaniumDemangle.h
@@ -96,7 +96,6 @@
     X(InitListExpr) \
     X(FoldExpr) \
     X(ThrowExpr) \
-    X(UUIDOfExpr) \
     X(BoolExpr) \
     X(StringLiteral) \
     X(LambdaExpr) \
@@ -2035,21 +2034,6 @@ class ThrowExpr : public Node {
   }
 };
 
-// MSVC __uuidof extension, generated by clang in -fms-extensions mode.
-class UUIDOfExpr : public Node {
-  Node *Operand;
-public:
-  UUIDOfExpr(Node *Operand_) : Node(KUUIDOfExpr), Operand(Operand_) {}
-
-  template<typename Fn> void match(Fn F) const { F(Operand); }
-
-  void printLeft(OutputStream &S) const override {
-    S << "__uuidof(";
-    Operand->print(S);
-    S << ")";
-  }
-};
-
 class BoolExpr : public Node {
   bool Value;
 
@@ -5013,6 +4997,43 @@ Node *AbstractManglingParser<Derived, Alloc>::parseExpr() {
     }
     }
     return nullptr;
+  case 'u': {
+    ++First;
+    Node *Name = getDerived().parseSourceName(/*NameState=*/nullptr);
+    if (!Name)
+      return nullptr;
+    // Special case legacy __uuidof mangling. The 't' and 'z' appear where the
+    // standard encoding expects a <template-arg>, and would be otherwise be
+    // interpreted as <type> node 'short' or 'ellipsis'. However, neither
+    // __uuidof(short) nor __uuidof(...) can actually appear, so there is no
+    // actual conflict here.
+    if (Name->getBaseName() == "__uuidof") {
+      if (numLeft() < 2)
+        return nullptr;
+      if (*First == 't') {
+        ++First;
+        Node *Ty = getDerived().parseType();
+        if (!Ty)
+          return nullptr;
+        return make<CallExpr>(Name, makeNodeArray(&Ty, &Ty + 1));
+      }
+      if (*First == 'z') {
+        ++First;
+        Node *Ex = getDerived().parseExpr();
+        if (!Ex)
+          return nullptr;
+        return make<CallExpr>(Name, makeNodeArray(&Ex, &Ex + 1));
+      }
+    }
+    size_t ExprsBegin = Names.size();
+    while (!consumeIf('E')) {
+      Node *E = getDerived().parseTemplateArg();
+      if (E == nullptr)
+        return E;
+      Names.push_back(E);
+    }
+    return make<CallExpr>(Name, popTrailingNodeArray(ExprsBegin));
+  }
   case '1':
   case '2':
   case '3':
@@ -5024,21 +5045,6 @@ Node *AbstractManglingParser<Derived, Alloc>::parseExpr() {
   case '9':
     return getDerived().parseUnresolvedName();
   }
-
-  if (consumeIf("u8__uuidoft")) {
-    Node *Ty = getDerived().parseType();
-    if (!Ty)
-      return nullptr;
-    return make<UUIDOfExpr>(Ty);
-  }
-
-  if (consumeIf("u8__uuidofz")) {
-    Node *Ex = getDerived().parseExpr();
-    if (!Ex)
-      return nullptr;
-    return make<UUIDOfExpr>(Ex);
-  }
-
   return nullptr;
 }
 

From 7da92afbf08e90960f7e5dee00bbf6ef8f323a5c Mon Sep 17 00:00:00 2001
From: James Y Knight <jyknight@google.com>
Date: Sun, 24 Jan 2021 15:50:15 -0500
Subject: [PATCH 010/318] Itanium Mangling: Fix handling of <expr-primary> in
 <template-arg>.

Previously, we were emitting an extraneous X .. E in <template-arg>
around an <expr-primary> if the template argument was constructed from
an expression (rather than an already-evaluated literal value).  In
such a case, we would then e.g. emit 'XLi0EE' instead of 'Li0E'.

We had one special-case for DeclRefExpr expressions, in particular, to
omit them the mangled-name without the surrounding X/E. However,
unfortunately, that special case also triggered for ParmVarDecl (a
subtype of VarDecl), and _incorrectly_ emitted 'L_Z .. E' instead of
the proper 'Xfp_E'.

This change causes mangleExpression itself to be responsible for
emitting X/E around non-primary expressions, which removes the
special-case, and corrects both these problems.

Differential Revision: https://reviews.llvm.org/D95487

(cherry picked from commit 8ca33605ff0cfc536f5c6710fb5f6378bf11959a)
---
 clang/lib/AST/ItaniumMangle.cpp               | 223 +++++++++++++-----
 clang/test/CodeGenCXX/clang-abi-compat.cpp    |  94 +++++++-
 clang/test/CodeGenCXX/mangle-abi-tag.cpp      |   2 +-
 clang/test/CodeGenCXX/mangle-concept.cpp      |   4 +-
 clang/test/CodeGenCXX/mangle-template.cpp     |   4 +-
 clang/test/CodeGenCXX/mangle.cpp              |   2 +-
 clang/test/CodeGenCXX/matrix-type.cpp         |  16 +-
 .../CodeGenCXX/microsoft-uuidof-mangling.cpp  |   6 +-
 8 files changed, 259 insertions(+), 92 deletions(-)

diff --git a/clang/lib/AST/ItaniumMangle.cpp b/clang/lib/AST/ItaniumMangle.cpp
index 668733a4be34..54e2f361a9f1 100644
--- a/clang/lib/AST/ItaniumMangle.cpp
+++ b/clang/lib/AST/ItaniumMangle.cpp
@@ -546,8 +546,8 @@ class CXXNameMangler {
                         unsigned knownArity);
   void mangleCastExpression(const Expr *E, StringRef CastEncoding);
   void mangleInitListElements(const InitListExpr *InitList);
-  void mangleDeclRefExpr(const NamedDecl *D);
-  void mangleExpression(const Expr *E, unsigned Arity = UnknownArity);
+  void mangleExpression(const Expr *E, unsigned Arity = UnknownArity,
+                        bool AsTemplateArg = false);
   void mangleCXXCtorType(CXXCtorType T, const CXXRecordDecl *InheritedFrom);
   void mangleCXXDtorType(CXXDtorType T);
 
@@ -3872,33 +3872,8 @@ void CXXNameMangler::mangleInitListElements(const InitListExpr *InitList) {
     mangleExpression(InitList->getInit(i));
 }
 
-void CXXNameMangler::mangleDeclRefExpr(const NamedDecl *D) {
-  switch (D->getKind()) {
-  default:
-    //  <expr-primary> ::= L <mangled-name> E # external name
-    Out << 'L';
-    mangle(D);
-    Out << 'E';
-    break;
-
-  case Decl::ParmVar:
-    mangleFunctionParam(cast<ParmVarDecl>(D));
-    break;
-
-  case Decl::EnumConstant: {
-    const EnumConstantDecl *ED = cast<EnumConstantDecl>(D);
-    mangleIntegerLiteral(ED->getType(), ED->getInitVal());
-    break;
-  }
-
-  case Decl::NonTypeTemplateParm:
-    const NonTypeTemplateParmDecl *PD = cast<NonTypeTemplateParmDecl>(D);
-    mangleTemplateParameter(PD->getDepth(), PD->getIndex());
-    break;
-  }
-}
-
-void CXXNameMangler::mangleExpression(const Expr *E, unsigned Arity) {
+void CXXNameMangler::mangleExpression(const Expr *E, unsigned Arity,
+                                      bool AsTemplateArg) {
   // <expression> ::= <unary operator-name> <expression>
   //              ::= <binary operator-name> <expression> <expression>
   //              ::= <trinary operator-name> <expression> <expression> <expression>
@@ -3912,6 +3887,7 @@ void CXXNameMangler::mangleExpression(const Expr *E, unsigned Arity) {
   //              ::= at <type>                      # alignof (a type)
   //              ::= <template-param>
   //              ::= <function-param>
+  //              ::= fpT                            # 'this' expression (part of <function-param>)
   //              ::= sr <type> <unqualified-name>                   # dependent name
   //              ::= sr <type> <unqualified-name> <template-args>   # dependent template-id
   //              ::= ds <expression> <expression>                   # expr.*expr
@@ -3920,11 +3896,55 @@ void CXXNameMangler::mangleExpression(const Expr *E, unsigned Arity) {
   //              ::= u <source-name> <template-arg>* E # vendor extended expression
   //              ::= <expr-primary>
   // <expr-primary> ::= L <type> <value number> E    # integer literal
-  //                ::= L <type <value float> E      # floating literal
+  //                ::= L <type> <value float> E     # floating literal
+  //                ::= L <type> <string type> E     # string literal
+  //                ::= L <nullptr type> E           # nullptr literal "LDnE"
+  //                ::= L <pointer type> 0 E         # null pointer template argument
+  //                ::= L <type> <real-part float> _ <imag-part float> E    # complex floating point literal (C99); not used by clang
   //                ::= L <mangled-name> E           # external name
-  //                ::= fpT                          # 'this' expression
   QualType ImplicitlyConvertedToType;
 
+  // A top-level expression that's not <expr-primary> needs to be wrapped in
+  // X...E in a template arg.
+  bool IsPrimaryExpr = true;
+  auto NotPrimaryExpr = [&] {
+    if (AsTemplateArg && IsPrimaryExpr)
+      Out << 'X';
+    IsPrimaryExpr = false;
+  };
+
+  auto MangleDeclRefExpr = [&](const NamedDecl *D) {
+    switch (D->getKind()) {
+    default:
+      //  <expr-primary> ::= L <mangled-name> E # external name
+      Out << 'L';
+      mangle(D);
+      Out << 'E';
+      break;
+
+    case Decl::ParmVar:
+      NotPrimaryExpr();
+      mangleFunctionParam(cast<ParmVarDecl>(D));
+      break;
+
+    case Decl::EnumConstant: {
+      // <expr-primary>
+      const EnumConstantDecl *ED = cast<EnumConstantDecl>(D);
+      mangleIntegerLiteral(ED->getType(), ED->getInitVal());
+      break;
+    }
+
+    case Decl::NonTypeTemplateParm:
+      NotPrimaryExpr();
+      const NonTypeTemplateParmDecl *PD = cast<NonTypeTemplateParmDecl>(D);
+      mangleTemplateParameter(PD->getDepth(), PD->getIndex());
+      break;
+    }
+  };
+
+  // 'goto recurse' is used when handling a simple "unwrapping" node which
+  // produces no output, where ImplicitlyConvertedToType and AsTemplateArg need
+  // to be preserved.
 recurse:
   switch (E->getStmtClass()) {
   case Expr::NoStmtClass:
@@ -3996,6 +4016,7 @@ void CXXNameMangler::mangleExpression(const Expr *E, unsigned Arity) {
   case Expr::SourceLocExprClass:
   case Expr::BuiltinBitCastExprClass:
   {
+    NotPrimaryExpr();
     if (!NullOut) {
       // As bad as this diagnostic is, it's better than crashing.
       DiagnosticsEngine &Diags = Context.getDiags();
@@ -4003,11 +4024,13 @@ void CXXNameMangler::mangleExpression(const Expr *E, unsigned Arity) {
                                        "cannot yet mangle expression type %0");
       Diags.Report(E->getExprLoc(), DiagID)
         << E->getStmtClassName() << E->getSourceRange();
+      return;
     }
     break;
   }
 
   case Expr::CXXUuidofExprClass: {
+    NotPrimaryExpr();
     const CXXUuidofExpr *UE = cast<CXXUuidofExpr>(E);
     // As of clang 12, uuidof uses the vendor extended expression
     // mangling. Previously, it used a special-cased nonstandard extension.
@@ -4027,7 +4050,7 @@ void CXXNameMangler::mangleExpression(const Expr *E, unsigned Arity) {
       } else {
         Expr *UuidExp = UE->getExprOperand();
         Out << "u8__uuidofz";
-        mangleExpression(UuidExp, Arity);
+        mangleExpression(UuidExp);
       }
     }
     break;
@@ -4035,13 +4058,14 @@ void CXXNameMangler::mangleExpression(const Expr *E, unsigned Arity) {
 
   // Even gcc-4.5 doesn't mangle this.
   case Expr::BinaryConditionalOperatorClass: {
+    NotPrimaryExpr();
     DiagnosticsEngine &Diags = Context.getDiags();
     unsigned DiagID =
       Diags.getCustomDiagID(DiagnosticsEngine::Error,
                 "?: operator with omitted middle operand cannot be mangled");
     Diags.Report(E->getExprLoc(), DiagID)
       << E->getStmtClassName() << E->getSourceRange();
-    break;
+    return;
   }
 
   // These are used for internal purposes and cannot be meaningfully mangled.
@@ -4049,6 +4073,7 @@ void CXXNameMangler::mangleExpression(const Expr *E, unsigned Arity) {
     llvm_unreachable("cannot mangle opaque value; mangling wrong thing?");
 
   case Expr::InitListExprClass: {
+    NotPrimaryExpr();
     Out << "il";
     mangleInitListElements(cast<InitListExpr>(E));
     Out << "E";
@@ -4056,6 +4081,7 @@ void CXXNameMangler::mangleExpression(const Expr *E, unsigned Arity) {
   }
 
   case Expr::DesignatedInitExprClass: {
+    NotPrimaryExpr();
     auto *DIE = cast<DesignatedInitExpr>(E);
     for (const auto &Designator : DIE->designators()) {
       if (Designator.isFieldDesignator()) {
@@ -4077,27 +4103,27 @@ void CXXNameMangler::mangleExpression(const Expr *E, unsigned Arity) {
   }
 
   case Expr::CXXDefaultArgExprClass:
-    mangleExpression(cast<CXXDefaultArgExpr>(E)->getExpr(), Arity);
-    break;
+    E = cast<CXXDefaultArgExpr>(E)->getExpr();
+    goto recurse;
 
   case Expr::CXXDefaultInitExprClass:
-    mangleExpression(cast<CXXDefaultInitExpr>(E)->getExpr(), Arity);
-    break;
+    E = cast<CXXDefaultInitExpr>(E)->getExpr();
+    goto recurse;
 
   case Expr::CXXStdInitializerListExprClass:
-    mangleExpression(cast<CXXStdInitializerListExpr>(E)->getSubExpr(), Arity);
-    break;
+    E = cast<CXXStdInitializerListExpr>(E)->getSubExpr();
+    goto recurse;
 
   case Expr::SubstNonTypeTemplateParmExprClass:
-    mangleExpression(cast<SubstNonTypeTemplateParmExpr>(E)->getReplacement(),
-                     Arity);
-    break;
+    E = cast<SubstNonTypeTemplateParmExpr>(E)->getReplacement();
+    goto recurse;
 
   case Expr::UserDefinedLiteralClass:
     // We follow g++'s approach of mangling a UDL as a call to the literal
     // operator.
   case Expr::CXXMemberCallExprClass: // fallthrough
   case Expr::CallExprClass: {
+    NotPrimaryExpr();
     const CallExpr *CE = cast<CallExpr>(E);
 
     // <expression> ::= cp <simple-id> <expression>* E
@@ -4128,6 +4154,7 @@ void CXXNameMangler::mangleExpression(const Expr *E, unsigned Arity) {
   }
 
   case Expr::CXXNewExprClass: {
+    NotPrimaryExpr();
     const CXXNewExpr *New = cast<CXXNewExpr>(E);
     if (New->isGlobalNew()) Out << "gs";
     Out << (New->isArray() ? "na" : "nw");
@@ -4163,6 +4190,7 @@ void CXXNameMangler::mangleExpression(const Expr *E, unsigned Arity) {
   }
 
   case Expr::CXXPseudoDestructorExprClass: {
+    NotPrimaryExpr();
     const auto *PDE = cast<CXXPseudoDestructorExpr>(E);
     if (const Expr *Base = PDE->getBase())
       mangleMemberExprBase(Base, PDE->isArrow());
@@ -4189,6 +4217,7 @@ void CXXNameMangler::mangleExpression(const Expr *E, unsigned Arity) {
   }
 
   case Expr::MemberExprClass: {
+    NotPrimaryExpr();
     const MemberExpr *ME = cast<MemberExpr>(E);
     mangleMemberExpr(ME->getBase(), ME->isArrow(),
                      ME->getQualifier(), nullptr,
@@ -4199,6 +4228,7 @@ void CXXNameMangler::mangleExpression(const Expr *E, unsigned Arity) {
   }
 
   case Expr::UnresolvedMemberExprClass: {
+    NotPrimaryExpr();
     const UnresolvedMemberExpr *ME = cast<UnresolvedMemberExpr>(E);
     mangleMemberExpr(ME->isImplicitAccess() ? nullptr : ME->getBase(),
                      ME->isArrow(), ME->getQualifier(), nullptr,
@@ -4209,6 +4239,7 @@ void CXXNameMangler::mangleExpression(const Expr *E, unsigned Arity) {
   }
 
   case Expr::CXXDependentScopeMemberExprClass: {
+    NotPrimaryExpr();
     const CXXDependentScopeMemberExpr *ME
       = cast<CXXDependentScopeMemberExpr>(E);
     mangleMemberExpr(ME->isImplicitAccess() ? nullptr : ME->getBase(),
@@ -4221,6 +4252,7 @@ void CXXNameMangler::mangleExpression(const Expr *E, unsigned Arity) {
   }
 
   case Expr::UnresolvedLookupExprClass: {
+    NotPrimaryExpr();
     const UnresolvedLookupExpr *ULE = cast<UnresolvedLookupExpr>(E);
     mangleUnresolvedName(ULE->getQualifier(), ULE->getName(),
                          ULE->getTemplateArgs(), ULE->getNumTemplateArgs(),
@@ -4229,6 +4261,7 @@ void CXXNameMangler::mangleExpression(const Expr *E, unsigned Arity) {
   }
 
   case Expr::CXXUnresolvedConstructExprClass: {
+    NotPrimaryExpr();
     const CXXUnresolvedConstructExpr *CE = cast<CXXUnresolvedConstructExpr>(E);
     unsigned N = CE->getNumArgs();
 
@@ -4239,7 +4272,7 @@ void CXXNameMangler::mangleExpression(const Expr *E, unsigned Arity) {
       mangleType(CE->getType());
       mangleInitListElements(IL);
       Out << "E";
-      return;
+      break;
     }
 
     Out << "cv";
@@ -4251,14 +4284,17 @@ void CXXNameMangler::mangleExpression(const Expr *E, unsigned Arity) {
   }
 
   case Expr::CXXConstructExprClass: {
+    // An implicit cast is silent, thus may contain <expr-primary>.
     const auto *CE = cast<CXXConstructExpr>(E);
     if (!CE->isListInitialization() || CE->isStdInitListInitialization()) {
       assert(
           CE->getNumArgs() >= 1 &&
           (CE->getNumArgs() == 1 || isa<CXXDefaultArgExpr>(CE->getArg(1))) &&
           "implicit CXXConstructExpr must have one argument");
-      return mangleExpression(cast<CXXConstructExpr>(E)->getArg(0));
+      E = cast<CXXConstructExpr>(E)->getArg(0);
+      goto recurse;
     }
+    NotPrimaryExpr();
     Out << "il";
     for (auto *E : CE->arguments())
       mangleExpression(E);
@@ -4267,6 +4303,7 @@ void CXXNameMangler::mangleExpression(const Expr *E, unsigned Arity) {
   }
 
   case Expr::CXXTemporaryObjectExprClass: {
+    NotPrimaryExpr();
     const auto *CE = cast<CXXTemporaryObjectExpr>(E);
     unsigned N = CE->getNumArgs();
     bool List = CE->isListInitialization();
@@ -4296,17 +4333,20 @@ void CXXNameMangler::mangleExpression(const Expr *E, unsigned Arity) {
   }
 
   case Expr::CXXScalarValueInitExprClass:
+    NotPrimaryExpr();
     Out << "cv";
     mangleType(E->getType());
     Out << "_E";
     break;
 
   case Expr::CXXNoexceptExprClass:
+    NotPrimaryExpr();
     Out << "nx";
     mangleExpression(cast<CXXNoexceptExpr>(E)->getOperand());
     break;
 
   case Expr::UnaryExprOrTypeTraitExprClass: {
+    // Non-instantiation-dependent traits are an <expr-primary> integer literal.
     const UnaryExprOrTypeTraitExpr *SAE = cast<UnaryExprOrTypeTraitExpr>(E);
 
     if (!SAE->isInstantiationDependent()) {
@@ -4326,6 +4366,8 @@ void CXXNameMangler::mangleExpression(const Expr *E, unsigned Arity) {
       break;
     }
 
+    NotPrimaryExpr(); // But otherwise, they are not.
+
     auto MangleAlignofSizeofArg = [&] {
       if (SAE->isArgumentType()) {
         Out << 't';
@@ -4380,6 +4422,7 @@ void CXXNameMangler::mangleExpression(const Expr *E, unsigned Arity) {
   }
 
   case Expr::CXXThrowExprClass: {
+    NotPrimaryExpr();
     const CXXThrowExpr *TE = cast<CXXThrowExpr>(E);
     //  <expression> ::= tw <expression>  # throw expression
     //               ::= tr               # rethrow
@@ -4393,6 +4436,7 @@ void CXXNameMangler::mangleExpression(const Expr *E, unsigned Arity) {
   }
 
   case Expr::CXXTypeidExprClass: {
+    NotPrimaryExpr();
     const CXXTypeidExpr *TIE = cast<CXXTypeidExpr>(E);
     //  <expression> ::= ti <type>        # typeid (type)
     //               ::= te <expression>  # typeid (expression)
@@ -4407,6 +4451,7 @@ void CXXNameMangler::mangleExpression(const Expr *E, unsigned Arity) {
   }
 
   case Expr::CXXDeleteExprClass: {
+    NotPrimaryExpr();
     const CXXDeleteExpr *DE = cast<CXXDeleteExpr>(E);
     //  <expression> ::= [gs] dl <expression>  # [::] delete expr
     //               ::= [gs] da <expression>  # [::] delete [] expr
@@ -4417,6 +4462,7 @@ void CXXNameMangler::mangleExpression(const Expr *E, unsigned Arity) {
   }
 
   case Expr::UnaryOperatorClass: {
+    NotPrimaryExpr();
     const UnaryOperator *UO = cast<UnaryOperator>(E);
     mangleOperatorName(UnaryOperator::getOverloadedOperator(UO->getOpcode()),
                        /*Arity=*/1);
@@ -4425,6 +4471,7 @@ void CXXNameMangler::mangleExpression(const Expr *E, unsigned Arity) {
   }
 
   case Expr::ArraySubscriptExprClass: {
+    NotPrimaryExpr();
     const ArraySubscriptExpr *AE = cast<ArraySubscriptExpr>(E);
 
     // Array subscript is treated as a syntactically weird form of
@@ -4436,6 +4483,7 @@ void CXXNameMangler::mangleExpression(const Expr *E, unsigned Arity) {
   }
 
   case Expr::MatrixSubscriptExprClass: {
+    NotPrimaryExpr();
     const MatrixSubscriptExpr *ME = cast<MatrixSubscriptExpr>(E);
     Out << "ixix";
     mangleExpression(ME->getBase());
@@ -4446,6 +4494,7 @@ void CXXNameMangler::mangleExpression(const Expr *E, unsigned Arity) {
 
   case Expr::CompoundAssignOperatorClass: // fallthrough
   case Expr::BinaryOperatorClass: {
+    NotPrimaryExpr();
     const BinaryOperator *BO = cast<BinaryOperator>(E);
     if (BO->getOpcode() == BO_PtrMemD)
       Out << "ds";
@@ -4458,6 +4507,7 @@ void CXXNameMangler::mangleExpression(const Expr *E, unsigned Arity) {
   }
 
   case Expr::CXXRewrittenBinaryOperatorClass: {
+    NotPrimaryExpr();
     // The mangled form represents the original syntax.
     CXXRewrittenBinaryOperator::DecomposedForm Decomposed =
         cast<CXXRewrittenBinaryOperator>(E)->getDecomposedForm();
@@ -4469,6 +4519,7 @@ void CXXNameMangler::mangleExpression(const Expr *E, unsigned Arity) {
   }
 
   case Expr::ConditionalOperatorClass: {
+    NotPrimaryExpr();
     const ConditionalOperator *CO = cast<ConditionalOperator>(E);
     mangleOperatorName(OO_Conditional, /*Arity=*/3);
     mangleExpression(CO->getCond());
@@ -4484,19 +4535,22 @@ void CXXNameMangler::mangleExpression(const Expr *E, unsigned Arity) {
   }
 
   case Expr::ObjCBridgedCastExprClass: {
+    NotPrimaryExpr();
     // Mangle ownership casts as a vendor extended operator __bridge,
     // __bridge_transfer, or __bridge_retain.
     StringRef Kind = cast<ObjCBridgedCastExpr>(E)->getBridgeKindName();
     Out << "v1U" << Kind.size() << Kind;
+    mangleCastExpression(E, "cv");
+    break;
   }
-  // Fall through to mangle the cast itself.
-  LLVM_FALLTHROUGH;
 
   case Expr::CStyleCastExprClass:
+    NotPrimaryExpr();
     mangleCastExpression(E, "cv");
     break;
 
   case Expr::CXXFunctionalCastExprClass: {
+    NotPrimaryExpr();
     auto *Sub = cast<ExplicitCastExpr>(E)->getSubExpr()->IgnoreImplicit();
     // FIXME: Add isImplicit to CXXConstructExpr.
     if (auto *CCE = dyn_cast<CXXConstructExpr>(Sub))
@@ -4516,22 +4570,28 @@ void CXXNameMangler::mangleExpression(const Expr *E, unsigned Arity) {
   }
 
   case Expr::CXXStaticCastExprClass:
+    NotPrimaryExpr();
     mangleCastExpression(E, "sc");
     break;
   case Expr::CXXDynamicCastExprClass:
+    NotPrimaryExpr();
     mangleCastExpression(E, "dc");
     break;
   case Expr::CXXReinterpretCastExprClass:
+    NotPrimaryExpr();
     mangleCastExpression(E, "rc");
     break;
   case Expr::CXXConstCastExprClass:
+    NotPrimaryExpr();
     mangleCastExpression(E, "cc");
     break;
   case Expr::CXXAddrspaceCastExprClass:
+    NotPrimaryExpr();
     mangleCastExpression(E, "ac");
     break;
 
   case Expr::CXXOperatorCallExprClass: {
+    NotPrimaryExpr();
     const CXXOperatorCallExpr *CE = cast<CXXOperatorCallExpr>(E);
     unsigned NumArgs = CE->getNumArgs();
     // A CXXOperatorCallExpr for OO_Arrow models only semantics, not syntax
@@ -4545,9 +4605,8 @@ void CXXNameMangler::mangleExpression(const Expr *E, unsigned Arity) {
   }
 
   case Expr::ParenExprClass:
-    mangleExpression(cast<ParenExpr>(E)->getSubExpr(), Arity);
-    break;
-
+    E = cast<ParenExpr>(E)->getSubExpr();
+    goto recurse;
 
   case Expr::ConceptSpecializationExprClass: {
     //  <expr-primary> ::= L <mangled-name> E # external name
@@ -4561,10 +4620,12 @@ void CXXNameMangler::mangleExpression(const Expr *E, unsigned Arity) {
   }
 
   case Expr::DeclRefExprClass:
-    mangleDeclRefExpr(cast<DeclRefExpr>(E)->getDecl());
+    // MangleDeclRefExpr helper handles primary-vs-nonprimary
+    MangleDeclRefExpr(cast<DeclRefExpr>(E)->getDecl());
     break;
 
   case Expr::SubstNonTypeTemplateParmPackExprClass:
+    NotPrimaryExpr();
     // FIXME: not clear how to mangle this!
     // template <unsigned N...> class A {
     //   template <class U...> void foo(U (&x)[N]...);
@@ -4573,14 +4634,16 @@ void CXXNameMangler::mangleExpression(const Expr *E, unsigned Arity) {
     break;
 
   case Expr::FunctionParmPackExprClass: {
+    NotPrimaryExpr();
     // FIXME: not clear how to mangle this!
     const FunctionParmPackExpr *FPPE = cast<FunctionParmPackExpr>(E);
     Out << "v110_SUBSTPACK";
-    mangleDeclRefExpr(FPPE->getParameterPack());
+    MangleDeclRefExpr(FPPE->getParameterPack());
     break;
   }
 
   case Expr::DependentScopeDeclRefExprClass: {
+    NotPrimaryExpr();
     const DependentScopeDeclRefExpr *DRE = cast<DependentScopeDeclRefExpr>(E);
     mangleUnresolvedName(DRE->getQualifier(), DRE->getDeclName(),
                          DRE->getTemplateArgs(), DRE->getNumTemplateArgs(),
@@ -4589,24 +4652,27 @@ void CXXNameMangler::mangleExpression(const Expr *E, unsigned Arity) {
   }
 
   case Expr::CXXBindTemporaryExprClass:
-    mangleExpression(cast<CXXBindTemporaryExpr>(E)->getSubExpr());
-    break;
+    E = cast<CXXBindTemporaryExpr>(E)->getSubExpr();
+    goto recurse;
 
   case Expr::ExprWithCleanupsClass:
-    mangleExpression(cast<ExprWithCleanups>(E)->getSubExpr(), Arity);
-    break;
+    E = cast<ExprWithCleanups>(E)->getSubExpr();
+    goto recurse;
 
   case Expr::FloatingLiteralClass: {
+    // <expr-primary>
     const FloatingLiteral *FL = cast<FloatingLiteral>(E);
     mangleFloatLiteral(FL->getType(), FL->getValue());
     break;
   }
 
   case Expr::FixedPointLiteralClass:
+    // Currently unimplemented -- might be <expr-primary> in future?
     mangleFixedPointLiteral();
     break;
 
   case Expr::CharacterLiteralClass:
+    // <expr-primary>
     Out << 'L';
     mangleType(E->getType());
     Out << cast<CharacterLiteral>(E)->getValue();
@@ -4615,18 +4681,21 @@ void CXXNameMangler::mangleExpression(const Expr *E, unsigned Arity) {
 
   // FIXME. __objc_yes/__objc_no are mangled same as true/false
   case Expr::ObjCBoolLiteralExprClass:
+    // <expr-primary>
     Out << "Lb";
     Out << (cast<ObjCBoolLiteralExpr>(E)->getValue() ? '1' : '0');
     Out << 'E';
     break;
 
   case Expr::CXXBoolLiteralExprClass:
+    // <expr-primary>
     Out << "Lb";
     Out << (cast<CXXBoolLiteralExpr>(E)->getValue() ? '1' : '0');
     Out << 'E';
     break;
 
   case Expr::IntegerLiteralClass: {
+    // <expr-primary>
     llvm::APSInt Value(cast<IntegerLiteral>(E)->getValue());
     if (E->getType()->isSignedIntegerType())
       Value.setIsSigned(true);
@@ -4635,6 +4704,7 @@ void CXXNameMangler::mangleExpression(const Expr *E, unsigned Arity) {
   }
 
   case Expr::ImaginaryLiteralClass: {
+    // <expr-primary>
     const ImaginaryLiteral *IE = cast<ImaginaryLiteral>(E);
     // Mangle as if a complex literal.
     // Proposal from David Vandevoorde, 2010.06.30.
@@ -4658,6 +4728,7 @@ void CXXNameMangler::mangleExpression(const Expr *E, unsigned Arity) {
   }
 
   case Expr::StringLiteralClass: {
+    // <expr-primary>
     // Revised proposal from David Vandervoorde, 2010.07.15.
     Out << 'L';
     assert(isa<ConstantArrayType>(E->getType()));
@@ -4667,21 +4738,25 @@ void CXXNameMangler::mangleExpression(const Expr *E, unsigned Arity) {
   }
 
   case Expr::GNUNullExprClass:
+    // <expr-primary>
     // Mangle as if an integer literal 0.
     mangleIntegerLiteral(E->getType(), llvm::APSInt(32));
     break;
 
   case Expr::CXXNullPtrLiteralExprClass: {
+    // <expr-primary>
     Out << "LDnE";
     break;
   }
 
   case Expr::PackExpansionExprClass:
+    NotPrimaryExpr();
     Out << "sp";
     mangleExpression(cast<PackExpansionExpr>(E)->getPattern());
     break;
 
   case Expr::SizeOfPackExprClass: {
+    NotPrimaryExpr();
     auto *SPE = cast<SizeOfPackExpr>(E);
     if (SPE->isPartiallySubstituted()) {
       Out << "sP";
@@ -4706,12 +4781,12 @@ void CXXNameMangler::mangleExpression(const Expr *E, unsigned Arity) {
     break;
   }
 
-  case Expr::MaterializeTemporaryExprClass: {
-    mangleExpression(cast<MaterializeTemporaryExpr>(E)->getSubExpr());
-    break;
-  }
+  case Expr::MaterializeTemporaryExprClass:
+    E = cast<MaterializeTemporaryExpr>(E)->getSubExpr();
+    goto recurse;
 
   case Expr::CXXFoldExprClass: {
+    NotPrimaryExpr();
     auto *FE = cast<CXXFoldExpr>(E);
     if (FE->isLeftFold())
       Out << (FE->getInit() ? "fL" : "fl");
@@ -4733,27 +4808,34 @@ void CXXNameMangler::mangleExpression(const Expr *E, unsigned Arity) {
   }
 
   case Expr::CXXThisExprClass:
+    NotPrimaryExpr();
     Out << "fpT";
     break;
 
   case Expr::CoawaitExprClass:
     // FIXME: Propose a non-vendor mangling.
+    NotPrimaryExpr();
     Out << "v18co_await";
     mangleExpression(cast<CoawaitExpr>(E)->getOperand());
     break;
 
   case Expr::DependentCoawaitExprClass:
     // FIXME: Propose a non-vendor mangling.
+    NotPrimaryExpr();
     Out << "v18co_await";
     mangleExpression(cast<DependentCoawaitExpr>(E)->getOperand());
     break;
 
   case Expr::CoyieldExprClass:
     // FIXME: Propose a non-vendor mangling.
+    NotPrimaryExpr();
     Out << "v18co_yield";
     mangleExpression(cast<CoawaitExpr>(E)->getOperand());
     break;
   }
+
+  if (AsTemplateArg && !IsPrimaryExpr)
+    Out << 'E';
 }
 
 /// Mangle an expression which refers to a parameter variable.
@@ -5003,10 +5085,9 @@ void CXXNameMangler::mangleTemplateArg(TemplateArgument A, bool NeedExactType) {
     Out << "Dp";
     mangleType(A.getAsTemplateOrTemplatePattern());
     break;
-  case TemplateArgument::Expression: {
+  case TemplateArgument::Expression:
     mangleTemplateArgExpr(A.getAsExpr());
     break;
-  }
   case TemplateArgument::Integral:
     mangleIntegerLiteral(A.getIntegralType(), A.getAsIntegral());
     break;
@@ -5062,9 +5143,22 @@ void CXXNameMangler::mangleTemplateArg(TemplateArgument A, bool NeedExactType) {
 }
 
 void CXXNameMangler::mangleTemplateArgExpr(const Expr *E) {
-  // It's possible to end up with a DeclRefExpr here in certain
-  // dependent cases, in which case we should mangle as a
-  // declaration.
+  ASTContext &Ctx = Context.getASTContext();
+  if (Ctx.getLangOpts().getClangABICompat() > LangOptions::ClangABI::Ver11) {
+    mangleExpression(E, UnknownArity, /*AsTemplateArg=*/true);
+    return;
+  }
+
+  // Prior to Clang 12, we didn't omit the X .. E around <expr-primary>
+  // correctly in cases where the template argument was
+  // constructed from an expression rather than an already-evaluated
+  // literal. In such a case, we would then e.g. emit 'XLi0EE' instead of
+  // 'Li0E'.
+  //
+  // We did special-case DeclRefExpr to attempt to DTRT for that one
+  // expression-kind, but while doing so, unfortunately handled ParmVarDecl
+  // (subtype of VarDecl) _incorrectly_, and emitted 'L_Z .. E' instead of
+  // the proper 'Xfp_E'.
   E = E->IgnoreParenImpCasts();
   if (const DeclRefExpr *DRE = dyn_cast<DeclRefExpr>(E)) {
     const ValueDecl *D = DRE->getDecl();
@@ -5075,7 +5169,6 @@ void CXXNameMangler::mangleTemplateArgExpr(const Expr *E) {
       return;
     }
   }
-
   Out << 'X';
   mangleExpression(E);
   Out << 'E';
diff --git a/clang/test/CodeGenCXX/clang-abi-compat.cpp b/clang/test/CodeGenCXX/clang-abi-compat.cpp
index 46e7ed812cbc..caf06bd5f9f6 100644
--- a/clang/test/CodeGenCXX/clang-abi-compat.cpp
+++ b/clang/test/CodeGenCXX/clang-abi-compat.cpp
@@ -1,12 +1,23 @@
-// RUN: %clang_cc1 -std=c++98 -triple x86_64-linux-gnu -fclang-abi-compat=3.0 %s -emit-llvm -o -    | FileCheck --check-prefixes=CHECK,PRE39,PRE5,PRE12 %s
-// RUN: %clang_cc1 -std=c++17 -triple x86_64-linux-gnu -fclang-abi-compat=3.0 %s -emit-llvm -o -    | FileCheck --check-prefixes=CHECK,PRE39,PRE5,PRE12 %s
-// RUN: %clang_cc1 -std=c++17 -triple x86_64-linux-gnu -fclang-abi-compat=3.8 %s -emit-llvm -o -    | FileCheck --check-prefixes=CHECK,PRE39,PRE5,PRE12 %s
-// RUN: %clang_cc1 -std=c++17 -triple x86_64-linux-gnu -fclang-abi-compat=3.9 %s -emit-llvm -o -    | FileCheck --check-prefixes=CHECK,V39,PRE5,PRE12 %s
-// RUN: %clang_cc1 -std=c++17 -triple x86_64-linux-gnu -fclang-abi-compat=4.0 %s -emit-llvm -o -    | FileCheck --check-prefixes=CHECK,V39,PRE5,PRE12 %s
-// RUN: %clang_cc1 -std=c++17 -triple x86_64-linux-gnu -fclang-abi-compat=5 %s -emit-llvm -o -      | FileCheck --check-prefixes=CHECK,V39,V5,PRE12,PRE12-CXX17 %s
-// RUN: %clang_cc1 -std=c++17 -triple x86_64-linux-gnu -fclang-abi-compat=11 %s -emit-llvm -o -     | FileCheck --check-prefixes=CHECK,V39,V5,PRE12,PRE12-CXX17 %s
-// RUN: %clang_cc1 -std=c++98 -triple x86_64-linux-gnu -fclang-abi-compat=latest %s -emit-llvm -o - | FileCheck --check-prefixes=CHECK,V39,V5,V12 %s
-// RUN: %clang_cc1 -std=c++20 -triple x86_64-linux-gnu -fclang-abi-compat=latest %s -emit-llvm -o - | FileCheck --check-prefixes=CHECK,V39,V5,V12,V12-CXX17 %s
+// RUN: %clang_cc1 -std=c++98 -triple x86_64-linux-gnu -fenable-matrix -fclang-abi-compat=3.0 %s -emit-llvm -o - -Wno-c++11-extensions \
+// RUN:     | FileCheck --check-prefixes=CHECK,PRE39,PRE5,PRE12 %s
+// RUN: %clang_cc1 -std=c++17 -triple x86_64-linux-gnu -fenable-matrix -fclang-abi-compat=3.0 %s -emit-llvm -o - \
+// RUN:     | FileCheck --check-prefixes=CHECK,PRE39,PRE5,PRE12 %s
+// RUN: %clang_cc1 -std=c++17 -triple x86_64-linux-gnu -fenable-matrix -fclang-abi-compat=3.8 %s -emit-llvm -o - \
+// RUN:     | FileCheck --check-prefixes=CHECK,PRE39,PRE5,PRE12 %s
+// RUN: %clang_cc1 -std=c++17 -triple x86_64-linux-gnu -fenable-matrix -fclang-abi-compat=3.9 %s -emit-llvm -o - \
+// RUN:     | FileCheck --check-prefixes=CHECK,V39,PRE5,PRE12 %s
+// RUN: %clang_cc1 -std=c++17 -triple x86_64-linux-gnu -fenable-matrix -fclang-abi-compat=4.0 %s -emit-llvm -o - \
+// RUN:     | FileCheck --check-prefixes=CHECK,V39,PRE5,PRE12 %s
+// RUN: %clang_cc1 -std=c++17 -triple x86_64-linux-gnu -fenable-matrix -fclang-abi-compat=5 %s -emit-llvm -o - \
+// RUN:     | FileCheck --check-prefixes=CHECK,V39,V5,PRE12,PRE12-CXX17 %s
+// RUN: %clang_cc1 -std=c++17 -triple x86_64-linux-gnu -fenable-matrix -fclang-abi-compat=11 %s -emit-llvm -o - \
+// RUN:     | FileCheck --check-prefixes=CHECK,V39,V5,PRE12,PRE12-CXX17 %s
+// RUN: %clang_cc1 -std=c++20 -triple x86_64-linux-gnu -fenable-matrix -fclang-abi-compat=11 %s -emit-llvm -o - \
+// RUN:     | FileCheck --check-prefixes=CHECK,V39,V5,PRE12,PRE12-CXX17,PRE12-CXX20 %s
+// RUN: %clang_cc1 -std=c++98 -triple x86_64-linux-gnu -fenable-matrix -fclang-abi-compat=latest %s -emit-llvm -o - -Wno-c++11-extensions \
+// RUN:     | FileCheck --check-prefixes=CHECK,V39,V5,V12 %s
+// RUN: %clang_cc1 -std=c++20 -triple x86_64-linux-gnu -fenable-matrix -fclang-abi-compat=latest %s -emit-llvm -o - \
+// RUN:     | FileCheck --check-prefixes=CHECK,V39,V5,V12,V12-CXX17,V12-CXX20 %s
 
 typedef __attribute__((vector_size(8))) long long v1xi64;
 void clang39(v1xi64) {}
@@ -55,3 +66,68 @@ template void clang12_b<arr>();
 // CHECK: @_Z9clang12_cIXadL_Z3arrEEEvv
 template<const char (*)[6]> void clang12_c() {}
 template void clang12_c<&arr>();
+
+
+/// Tests for <template-arg> <expr-primary> changes in clang12:
+namespace expr_primary {
+struct A {
+  template<int N> struct Int {};
+  template<int& N> struct Ref {};
+};
+
+/// Check various DeclRefExpr manglings
+
+// PRE12: @_ZN12expr_primary5test1INS_1AEEEvNT_3IntIXLi1EEEE
+// V12:   @_ZN12expr_primary5test1INS_1AEEEvNT_3IntILi1EEE
+template <typename T> void test1(typename T::template Int<1> a) {}
+template void test1<A>(typename A::template Int<1> a);
+
+enum Enum { EnumVal = 4 };
+int Global;
+
+// PRE12: @_ZN12expr_primary5test2INS_1AEEEvNT_3IntIXLNS_4EnumE4EEEE
+// V12:   @_ZN12expr_primary5test2INS_1AEEEvNT_3IntILNS_4EnumE4EEE
+template <typename T> void test2(typename T::template Int<EnumVal> a) {}
+template void test2<A>(typename A::template Int<4> a);
+
+// CHECK: @_ZN12expr_primary5test3ILi3EEEvNS_1A3IntIXT_EEE
+template <int X> void test3(typename A::template Int<X> a) {}
+template void test3<3>(A::Int<3> a);
+
+#if __cplusplus >= 202002L
+// CHECK-CXX20: @_ZN12expr_primary5test4INS_1AEEEvNT_3RefIL_ZNS_6GlobalEEEE
+template <typename T> void test4(typename T::template Ref<(Global)> a) {}
+template void test4<A>(typename A::template Ref<Global> a);
+
+struct B {
+  struct X {
+    constexpr X(double) {}
+    constexpr X(int&) {}
+  };
+  template<X> struct Y {};
+};
+
+// PRE12-CXX20: _ZN12expr_primary5test5INS_1BEEEvNT_1YIXLd3ff0000000000000EEEE
+// V12-CXX20: _ZN12expr_primary5test5INS_1BEEEvNT_1YILd3ff0000000000000EEE
+template<typename T> void test5(typename T::template Y<1.0>) { }
+template void test5<B>(typename B::Y<1.0>);
+
+// PRE12-CXX20: @_ZN12expr_primary5test6INS_1BEEENT_1YIL_ZZNS_5test6EiE1bEEEi
+// V12-CXX20:   @_ZN12expr_primary5test6INS_1BEEENT_1YIXfp_EEEi
+template<typename T> auto test6(int b) -> typename T::template Y<b> { return {}; }
+template auto test6<B>(int b) -> B::Y<b>;
+#endif
+
+/// Verify non-dependent type-traits within a dependent template arg.
+
+// PRE12: @_ZN12expr_primary5test7INS_1AEEEvNT_3IntIXLm1EEEE
+// V12:   @_ZN12expr_primary5test7INS_1AEEEvNT_3IntILm1EEE
+template <class T> void test7(typename T::template Int<sizeof(char)> a) {}
+template void test7<A>(A::Int<1>);
+
+// PRE12: @_ZN12expr_primary5test8ILi2EEEvu11matrix_typeIXLi1EEXT_EiE
+// V12:   @_ZN12expr_primary5test8ILi2EEEvu11matrix_typeILi1EXT_EiE
+template<int N> using matrix1xN = int __attribute__((matrix_type(1, N)));
+template<int N> void test8(matrix1xN<N> a) {}
+template void test8<2>(matrix1xN<2> a);
+}
diff --git a/clang/test/CodeGenCXX/mangle-abi-tag.cpp b/clang/test/CodeGenCXX/mangle-abi-tag.cpp
index 5d84096d24cd..9e26604a2c44 100644
--- a/clang/test/CodeGenCXX/mangle-abi-tag.cpp
+++ b/clang/test/CodeGenCXX/mangle-abi-tag.cpp
@@ -225,7 +225,7 @@ namespace pr30440 {
 template<class F> void g(F);
 template<class ...A> auto h(A ...a)->decltype (g (0, g < a > (a) ...)) {
 }
-// CHECK-DAG: define {{.*}} @_ZN7pr304401hIJEEEDTcl1gLi0Espcl1gIL_ZZNS_1hEDpT_E1aEEfp_EEES2_(
+// CHECK-DAG: define {{.*}} @_ZN7pr304401hIJEEEDTcl1gLi0Espcl1gIXfp_EEfp_EEEDpT_(
 
 void pr30440_test () {
   h();
diff --git a/clang/test/CodeGenCXX/mangle-concept.cpp b/clang/test/CodeGenCXX/mangle-concept.cpp
index b0fcd586727e..e60e6348a5f6 100644
--- a/clang/test/CodeGenCXX/mangle-concept.cpp
+++ b/clang/test/CodeGenCXX/mangle-concept.cpp
@@ -6,11 +6,11 @@ template <bool> struct S {};
 template <typename> concept C = true;
 template <typename T = int> S<C<T>> f0() { return S<C<T>>{}; }
 template S<C<int>> f0<>();
-// CHECK: @_ZN5test12f0IiEENS_1SIXL_ZNS_1CIT_EEEEEEv(
+// CHECK: @_ZN5test12f0IiEENS_1SIL_ZNS_1CIT_EEEEEv(
 }
 
 template <bool> struct S {};
 template <typename> concept C = true;
 template <typename T = int> S<C<T>> f0() { return S<C<T>>{}; }
 template S<C<int>> f0<>();
-// CHECK: @_Z2f0IiE1SIXL_Z1CIT_EEEEv(
+// CHECK: @_Z2f0IiE1SIL_Z1CIT_EEEv(
diff --git a/clang/test/CodeGenCXX/mangle-template.cpp b/clang/test/CodeGenCXX/mangle-template.cpp
index 40688de7e12e..9b80a6d64695 100644
--- a/clang/test/CodeGenCXX/mangle-template.cpp
+++ b/clang/test/CodeGenCXX/mangle-template.cpp
@@ -270,7 +270,7 @@ namespace test17 {
   // Note: there is no J...E here, because we can't form a pack argument, and
   // the 5u and 6u are mangled with the original type 'j' (unsigned int) not
   // with the resolved type 'i' (signed int).
-  // CHECK: define {{.*}} @_ZN6test171hILi4EJLi1ELi2ELi3EEEEvNS_1XIXspT0_EXLj5EEXT_EXLj6EEEE
+  // CHECK: define {{.*}} @_ZN6test171hILi4EJLi1ELi2ELi3EEEEvNS_1XIXspT0_ELj5EXT_ELj6EEE
   template<int D, int ...C> void h(X<C..., 5u, D, 6u>) {}
   void i() { h<4, 1, 2, 3>({}); }
 
@@ -323,7 +323,7 @@ namespace partially_dependent_template_args {
     // callee is unresolved, the rest mangle the converted argument Lj0E
     // because the callee is resolved.
     void h() {
-      // CHECK: @_ZN33partially_dependent_template_args5test22g1INS0_1XEEEvDTcl1fIXLi0EEEcvT__EEE
+      // CHECK: @_ZN33partially_dependent_template_args5test22g1INS0_1XEEEvDTcl1fILi0EEcvT__EEE
       g1<X>({});
       // CHECK: @_ZN33partially_dependent_template_args5test22g2IiEEvDTplclL_ZNS0_1fILj0EEEiNS0_1XEEilEEcvT__EE
       g2<int>({});
diff --git a/clang/test/CodeGenCXX/mangle.cpp b/clang/test/CodeGenCXX/mangle.cpp
index f8ea9960a5c5..6cec33e3758e 100644
--- a/clang/test/CodeGenCXX/mangle.cpp
+++ b/clang/test/CodeGenCXX/mangle.cpp
@@ -1123,7 +1123,7 @@ namespace test56 {
 namespace test57 {
   struct X { template <int N> int f(); } x;
   template<int N> void f(decltype(x.f<0>() + N)) {}
-  // CHECK-LABEL: @_ZN6test571fILi0EEEvDTplcldtL_ZNS_1xEE1fIXLi0EEEET_E
+  // CHECK-LABEL: @_ZN6test571fILi0EEEvDTplcldtL_ZNS_1xEE1fILi0EEET_E
   template void f<0>(int);
 }
 
diff --git a/clang/test/CodeGenCXX/matrix-type.cpp b/clang/test/CodeGenCXX/matrix-type.cpp
index 9bde12e13b86..9e715e10ce1c 100644
--- a/clang/test/CodeGenCXX/matrix-type.cpp
+++ b/clang/test/CodeGenCXX/matrix-type.cpp
@@ -215,14 +215,14 @@ void test_template_deduction() {
   // CHECK-NEXT:    %m4 = alloca [144 x float], align 4
   // CHECK-NEXT:    %v = alloca %struct.selector.3, align 1
   // CHECK-NEXT:    %undef.agg.tmp4 = alloca %struct.selector.3, align 1
-  // CHECK-NEXT:    call void @_Z10use_matrixIiLm12EE8selectorILi3EERu11matrix_typeIXLm10EEXT0_ET_E([120 x i32]* nonnull align 4 dereferenceable(480) %m0)
+  // CHECK-NEXT:    call void @_Z10use_matrixIiLm12EE8selectorILi3EERu11matrix_typeILm10EXT0_ET_E([120 x i32]* nonnull align 4 dereferenceable(480) %m0)
   // CHECK-NEXT:    call void @_Z10use_matrixIiE8selectorILi2EERu11matrix_typeILm10ELm10ET_E([100 x i32]* nonnull align 4 dereferenceable(400) %m1)
-  // CHECK-NEXT:    call void @_Z10use_matrixIiLm12EE8selectorILi1EERu11matrix_typeIXT0_EXLm10EET_E([120 x i32]* nonnull align 4 dereferenceable(480) %m2)
+  // CHECK-NEXT:    call void @_Z10use_matrixIiLm12EE8selectorILi1EERu11matrix_typeIXT0_ELm10ET_E([120 x i32]* nonnull align 4 dereferenceable(480) %m2)
   // CHECK-NEXT:    call void @_Z10use_matrixIiLm12ELm12EE8selectorILi0EERu11matrix_typeIXT0_EXT1_ET_E([144 x i32]* nonnull align 4 dereferenceable(576) %m3)
   // CHECK-NEXT:    call void @_Z10use_matrixILm12ELm12EE8selectorILi4EERu11matrix_typeIXT_EXT0_EfE([144 x float]* nonnull align 4 dereferenceable(576) %m4)
   // CHECK-NEXT:    ret void
 
-  // CHECK-LABEL: define linkonce_odr void @_Z10use_matrixIiLm12EE8selectorILi3EERu11matrix_typeIXLm10EEXT0_ET_E([120 x i32]* nonnull align 4 dereferenceable(480) %m)
+  // CHECK-LABEL: define linkonce_odr void @_Z10use_matrixIiLm12EE8selectorILi3EERu11matrix_typeILm10EXT0_ET_E([120 x i32]* nonnull align 4 dereferenceable(480) %m)
   // CHECK-NEXT:  entry:
   // CHECK-NEXT:    %m.addr = alloca [120 x i32]*, align 8
   // CHECK-NEXT:    store [120 x i32]* %m, [120 x i32]** %m.addr, align 8
@@ -236,7 +236,7 @@ void test_template_deduction() {
   // CHECK-NEXT:    call void @llvm.trap()
   // CHECK-NEXT:    unreachable
 
-  // CHECK-LABEL: define linkonce_odr void @_Z10use_matrixIiLm12EE8selectorILi1EERu11matrix_typeIXT0_EXLm10EET_E([120 x i32]* nonnull align 4 dereferenceable(480) %m)
+  // CHECK-LABEL: define linkonce_odr void @_Z10use_matrixIiLm12EE8selectorILi1EERu11matrix_typeIXT0_ELm10ET_E([120 x i32]* nonnull align 4 dereferenceable(480) %m)
   // CHECK-NEXT:  entry:
   // CHECK-NEXT:    %m.addr = alloca [120 x i32]*, align 8
   // CHECK-NEXT:    store [120 x i32]* %m, [120 x i32]** %m.addr, align 8
@@ -277,10 +277,10 @@ void test_auto_t() {
   // CHECK-LABEL: define{{.*}} void @_Z11test_auto_tv()
   // CHECK-NEXT:  entry:
   // CHECK-NEXT:    %m = alloca [130 x i32], align 4
-  // CHECK-NEXT:    call void @_Z3fooILm13EEvRu11matrix_typeIXT_EXLm10EEiE([130 x i32]* nonnull align 4 dereferenceable(520) %m)
+  // CHECK-NEXT:    call void @_Z3fooILm13EEvRu11matrix_typeIXT_ELm10EiE([130 x i32]* nonnull align 4 dereferenceable(520) %m)
   // CHECK-NEXT:    ret void
 
-  // CHECK-LABEL: define linkonce_odr void @_Z3fooILm13EEvRu11matrix_typeIXT_EXLm10EEiE([130 x i32]* nonnull align 4 dereferenceable(520) %m)
+  // CHECK-LABEL: define linkonce_odr void @_Z3fooILm13EEvRu11matrix_typeIXT_ELm10EiE([130 x i32]* nonnull align 4 dereferenceable(520) %m)
   // CHECK-NEXT:  entry:
   // CHECK-NEXT:    %m.addr = alloca [130 x i32]*, align 8
   // CHECK-NEXT:    store [130 x i32]* %m, [130 x i32]** %m.addr, align 8
@@ -326,7 +326,7 @@ void test_use_matrix_2() {
   // CHECK-NEXT:    store <40 x float> %call, <40 x float>* %0, align 4
   // CHECK-NEXT:    call void @_Z12use_matrix_2ILm2ELm12EE8selectorILi0EERu11matrix_typeIXplT_Li2EEXdvT0_Li2EEiERu11matrix_typeIXT_EXT0_EfE([24 x i32]* nonnull align 4 dereferenceable(96) %m1, [24 x float]* nonnull align 4 dereferenceable(96) %m2)
   // CHECK-NEXT:    call void @_Z12use_matrix_2ILm5ELm8EE8selectorILi1EERu11matrix_typeIXplT_T0_EXT0_EiERu11matrix_typeIXT_EXmiT0_T_EfE([104 x i32]* nonnull align 4 dereferenceable(416) %m3, [15 x float]* nonnull align 4 dereferenceable(60) %m4)
-  // CHECK-NEXT:    %call2 = call <20 x float> @_Z12use_matrix_2ILm5EEu11matrix_typeIXplT_T_EXmiT_Li3EEfERu11matrix_typeIXT_EXLm10EEiE([50 x i32]* nonnull align 4 dereferenceable(200) %m5)
+  // CHECK-NEXT:    %call2 = call <20 x float> @_Z12use_matrix_2ILm5EEu11matrix_typeIXplT_T_EXmiT_Li3EEfERu11matrix_typeIXT_ELm10EiE([50 x i32]* nonnull align 4 dereferenceable(200) %m5)
   // CHECK-NEXT:    %1 = bitcast [20 x float]* %r4 to <20 x float>*
   // CHECK-NEXT:    store <20 x float> %call2, <20 x float>* %1, align 4
   // CHECK-NEXT:    call void @_Z12use_matrix_3ILm6EE8selectorILi2EERu11matrix_typeIXmiT_Li2EEXT_EiE([24 x i32]* nonnull align 4 dereferenceable(96) %m1)
@@ -357,7 +357,7 @@ void test_use_matrix_2() {
   // CHECK-NEXT:    call void @llvm.trap()
   // CHECK-NEXT:    unreachable
 
-  // CHECK-LABEL: define linkonce_odr <20 x float> @_Z12use_matrix_2ILm5EEu11matrix_typeIXplT_T_EXmiT_Li3EEfERu11matrix_typeIXT_EXLm10EEiE([50 x i32]* nonnull align 4 dereferenceable(200) %m1)
+  // CHECK-LABEL: define linkonce_odr <20 x float> @_Z12use_matrix_2ILm5EEu11matrix_typeIXplT_T_EXmiT_Li3EEfERu11matrix_typeIXT_ELm10EiE([50 x i32]* nonnull align 4 dereferenceable(200) %m1)
   // CHECK-NEXT:  entry:
   // CHECK-NEXT:    %m1.addr = alloca [50 x i32]*, align 8
   // CHECK-NEXT:    store [50 x i32]* %m1, [50 x i32]** %m1.addr, align 8
diff --git a/clang/test/CodeGenCXX/microsoft-uuidof-mangling.cpp b/clang/test/CodeGenCXX/microsoft-uuidof-mangling.cpp
index 321f65cacc71..5c02b1eb014c 100644
--- a/clang/test/CodeGenCXX/microsoft-uuidof-mangling.cpp
+++ b/clang/test/CodeGenCXX/microsoft-uuidof-mangling.cpp
@@ -60,14 +60,12 @@ int main(int argc, const char * argv[])
 // CHECK-V11: call void @_Z15test_uuidofExprI9HasMemberEvDTu8__uuidofzsrT_6memberE(
 // CHECK-V12: call void @_Z15test_uuidofExprI9HasMemberEvDTu8__uuidofXsrT_6memberEEE(
 // CHECK-V11: call void @_Z16test_uuidofExpr2I10TestStructEvDTcmtlT_Eu8__uuidofzL_ZN9HasMember6memberEEE(
-// CHECK-V12: call void @_Z16test_uuidofExpr2I10TestStructEvDTcmtlT_Eu8__uuidofXL_ZN9HasMember6memberEEEEE(
-//    TODO: the above mangling is wrong -- the X/E shouldn't be emitted:       ^                     ^
+// CHECK-V12: call void @_Z16test_uuidofExpr2I10TestStructEvDTcmtlT_Eu8__uuidofL_ZN9HasMember6memberEEEE(
 // CHECK: define linkonce_odr void @_ZN8UUIDTestI10TestStructL_Z42_GUID_eafa1952_66f8_438b_8fba_af1bbae42191EEC1Ev
 // CHECK-V11: define linkonce_odr void @_Z15test_uuidofTypeI10TestStructEvDTu8__uuidoftT_E(
 // CHECK-V12: define linkonce_odr void @_Z15test_uuidofTypeI10TestStructEvDTu8__uuidofT_EE(
 // CHECK-V11: define linkonce_odr void @_Z15test_uuidofExprI9HasMemberEvDTu8__uuidofzsrT_6memberE(
 // CHECK-V12: define linkonce_odr void @_Z15test_uuidofExprI9HasMemberEvDTu8__uuidofXsrT_6memberEEE(
 // CHECK-V11: define linkonce_odr void @_Z16test_uuidofExpr2I10TestStructEvDTcmtlT_Eu8__uuidofzL_ZN9HasMember6memberEEE(
-// CHECK-V12: define linkonce_odr void @_Z16test_uuidofExpr2I10TestStructEvDTcmtlT_Eu8__uuidofXL_ZN9HasMember6memberEEEEE(
-//    TODO: the above mangling is wrong -- the X/E shouldn't be emitted:                      ^                     ^
+// CHECK-V12: define linkonce_odr void @_Z16test_uuidofExpr2I10TestStructEvDTcmtlT_Eu8__uuidofL_ZN9HasMember6memberEEEE(
 // CHECK: define linkonce_odr void @_ZN8UUIDTestI10TestStructL_Z42_GUID_eafa1952_66f8_438b_8fba_af1bbae42191EEC2Ev

From 0b7b698fecd37415a635a586e5ca159ab0b8872f Mon Sep 17 00:00:00 2001
From: James Y Knight <jyknight@google.com>
Date: Sun, 24 Jan 2021 16:23:58 -0500
Subject: [PATCH 011/318] Itanium Mangling: In 'enable_if', omit X/E around
 <expr-primary>.

The Clang enable_if extension is mangled as an <extended-qualifier>,
which is supposed to contain <template-args>. However, we were
unconditionally emitting X/E around its arguments, neglecting the fact
that <expr-primary> should be emitted directly without the surrounding
X/E.

Differential Revision: https://reviews.llvm.org/D95488

(cherry picked from commit a7246ba02a8923f316419a62d836dbe1c0b437bd)
---
 clang/lib/AST/ItaniumMangle.cpp            | 14 +++++++--
 clang/test/CodeGen/enable_if.c             | 34 +++++++++++-----------
 clang/test/CodeGenCXX/clang-abi-compat.cpp |  5 ++++
 clang/test/CodeGenCXX/enable_if.cpp        |  2 +-
 4 files changed, 34 insertions(+), 21 deletions(-)

diff --git a/clang/lib/AST/ItaniumMangle.cpp b/clang/lib/AST/ItaniumMangle.cpp
index 54e2f361a9f1..4420f6a2c1c3 100644
--- a/clang/lib/AST/ItaniumMangle.cpp
+++ b/clang/lib/AST/ItaniumMangle.cpp
@@ -727,9 +727,17 @@ void CXXNameMangler::mangleFunctionEncodingBareType(const FunctionDecl *FD) {
       EnableIfAttr *EIA = dyn_cast<EnableIfAttr>(*I);
       if (!EIA)
         continue;
-      Out << 'X';
-      mangleExpression(EIA->getCond());
-      Out << 'E';
+      if (Context.getASTContext().getLangOpts().getClangABICompat() >
+          LangOptions::ClangABI::Ver11) {
+        mangleTemplateArgExpr(EIA->getCond());
+      } else {
+        // Prior to Clang 12, we hardcoded the X/E around enable-if's argument,
+        // even though <template-arg> should not include an X/E around
+        // <expr-primary>.
+        Out << 'X';
+        mangleExpression(EIA->getCond());
+        Out << 'E';
+      }
     }
     Out << 'E';
     FunctionTypeDepth.pop(Saved);
diff --git a/clang/test/CodeGen/enable_if.c b/clang/test/CodeGen/enable_if.c
index 14550b9e2db9..327a201cdeba 100644
--- a/clang/test/CodeGen/enable_if.c
+++ b/clang/test/CodeGen/enable_if.c
@@ -31,22 +31,22 @@ void bar(int m) __attribute__((overloadable, enable_if(m > 0, "")));
 void bar(int m) __attribute__((overloadable, enable_if(1, "")));
 // CHECK-LABEL: define{{.*}} void @test2
 void test2() {
-  // CHECK: store void (i32)* @_Z3barUa9enable_ifIXLi1EEEi
+  // CHECK: store void (i32)* @_Z3barUa9enable_ifILi1EEi
   void (*p)(int) = bar;
-  // CHECK: store void (i32)* @_Z3barUa9enable_ifIXLi1EEEi
+  // CHECK: store void (i32)* @_Z3barUa9enable_ifILi1EEi
   void (*p2)(int) = &bar;
-  // CHECK: store void (i32)* @_Z3barUa9enable_ifIXLi1EEEi
+  // CHECK: store void (i32)* @_Z3barUa9enable_ifILi1EEi
   p = bar;
-  // CHECK: store void (i32)* @_Z3barUa9enable_ifIXLi1EEEi
+  // CHECK: store void (i32)* @_Z3barUa9enable_ifILi1EEi
   p = &bar;
 
-  // CHECK: store i8* bitcast (void (i32)* @_Z3barUa9enable_ifIXLi1EEEi to i8*)
+  // CHECK: store i8* bitcast (void (i32)* @_Z3barUa9enable_ifILi1EEi to i8*)
   void *vp1 = (void*)&bar;
-  // CHECK: store i8* bitcast (void (i32)* @_Z3barUa9enable_ifIXLi1EEEi to i8*)
+  // CHECK: store i8* bitcast (void (i32)* @_Z3barUa9enable_ifILi1EEi to i8*)
   void *vp2 = (void*)bar;
-  // CHECK: store i8* bitcast (void (i32)* @_Z3barUa9enable_ifIXLi1EEEi to i8*)
+  // CHECK: store i8* bitcast (void (i32)* @_Z3barUa9enable_ifILi1EEi to i8*)
   vp1 = (void*)&bar;
-  // CHECK: store i8* bitcast (void (i32)* @_Z3barUa9enable_ifIXLi1EEEi to i8*)
+  // CHECK: store i8* bitcast (void (i32)* @_Z3barUa9enable_ifILi1EEi to i8*)
   vp1 = (void*)bar;
 }
 
@@ -54,13 +54,13 @@ void baz(int m) __attribute__((overloadable, enable_if(1, "")));
 void baz(int m) __attribute__((overloadable));
 // CHECK-LABEL: define{{.*}} void @test3
 void test3() {
-  // CHECK: store void (i32)* @_Z3bazUa9enable_ifIXLi1EEEi
+  // CHECK: store void (i32)* @_Z3bazUa9enable_ifILi1EEi
   void (*p)(int) = baz;
-  // CHECK: store void (i32)* @_Z3bazUa9enable_ifIXLi1EEEi
+  // CHECK: store void (i32)* @_Z3bazUa9enable_ifILi1EEi
   void (*p2)(int) = &baz;
-  // CHECK: store void (i32)* @_Z3bazUa9enable_ifIXLi1EEEi
+  // CHECK: store void (i32)* @_Z3bazUa9enable_ifILi1EEi
   p = baz;
-  // CHECK: store void (i32)* @_Z3bazUa9enable_ifIXLi1EEEi
+  // CHECK: store void (i32)* @_Z3bazUa9enable_ifILi1EEi
   p = &baz;
 }
 
@@ -71,13 +71,13 @@ void qux(int m) __attribute__((overloadable, enable_if(1, ""),
 void qux(int m) __attribute__((overloadable, enable_if(1, "")));
 // CHECK-LABEL: define{{.*}} void @test4
 void test4() {
-  // CHECK: store void (i32)* @_Z3quxUa9enable_ifIXLi1EEXLi1EEEi
+  // CHECK: store void (i32)* @_Z3quxUa9enable_ifILi1ELi1EEi
   void (*p)(int) = qux;
-  // CHECK: store void (i32)* @_Z3quxUa9enable_ifIXLi1EEXLi1EEEi
+  // CHECK: store void (i32)* @_Z3quxUa9enable_ifILi1ELi1EEi
   void (*p2)(int) = &qux;
-  // CHECK: store void (i32)* @_Z3quxUa9enable_ifIXLi1EEXLi1EEEi
+  // CHECK: store void (i32)* @_Z3quxUa9enable_ifILi1ELi1EEi
   p = qux;
-  // CHECK: store void (i32)* @_Z3quxUa9enable_ifIXLi1EEXLi1EEEi
+  // CHECK: store void (i32)* @_Z3quxUa9enable_ifILi1ELi1EEi
   p = &qux;
 }
 
@@ -90,6 +90,6 @@ void test5() {
   int foo(char *i __attribute__((pass_object_size(0))))
       __attribute__((enable_if(1, ""), overloadable));
 
-  // CHECK: call i32 @_Z3fooUa9enable_ifIXLi1EEEPcU17pass_object_size0
+  // CHECK: call i32 @_Z3fooUa9enable_ifILi1EEPcU17pass_object_size0
   foo((void*)0);
 }
diff --git a/clang/test/CodeGenCXX/clang-abi-compat.cpp b/clang/test/CodeGenCXX/clang-abi-compat.cpp
index caf06bd5f9f6..80311aa320fe 100644
--- a/clang/test/CodeGenCXX/clang-abi-compat.cpp
+++ b/clang/test/CodeGenCXX/clang-abi-compat.cpp
@@ -130,4 +130,9 @@ template void test7<A>(A::Int<1>);
 template<int N> using matrix1xN = int __attribute__((matrix_type(1, N)));
 template<int N> void test8(matrix1xN<N> a) {}
 template void test8<2>(matrix1xN<2> a);
+
+// PRE12: @_ZN12expr_primary5test9EUa9enable_ifIXLi1EEEv
+// V12:   @_ZN12expr_primary5test9EUa9enable_ifILi1EEv
+void test9(void) __attribute__((enable_if(1, ""))) {}
+
 }
diff --git a/clang/test/CodeGenCXX/enable_if.cpp b/clang/test/CodeGenCXX/enable_if.cpp
index 4e7707aaeed9..70386b87fcee 100644
--- a/clang/test/CodeGenCXX/enable_if.cpp
+++ b/clang/test/CodeGenCXX/enable_if.cpp
@@ -5,7 +5,7 @@ int test5(int);
 template <typename T>
 T test5(T) __attribute__((enable_if(1, "better than non-template")));
 
-// CHECK: @_Z5test5IiEUa9enable_ifIXLi1EEET_S0_
+// CHECK: @_Z5test5IiEUa9enable_ifILi1EET_S0_
 int (*Ptr)(int) = &test5;
 
 // Test itanium mangling for attribute enable_if

From de3396d89d998769c3310c23bdd49babade9d874 Mon Sep 17 00:00:00 2001
From: Tom Stellard <tstellar@redhat.com>
Date: Thu, 28 Jan 2021 15:30:21 -0800
Subject: [PATCH 012/318] workflows: Update branch names

Also remove main-brancy-sync workflow that was removed from the main branch.
---
 .github/workflows/clang-tests.yml      |  6 +++---
 .github/workflows/libclc-tests.yml     |  6 +++---
 .github/workflows/lld-tests.yml        |  6 +++---
 .github/workflows/lldb-tests.yml       |  6 +++---
 .github/workflows/llvm-tests.yml       | 10 +++++-----
 .github/workflows/main-branch-sync.yml | 25 -------------------------
 6 files changed, 17 insertions(+), 42 deletions(-)
 delete mode 100644 .github/workflows/main-branch-sync.yml

diff --git a/.github/workflows/clang-tests.yml b/.github/workflows/clang-tests.yml
index f8ca65e10726..af0b5eabeeda 100644
--- a/.github/workflows/clang-tests.yml
+++ b/.github/workflows/clang-tests.yml
@@ -28,16 +28,16 @@ jobs:
     steps:
     - name: Setup Windows
       if: startsWith(matrix.os, 'windows')
-      uses: llvm/actions/setup-windows@master
+      uses: llvm/actions/setup-windows@main
       with:
         arch: amd64
     - name: Install Ninja
-      uses: llvm/actions/install-ninja@master
+      uses: llvm/actions/install-ninja@main
     - uses: actions/checkout@v1
       with:
         fetch-depth: 1
     - name: Test clang
-      uses: llvm/actions/build-test-llvm-project@master
+      uses: llvm/actions/build-test-llvm-project@main
       with:
         cmake_args: -G Ninja  -DLLVM_ENABLE_PROJECTS="clang" -DCMAKE_BUILD_TYPE=Release
         build_target: check-clang
diff --git a/.github/workflows/libclc-tests.yml b/.github/workflows/libclc-tests.yml
index 4e8639b1c89a..2f1eb2939ea2 100644
--- a/.github/workflows/libclc-tests.yml
+++ b/.github/workflows/libclc-tests.yml
@@ -31,16 +31,16 @@ jobs:
     steps:
     - name: Setup Windows
       if: startsWith(matrix.os, 'windows')
-      uses: llvm/actions/setup-windows@master
+      uses: llvm/actions/setup-windows@main
       with:
         arch: amd64
     - name: Install Ninja
-      uses: llvm/actions/install-ninja@master
+      uses: llvm/actions/install-ninja@main
     - uses: actions/checkout@v1
       with:
         fetch-depth: 1
     - name: Build clang
-      uses: llvm/actions/build-test-llvm-project@master
+      uses: llvm/actions/build-test-llvm-project@main
       with:
         cmake_args: -G Ninja  -DLLVM_ENABLE_PROJECTS="clang" -DCMAKE_BUILD_TYPE=Release
         build_target: ""
diff --git a/.github/workflows/lld-tests.yml b/.github/workflows/lld-tests.yml
index 9b4cbe95f231..bdf0c2fcd886 100644
--- a/.github/workflows/lld-tests.yml
+++ b/.github/workflows/lld-tests.yml
@@ -28,16 +28,16 @@ jobs:
     steps:
     - name: Setup Windows
       if: startsWith(matrix.os, 'windows')
-      uses: llvm/actions/setup-windows@master
+      uses: llvm/actions/setup-windows@main
       with:
         arch: amd64
     - name: Install Ninja
-      uses: llvm/actions/install-ninja@master
+      uses: llvm/actions/install-ninja@main
     - uses: actions/checkout@v1
       with:
         fetch-depth: 1
     - name: Test lld
-      uses: llvm/actions/build-test-llvm-project@master
+      uses: llvm/actions/build-test-llvm-project@main
       with:
         cmake_args: -G Ninja  -DLLVM_ENABLE_PROJECTS="lld" -DCMAKE_BUILD_TYPE=Release
         build_target: check-lld
diff --git a/.github/workflows/lldb-tests.yml b/.github/workflows/lldb-tests.yml
index 229e6deece6e..93fddc2de8c6 100644
--- a/.github/workflows/lldb-tests.yml
+++ b/.github/workflows/lldb-tests.yml
@@ -31,16 +31,16 @@ jobs:
     steps:
     - name: Setup Windows
       if: startsWith(matrix.os, 'windows')
-      uses: llvm/actions/setup-windows@master
+      uses: llvm/actions/setup-windows@main
       with:
         arch: amd64
     - name: Install Ninja
-      uses: llvm/actions/install-ninja@master
+      uses: llvm/actions/install-ninja@main
     - uses: actions/checkout@v1
       with:
         fetch-depth: 1
     - name: Build lldb
-      uses: llvm/actions/build-test-llvm-project@master
+      uses: llvm/actions/build-test-llvm-project@main
       with:
         # Mac OS requries that libcxx is enabled for lldb tests, so we need  to disable them.
         cmake_args: -G Ninja  -DLLVM_ENABLE_PROJECTS="clang;lldb" -DCMAKE_BUILD_TYPE=Release -DLLDB_INCLUDE_TESTS=OFF
diff --git a/.github/workflows/llvm-tests.yml b/.github/workflows/llvm-tests.yml
index 67f318ad849f..675383407d64 100644
--- a/.github/workflows/llvm-tests.yml
+++ b/.github/workflows/llvm-tests.yml
@@ -29,16 +29,16 @@ jobs:
     steps:
     - name: Setup Windows
       if: startsWith(matrix.os, 'windows')
-      uses: llvm/actions/setup-windows@master
+      uses: llvm/actions/setup-windows@main
       with:
         arch: amd64
     - name: Install Ninja
-      uses: llvm/actions/install-ninja@master
+      uses: llvm/actions/install-ninja@main
     - uses: actions/checkout@v1
       with:
         fetch-depth: 1
     - name: Test llvm
-      uses: llvm/actions/build-test-llvm-project@master
+      uses: llvm/actions/build-test-llvm-project@main
       with:
         cmake_args: -G Ninja -DCMAKE_BUILD_TYPE=Release
 
@@ -60,7 +60,7 @@ jobs:
             repo: ${{ github.repository }}
     steps:
     - name: Install Ninja
-      uses: llvm/actions/install-ninja@master
+      uses: llvm/actions/install-ninja@main
     - name: Install abi-compliance-checker
       run: |
         sudo apt-get install abi-dumper autoconf pkg-config
@@ -72,7 +72,7 @@ jobs:
         ./configure
         sudo make install
     - name: Download source code
-      uses: llvm/actions/get-llvm-project-src@master
+      uses: llvm/actions/get-llvm-project-src@main
       with:
         ref: ${{ matrix.ref }}
         repo: ${{ matrix.repo }}
diff --git a/.github/workflows/main-branch-sync.yml b/.github/workflows/main-branch-sync.yml
deleted file mode 100644
index 5ea360e281d6..000000000000
--- a/.github/workflows/main-branch-sync.yml
+++ /dev/null
@@ -1,25 +0,0 @@
-name: main branch sync
-
-on:
-  push:
-    branches:
-      - 'main'
-
-jobs:
-  branch_sync:
-    runs-on: ubuntu-latest
-    steps:
-      - name: Checkout Code
-        uses: actions/checkout@v2
-        with:
-          # persist-credentials: false allows us to use our own credentials for
-          # pushing to the repository.  Otherwise, the default github actions token
-          # is used.
-          persist-credentials: false
-          fetch-depth: 0
-
-      - name: Update branch
-        env:
-          LLVMBOT_TOKEN: ${{ secrets.LLVMBOT_MAIN_SYNC }}
-        run: |
-          git push https://$LLVMBOT_TOKEN@github.com/${{ github.repository }} HEAD:master

From 0a32d93bd95b7ad0a4c7f91955c6c815150df84c Mon Sep 17 00:00:00 2001
From: Marek Kurdej <marek.kurdej@gmail.com>
Date: Wed, 27 Jan 2021 09:14:22 +0100
Subject: [PATCH 013/318] [clang-format] Avoid considering include directive as
 a template closer.

This fixes a bug [[ http://llvm.org/PR48891 | PR48891 ]] introduced in D93839 where:
```
#include <stdint.h>
namespace rep {}
```
got formatted as
```
#include <stdint.h>
namespace rep {
}
```

Reviewed By: MyDeveloperDay, leonardchan

Differential Revision: https://reviews.llvm.org/D95479

(cherry picked from commit e3713f156b8cb65a2b74f150afb824ce1e2a2fab)
---
 clang/lib/Format/UnwrappedLineFormatter.cpp |  2 +-
 clang/unittests/Format/FormatTest.cpp       | 15 +++++++++++++++
 2 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/clang/lib/Format/UnwrappedLineFormatter.cpp b/clang/lib/Format/UnwrappedLineFormatter.cpp
index d1138bbc9c36..5dd0ccdfa6fd 100644
--- a/clang/lib/Format/UnwrappedLineFormatter.cpp
+++ b/clang/lib/Format/UnwrappedLineFormatter.cpp
@@ -371,7 +371,7 @@ class LineJoiner {
         if (Previous->is(tok::comment))
           Previous = Previous->getPreviousNonComment();
         if (Previous) {
-          if (Previous->is(tok::greater))
+          if (Previous->is(tok::greater) && !I[-1]->InPPDirective)
             return 0;
           if (Previous->is(tok::identifier)) {
             const FormatToken *PreviousPrevious =
diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp
index 855cf0242fe9..c1f88b9ae17a 100644
--- a/clang/unittests/Format/FormatTest.cpp
+++ b/clang/unittests/Format/FormatTest.cpp
@@ -10248,6 +10248,21 @@ TEST_F(FormatTest, SplitEmptyClass) {
                "{\n"
                "};",
                Style);
+
+  verifyFormat("#include \"stdint.h\"\n"
+               "namespace rep {}",
+               Style);
+  verifyFormat("#include <stdint.h>\n"
+               "namespace rep {}",
+               Style);
+  verifyFormat("#include <stdint.h>\n"
+               "namespace rep {}",
+               "#include <stdint.h>\n"
+               "namespace rep {\n"
+               "\n"
+               "\n"
+               "}",
+               Style);
 }
 
 TEST_F(FormatTest, SplitEmptyStruct) {

From 8c5d184ef714dcf435784e21e66b4b5e25b2dffb Mon Sep 17 00:00:00 2001
From: Nico Weber <thakis@chromium.org>
Date: Wed, 27 Jan 2021 16:51:27 -0500
Subject: [PATCH 014/318] clang: Fix static_assert in a few contexts in
 microsoft mode

Follow-up to D17444. Fixes PR48904. See bug for details.

Differential Revision: https://reviews.llvm.org/D95559

(cherry picked from commit 764a7a2155c6747ec8d0b38d8edbb65960eae874)
---
 clang/lib/Parse/ParseDecl.cpp   |  3 ++-
 clang/test/Sema/static-assert.c | 11 +++++++++++
 2 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/clang/lib/Parse/ParseDecl.cpp b/clang/lib/Parse/ParseDecl.cpp
index 571164139630..347d992b1643 100644
--- a/clang/lib/Parse/ParseDecl.cpp
+++ b/clang/lib/Parse/ParseDecl.cpp
@@ -4216,7 +4216,7 @@ void Parser::ParseStructUnionBody(SourceLocation RecordLoc,
     }
 
     // Parse _Static_assert declaration.
-    if (Tok.is(tok::kw__Static_assert)) {
+    if (Tok.isOneOf(tok::kw__Static_assert, tok::kw_static_assert)) {
       SourceLocation DeclEnd;
       ParseStaticAssertDeclaration(DeclEnd);
       continue;
@@ -5180,6 +5180,7 @@ bool Parser::isDeclarationSpecifier(bool DisambiguatingWithExpression) {
   case tok::kw_friend:
 
     // static_assert-declaration
+  case tok::kw_static_assert:
   case tok::kw__Static_assert:
 
     // GNU typeof support.
diff --git a/clang/test/Sema/static-assert.c b/clang/test/Sema/static-assert.c
index f08e557fc8ea..9105f2366985 100644
--- a/clang/test/Sema/static-assert.c
+++ b/clang/test/Sema/static-assert.c
@@ -1,4 +1,5 @@
 // RUN: %clang_cc1 -std=c11 -fsyntax-only -verify %s
+// RUN: %clang_cc1 -fms-compatibility -DMS -fsyntax-only -verify %s
 // RUN: %clang_cc1 -std=c99 -pedantic -fsyntax-only -verify=expected,ext %s
 // RUN: %clang_cc1 -xc++ -std=c++11 -pedantic -fsyntax-only -verify=expected,ext,cxx %s
 
@@ -11,10 +12,17 @@ _Static_assert(1, "1 is nonzero"); // ext-warning {{'_Static_assert' is a C11 ex
 _Static_assert(0, "0 is nonzero"); // expected-error {{static_assert failed "0 is nonzero"}} \
                                    // ext-warning {{'_Static_assert' is a C11 extension}}
 
+#ifdef MS
+static_assert(1, "1 is nonzero");
+#endif
+
 void foo(void) {
   _Static_assert(1, "1 is nonzero"); // ext-warning {{'_Static_assert' is a C11 extension}}
   _Static_assert(0, "0 is nonzero"); // expected-error {{static_assert failed "0 is nonzero"}} \
                                      // ext-warning {{'_Static_assert' is a C11 extension}}
+#ifdef MS
+  static_assert(1, "1 is nonzero");
+#endif
 }
 
 _Static_assert(1, invalid); // expected-error {{expected string literal for diagnostic message in static_assert}} \
@@ -25,6 +33,9 @@ struct A {
   _Static_assert(1, "1 is nonzero"); // ext-warning {{'_Static_assert' is a C11 extension}}
   _Static_assert(0, "0 is nonzero"); // expected-error {{static_assert failed "0 is nonzero"}} \
                                      // ext-warning {{'_Static_assert' is a C11 extension}}
+#ifdef MS
+  static_assert(1, "1 is nonzero");
+#endif
 };
 
 #ifdef __cplusplus

From 1edbbf9d20d9f859f7ff2a146a501aeb1423141e Mon Sep 17 00:00:00 2001
From: Sam McCall <sam.mccall@gmail.com>
Date: Wed, 20 Jan 2021 12:38:32 +0100
Subject: [PATCH 015/318] [clangd] Log warning when using legacy (theia)
 semantic highlighting.

The legacy protocol will be removed on trunk after the 12 branch cut,
and gone in clangd 13.

Differential Revision: https://reviews.llvm.org/D95031

(cherry picked from commit 29472bb76915c4929aecc938300f6df31f63ac29)
---
 clang-tools-extra/clangd/ClangdLSPServer.cpp | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/clang-tools-extra/clangd/ClangdLSPServer.cpp b/clang-tools-extra/clangd/ClangdLSPServer.cpp
index dc89ebd59fe2..35aed2166f03 100644
--- a/clang-tools-extra/clangd/ClangdLSPServer.cpp
+++ b/clang-tools-extra/clangd/ClangdLSPServer.cpp
@@ -510,6 +510,11 @@ void ClangdLSPServer::onInitialize(const InitializeParams &Params,
         "semanticTokens request, choosing the latter (no notifications).");
     Opts.TheiaSemanticHighlighting = false;
   }
+  if (Opts.TheiaSemanticHighlighting) {
+    log("Using legacy semanticHighlights notification, which will be removed "
+        "in clangd 13. Clients should use the standard semanticTokens "
+        "request instead.");
+  }
 
   if (Params.rootUri && *Params.rootUri)
     Opts.WorkspaceRoot = std::string(Params.rootUri->file());

From 61e05d1bc1af737c5f24fd5cd765f1a9914cbd13 Mon Sep 17 00:00:00 2001
From: Sam McCall <sam.mccall@gmail.com>
Date: Mon, 25 Jan 2021 16:16:22 +0100
Subject: [PATCH 016/318] [clangd] Parse Diagnostics block, and nest ClangTidy
 block under it.

(ClangTidy configuration block hasn't been in any release, so we should be OK
to move it around like this)

Differential Revision: https://reviews.llvm.org/D95362

(cherry picked from commit c3df9d58c75e0f89ca95e947804d65e79a491adc)
---
 clang-tools-extra/clangd/Config.h             | 15 +++---
 clang-tools-extra/clangd/ConfigCompile.cpp    | 14 +++---
 clang-tools-extra/clangd/ConfigFragment.h     | 47 +++++++++----------
 clang-tools-extra/clangd/ConfigYAML.cpp       | 14 +++++-
 clang-tools-extra/clangd/TidyProvider.cpp     |  2 +-
 .../clangd/unittests/ConfigCompileTests.cpp   | 32 +++++++------
 .../clangd/unittests/ConfigYAMLTests.cpp      | 11 +++--
 7 files changed, 75 insertions(+), 60 deletions(-)

diff --git a/clang-tools-extra/clangd/Config.h b/clang-tools-extra/clangd/Config.h
index 44ca283b6a0e..391632cb086a 100644
--- a/clang-tools-extra/clangd/Config.h
+++ b/clang-tools-extra/clangd/Config.h
@@ -90,6 +90,13 @@ struct Config {
   struct {
     bool SuppressAll = false;
     llvm::StringSet<> Suppress;
+
+    /// Configures what clang-tidy checks to run and options to use with them.
+    struct {
+      // A comma-seperated list of globs specify which clang-tidy checks to run.
+      std::string Checks;
+      llvm::StringMap<std::string> CheckOptions;
+    } ClangTidy;
   } Diagnostics;
 
   /// Style of the codebase.
@@ -99,14 +106,6 @@ struct Config {
     // ::). All nested namespaces are affected as well.
     std::vector<std::string> FullyQualifiedNamespaces;
   } Style;
-
-  /// Configures what clang-tidy checks to run and options to use with them.
-  struct {
-    // A comma-seperated list of globs to specify which clang-tidy checks to
-    // run.
-    std::string Checks;
-    llvm::StringMap<std::string> CheckOptions;
-  } ClangTidy;
 };
 
 } // namespace clangd
diff --git a/clang-tools-extra/clangd/ConfigCompile.cpp b/clang-tools-extra/clangd/ConfigCompile.cpp
index e82c6e159421..8682cae36f26 100644
--- a/clang-tools-extra/clangd/ConfigCompile.cpp
+++ b/clang-tools-extra/clangd/ConfigCompile.cpp
@@ -189,7 +189,6 @@ struct FragmentCompiler {
     compile(std::move(F.CompileFlags));
     compile(std::move(F.Index));
     compile(std::move(F.Diagnostics));
-    compile(std::move(F.ClangTidy));
   }
 
   void compile(Fragment::IfBlock &&F) {
@@ -379,6 +378,8 @@ struct FragmentCompiler {
         for (llvm::StringRef N : Normalized)
           C.Diagnostics.Suppress.insert(N);
       });
+
+    compile(std::move(F.ClangTidy));
   }
 
   void compile(Fragment::StyleBlock &&F) {
@@ -422,7 +423,7 @@ struct FragmentCompiler {
     CurSpec += Str;
   }
 
-  void compile(Fragment::ClangTidyBlock &&F) {
+  void compile(Fragment::DiagnosticsBlock::ClangTidyBlock &&F) {
     std::string Checks;
     for (auto &CheckGlob : F.Add)
       appendTidyCheckSpec(Checks, CheckGlob, true);
@@ -433,8 +434,9 @@ struct FragmentCompiler {
     if (!Checks.empty())
       Out.Apply.push_back(
           [Checks = std::move(Checks)](const Params &, Config &C) {
-            C.ClangTidy.Checks.append(
-                Checks, C.ClangTidy.Checks.empty() ? /*skip comma*/ 1 : 0,
+            C.Diagnostics.ClangTidy.Checks.append(
+                Checks,
+                C.Diagnostics.ClangTidy.Checks.empty() ? /*skip comma*/ 1 : 0,
                 std::string::npos);
           });
     if (!F.CheckOptions.empty()) {
@@ -445,8 +447,8 @@ struct FragmentCompiler {
       Out.Apply.push_back(
           [CheckOptions = std::move(CheckOptions)](const Params &, Config &C) {
             for (auto &StringPair : CheckOptions)
-              C.ClangTidy.CheckOptions.insert_or_assign(StringPair.first,
-                                                        StringPair.second);
+              C.Diagnostics.ClangTidy.CheckOptions.insert_or_assign(
+                  StringPair.first, StringPair.second);
           });
     }
   }
diff --git a/clang-tools-extra/clangd/ConfigFragment.h b/clang-tools-extra/clangd/ConfigFragment.h
index 5b67c49fe154..c36b07f5e8e2 100644
--- a/clang-tools-extra/clangd/ConfigFragment.h
+++ b/clang-tools-extra/clangd/ConfigFragment.h
@@ -203,6 +203,29 @@ struct Fragment {
     /// (e.g. by disabling a clang-tidy check, or the -Wunused compile flag).
     /// This often has other advantages, such as skipping some analysis.
     std::vector<Located<std::string>> Suppress;
+
+    /// Controls how clang-tidy will run over the code base.
+    ///
+    /// The settings are merged with any settings found in .clang-tidy
+    /// configiration files with these ones taking precedence.
+    struct ClangTidyBlock {
+      std::vector<Located<std::string>> Add;
+      /// List of checks to disable.
+      /// Takes precedence over Add. To enable all llvm checks except include
+      /// order:
+      ///   Add: llvm-*
+      ///   Remove: llvm-include-onder
+      std::vector<Located<std::string>> Remove;
+
+      /// A Key-Value pair list of options to pass to clang-tidy checks
+      /// These take precedence over options specified in clang-tidy
+      /// configuration files. Example:
+      ///   CheckOptions:
+      ///     readability-braces-around-statements.ShortStatementLines: 2
+      std::vector<std::pair<Located<std::string>, Located<std::string>>>
+          CheckOptions;
+    };
+    ClangTidyBlock ClangTidy;
   };
   DiagnosticsBlock Diagnostics;
 
@@ -215,30 +238,6 @@ struct Fragment {
     std::vector<Located<std::string>> FullyQualifiedNamespaces;
   };
   StyleBlock Style;
-
-  /// Controls how clang-tidy will run over the code base.
-  ///
-  /// The settings are merged with any settings found in .clang-tidy
-  /// configiration files with these ones taking precedence.
-  // FIXME: move this to Diagnostics.Tidy.
-  struct ClangTidyBlock {
-    std::vector<Located<std::string>> Add;
-    /// List of checks to disable.
-    /// Takes precedence over Add. To enable all llvm checks except include
-    /// order:
-    ///   Add: llvm-*
-    ///   Remove: llvm-include-onder
-    std::vector<Located<std::string>> Remove;
-
-    /// A Key-Value pair list of options to pass to clang-tidy checks
-    /// These take precedence over options specified in clang-tidy configuration
-    /// files. Example:
-    ///   CheckOptions:
-    ///     readability-braces-around-statements.ShortStatementLines: 2
-    std::vector<std::pair<Located<std::string>, Located<std::string>>>
-        CheckOptions;
-  };
-  ClangTidyBlock ClangTidy;
 };
 
 } // namespace config
diff --git a/clang-tools-extra/clangd/ConfigYAML.cpp b/clang-tools-extra/clangd/ConfigYAML.cpp
index 7aaff5565497..348ee9dd1f75 100644
--- a/clang-tools-extra/clangd/ConfigYAML.cpp
+++ b/clang-tools-extra/clangd/ConfigYAML.cpp
@@ -62,7 +62,7 @@ class Parser {
     Dict.handle("CompileFlags", [&](Node &N) { parse(F.CompileFlags, N); });
     Dict.handle("Index", [&](Node &N) { parse(F.Index, N); });
     Dict.handle("Style", [&](Node &N) { parse(F.Style, N); });
-    Dict.handle("ClangTidy", [&](Node &N) { parse(F.ClangTidy, N); });
+    Dict.handle("Diagnostics", [&](Node &N) { parse(F.Diagnostics, N); });
     Dict.parse(N);
     return !(N.failed() || HadError);
   }
@@ -110,7 +110,17 @@ class Parser {
     Dict.parse(N);
   }
 
-  void parse(Fragment::ClangTidyBlock &F, Node &N) {
+  void parse(Fragment::DiagnosticsBlock &F, Node &N) {
+    DictParser Dict("Diagnostics", this);
+    Dict.handle("Suppress", [&](Node &N) {
+      if (auto Values = scalarValues(N))
+        F.Suppress = std::move(*Values);
+    });
+    Dict.handle("ClangTidy", [&](Node &N) { parse(F.ClangTidy, N); });
+    Dict.parse(N);
+  }
+
+  void parse(Fragment::DiagnosticsBlock::ClangTidyBlock &F, Node &N) {
     DictParser Dict("ClangTidy", this);
     Dict.handle("Add", [&](Node &N) {
       if (auto Values = scalarValues(N))
diff --git a/clang-tools-extra/clangd/TidyProvider.cpp b/clang-tools-extra/clangd/TidyProvider.cpp
index 0a9f12221287..c26c59fd347d 100644
--- a/clang-tools-extra/clangd/TidyProvider.cpp
+++ b/clang-tools-extra/clangd/TidyProvider.cpp
@@ -255,7 +255,7 @@ TidyProvider disableUnusableChecks(llvm::ArrayRef<std::string> ExtraBadChecks) {
 
 TidyProviderRef provideClangdConfig() {
   return [](tidy::ClangTidyOptions &Opts, llvm::StringRef) {
-    const auto &CurTidyConfig = Config::current().ClangTidy;
+    const auto &CurTidyConfig = Config::current().Diagnostics.ClangTidy;
     if (!CurTidyConfig.Checks.empty())
       mergeCheckList(Opts.Checks, CurTidyConfig.Checks);
 
diff --git a/clang-tools-extra/clangd/unittests/ConfigCompileTests.cpp b/clang-tools-extra/clangd/unittests/ConfigCompileTests.cpp
index ef24b5d8417f..4b1da2035727 100644
--- a/clang-tools-extra/clangd/unittests/ConfigCompileTests.cpp
+++ b/clang-tools-extra/clangd/unittests/ConfigCompileTests.cpp
@@ -259,32 +259,36 @@ TEST_F(ConfigCompileTests, DiagnosticSuppression) {
 }
 
 TEST_F(ConfigCompileTests, Tidy) {
-  Frag.ClangTidy.Add.emplace_back("bugprone-use-after-move");
-  Frag.ClangTidy.Add.emplace_back("llvm-*");
-  Frag.ClangTidy.Remove.emplace_back("llvm-include-order");
-  Frag.ClangTidy.Remove.emplace_back("readability-*");
-  Frag.ClangTidy.CheckOptions.emplace_back(
+  auto &Tidy = Frag.Diagnostics.ClangTidy;
+  Tidy.Add.emplace_back("bugprone-use-after-move");
+  Tidy.Add.emplace_back("llvm-*");
+  Tidy.Remove.emplace_back("llvm-include-order");
+  Tidy.Remove.emplace_back("readability-*");
+  Tidy.CheckOptions.emplace_back(
       std::make_pair(std::string("StrictMode"), std::string("true")));
-  Frag.ClangTidy.CheckOptions.emplace_back(std::make_pair(
+  Tidy.CheckOptions.emplace_back(std::make_pair(
       std::string("example-check.ExampleOption"), std::string("0")));
   EXPECT_TRUE(compileAndApply());
   EXPECT_EQ(
-      Conf.ClangTidy.Checks,
+      Conf.Diagnostics.ClangTidy.Checks,
       "bugprone-use-after-move,llvm-*,-llvm-include-order,-readability-*");
-  EXPECT_EQ(Conf.ClangTidy.CheckOptions.size(), 2U);
-  EXPECT_EQ(Conf.ClangTidy.CheckOptions.lookup("StrictMode"), "true");
-  EXPECT_EQ(Conf.ClangTidy.CheckOptions.lookup("example-check.ExampleOption"),
+  EXPECT_EQ(Conf.Diagnostics.ClangTidy.CheckOptions.size(), 2U);
+  EXPECT_EQ(Conf.Diagnostics.ClangTidy.CheckOptions.lookup("StrictMode"),
+            "true");
+  EXPECT_EQ(Conf.Diagnostics.ClangTidy.CheckOptions.lookup(
+                "example-check.ExampleOption"),
             "0");
   EXPECT_THAT(Diags.Diagnostics, IsEmpty());
 }
 
 TEST_F(ConfigCompileTests, TidyBadChecks) {
-  Frag.ClangTidy.Add.emplace_back("unknown-check");
-  Frag.ClangTidy.Remove.emplace_back("*");
-  Frag.ClangTidy.Remove.emplace_back("llvm-includeorder");
+  auto &Tidy = Frag.Diagnostics.ClangTidy;
+  Tidy.Add.emplace_back("unknown-check");
+  Tidy.Remove.emplace_back("*");
+  Tidy.Remove.emplace_back("llvm-includeorder");
   EXPECT_TRUE(compileAndApply());
   // Ensure bad checks are stripped from the glob.
-  EXPECT_EQ(Conf.ClangTidy.Checks, "-*");
+  EXPECT_EQ(Conf.Diagnostics.ClangTidy.Checks, "-*");
   EXPECT_THAT(
       Diags.Diagnostics,
       ElementsAre(
diff --git a/clang-tools-extra/clangd/unittests/ConfigYAMLTests.cpp b/clang-tools-extra/clangd/unittests/ConfigYAMLTests.cpp
index 25d468ba604a..e1c81344de20 100644
--- a/clang-tools-extra/clangd/unittests/ConfigYAMLTests.cpp
+++ b/clang-tools-extra/clangd/unittests/ConfigYAMLTests.cpp
@@ -60,10 +60,11 @@ CompileFlags: { Add: [foo, bar] }
 Index:
   Background: Skip
 ---
-ClangTidy: 
-  CheckOptions: 
-    IgnoreMacros: true
-    example-check.ExampleOption: 0
+Diagnostics:
+  ClangTidy:
+    CheckOptions:
+      IgnoreMacros: true
+      example-check.ExampleOption: 0
   )yaml";
   auto Results = Fragment::parseYAML(YAML, "config.yaml", Diags.callback());
   EXPECT_THAT(Diags.Diagnostics, IsEmpty());
@@ -77,7 +78,7 @@ CompileFlags: { Add: [foo, bar] }
 
   ASSERT_TRUE(Results[2].Index.Background);
   EXPECT_EQ("Skip", *Results[2].Index.Background.getValue());
-  EXPECT_THAT(Results[3].ClangTidy.CheckOptions,
+  EXPECT_THAT(Results[3].Diagnostics.ClangTidy.CheckOptions,
               ElementsAre(PairVal("IgnoreMacros", "true"),
                           PairVal("example-check.ExampleOption", "0")));
 }

From 074ad6de6fae20ff7ff720f79df1d6c1a7845157 Mon Sep 17 00:00:00 2001
From: AndreyChurbanov <andrey.churbanov@intel.com>
Date: Fri, 29 Jan 2021 13:16:41 +0300
Subject: [PATCH 017/318] [OpenMP] libomp: fix build by cl with vs2019

Replace VLA with dynamic allocation using alloca().
This fixes https://bugs.llvm.org/show_bug.cgi?id=48919.

Differential Revision: https://reviews.llvm.org/D95627

(cherry picked from commit 7f5ad0e07162e0c19e569986ee37a17c147c9a27)
---
 openmp/runtime/src/kmp_settings.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/openmp/runtime/src/kmp_settings.cpp b/openmp/runtime/src/kmp_settings.cpp
index a8522130f972..b477edbbfb42 100644
--- a/openmp/runtime/src/kmp_settings.cpp
+++ b/openmp/runtime/src/kmp_settings.cpp
@@ -3355,7 +3355,8 @@ static void __kmp_stg_parse_allocator(char const *name, char const *value,
         ntraits++;
     }
   }
-  omp_alloctrait_t traits[ntraits];
+  omp_alloctrait_t *traits =
+      (omp_alloctrait_t *)KMP_ALLOCA(ntraits * sizeof(omp_alloctrait_t));
 
 // Helper macros
 #define IS_POWER_OF_TWO(n) (((n) & ((n)-1)) == 0)

From 99f43f598907a9cc1a613c691ffbce7c8bd4ec75 Mon Sep 17 00:00:00 2001
From: Hans Wennborg <hans@chromium.org>
Date: Thu, 28 Jan 2021 14:37:33 +0100
Subject: [PATCH 018/318] Relax test expectations in
 debug-info-gline-tables-only-codeview.cpp

To make it pass also on 32-bit Windows, see PR48920.

(cherry picked from commit 0024efc69ea6cd0b630cd11cef5991b7edb73ffc)
---
 clang/test/CodeGenCXX/debug-info-gline-tables-only-codeview.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clang/test/CodeGenCXX/debug-info-gline-tables-only-codeview.cpp b/clang/test/CodeGenCXX/debug-info-gline-tables-only-codeview.cpp
index 27ac682c10f5..409b62da62c1 100644
--- a/clang/test/CodeGenCXX/debug-info-gline-tables-only-codeview.cpp
+++ b/clang/test/CodeGenCXX/debug-info-gline-tables-only-codeview.cpp
@@ -25,6 +25,6 @@ void test() {
   // CHECK: ![[C]] = !DICompositeType(tag: DW_TAG_structure_type, name: "C",
   // CHECK-SAME:                      flags: DIFlagFwdDecl
   // CHECK-NOT: identifier
-  // CHECK: ![[MTYPE]] = !DISubroutineType(types: !{{.*}})
+  // CHECK: ![[MTYPE]] = !DISubroutineType({{.*}}types: !{{.*}})
   c.m();
 }

From c5a1eb9b0a76eef7e3025b7333a0d256b8562360 Mon Sep 17 00:00:00 2001
From: Piotr Sobczak <Piotr.Sobczak@amd.com>
Date: Wed, 27 Jan 2021 16:02:49 +0100
Subject: [PATCH 019/318] [AMDGPU] Avoid an illegal operand in
 si-shrink-instructions

Before the patch it was possible to trigger a constant bus
violation when folding immediates into a shrunk instruction.

The patch adds a check to enforce the legality of the new operand.

Differential Revision: https://reviews.llvm.org/D95527

(cherry picked from commit fc8e7411218c846386650cfba111b62827c71da0)
---
 .../Target/AMDGPU/SIShrinkInstructions.cpp    | 24 ++++++++++---------
 .../shrink-instructions-illegal-fold.mir      | 23 ++++++++++++++++++
 2 files changed, 36 insertions(+), 11 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/shrink-instructions-illegal-fold.mir

diff --git a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
index 2628070f219c..cdb78aae1c4f 100644
--- a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
+++ b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -75,17 +75,19 @@ static bool foldImmediates(MachineInstr &MI, const SIInstrInfo *TII,
         MachineOperand &MovSrc = Def->getOperand(1);
         bool ConstantFolded = false;
 
-        if (MovSrc.isImm() && (isInt<32>(MovSrc.getImm()) ||
-                               isUInt<32>(MovSrc.getImm()))) {
-          Src0.ChangeToImmediate(MovSrc.getImm());
-          ConstantFolded = true;
-        } else if (MovSrc.isFI()) {
-          Src0.ChangeToFrameIndex(MovSrc.getIndex());
-          ConstantFolded = true;
-        } else if (MovSrc.isGlobal()) {
-          Src0.ChangeToGA(MovSrc.getGlobal(), MovSrc.getOffset(),
-                          MovSrc.getTargetFlags());
-          ConstantFolded = true;
+        if (TII->isOperandLegal(MI, Src0Idx, &MovSrc)) {
+          if (MovSrc.isImm() &&
+              (isInt<32>(MovSrc.getImm()) || isUInt<32>(MovSrc.getImm()))) {
+            Src0.ChangeToImmediate(MovSrc.getImm());
+            ConstantFolded = true;
+          } else if (MovSrc.isFI()) {
+            Src0.ChangeToFrameIndex(MovSrc.getIndex());
+            ConstantFolded = true;
+          } else if (MovSrc.isGlobal()) {
+            Src0.ChangeToGA(MovSrc.getGlobal(), MovSrc.getOffset(),
+                            MovSrc.getTargetFlags());
+            ConstantFolded = true;
+          }
         }
 
         if (ConstantFolded) {
diff --git a/llvm/test/CodeGen/AMDGPU/shrink-instructions-illegal-fold.mir b/llvm/test/CodeGen/AMDGPU/shrink-instructions-illegal-fold.mir
new file mode 100644
index 000000000000..7889f437facf
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/shrink-instructions-illegal-fold.mir
@@ -0,0 +1,23 @@
+# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=si-shrink-instructions --verify-machineinstrs %s -o - | FileCheck %s
+
+# Make sure immediate folding into V_CNDMASK respects constant bus restrictions.
+---
+
+name:            shrink_cndmask_illegal_imm_folding
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0, $vgpr1
+    ; CHECK-LABEL: name: shrink_cndmask_illegal_imm_folding
+    ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; CHECK: [[MOV:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 32768, implicit $exec
+    ; CHECK: V_CMP_EQ_U32_e32 0, [[COPY]], implicit-def $vcc, implicit $exec
+    ; CHECK: V_CNDMASK_B32_e32 [[MOV]], killed [[COPY]], implicit $vcc, implicit $exec
+
+    %0:vgpr_32 = COPY $vgpr0
+    %1:vgpr_32 = V_MOV_B32_e32 32768, implicit $exec
+    V_CMP_EQ_U32_e32 0, %0:vgpr_32, implicit-def $vcc, implicit $exec
+    %2:vgpr_32 = V_CNDMASK_B32_e64 0, %1:vgpr_32, 0, killed %0:vgpr_32, $vcc, implicit $exec
+    S_NOP 0
+
+...

From b2710e7535bd43d9fd6f9792644fe2c207079c42 Mon Sep 17 00:00:00 2001
From: Dimitry Andric <dimitry@andric.com>
Date: Thu, 28 Jan 2021 23:53:45 +0100
Subject: [PATCH 020/318] [sanitizer] Fix msan test build on FreeBSD after
 7afdc89c2054

This commit accidentally enabled fgetgrent_r() in the msan tests under
FreeBSD, but this function is not supported. Also remove FreeBSD from
the SANITIZER_INTERCEPT_FGETGRENT_R macro.

(cherry picked from commit e056fc6cb676f72d5b7dfe7ca540b3275bd1a46f)
---
 compiler-rt/lib/msan/tests/msan_test.cpp                        | 2 ++
 .../lib/sanitizer_common/sanitizer_platform_interceptors.h      | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/compiler-rt/lib/msan/tests/msan_test.cpp b/compiler-rt/lib/msan/tests/msan_test.cpp
index 7378b237a711..5dc9090f36c0 100644
--- a/compiler-rt/lib/msan/tests/msan_test.cpp
+++ b/compiler-rt/lib/msan/tests/msan_test.cpp
@@ -3707,7 +3707,9 @@ TEST(MemorySanitizer, getgrent_r) {
   EXPECT_NOT_POISONED(grp.gr_gid);
   EXPECT_NOT_POISONED(grpres);
 }
+#endif
 
+#ifdef __GLIBC__
 TEST(MemorySanitizer, fgetgrent_r) {
   FILE *fp = fopen("/etc/group", "r");
   struct group grp;
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_platform_interceptors.h b/compiler-rt/lib/sanitizer_common/sanitizer_platform_interceptors.h
index 7f7b38d4215b..068fc9829e57 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_platform_interceptors.h
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_platform_interceptors.h
@@ -226,7 +226,7 @@
   (SI_FREEBSD || SI_NETBSD || SI_MAC || SI_LINUX_NOT_ANDROID || SI_SOLARIS)
 #define SANITIZER_INTERCEPT_GETPWENT \
   (SI_FREEBSD || SI_NETBSD || SI_MAC || SI_LINUX_NOT_ANDROID || SI_SOLARIS)
-#define SANITIZER_INTERCEPT_FGETGRENT_R (SI_FREEBSD || SI_GLIBC || SI_SOLARIS)
+#define SANITIZER_INTERCEPT_FGETGRENT_R (SI_GLIBC || SI_SOLARIS)
 #define SANITIZER_INTERCEPT_FGETPWENT SI_LINUX_NOT_ANDROID || SI_SOLARIS
 #define SANITIZER_INTERCEPT_GETPWENT_R \
   (SI_FREEBSD || SI_NETBSD || SI_GLIBC || SI_SOLARIS)

From 4e20d9c03d9acc9ee5a78cbba82b08d51ecbaf3f Mon Sep 17 00:00:00 2001
From: Petr Hosek <phosek@google.com>
Date: Thu, 28 Jan 2021 19:01:41 -0800
Subject: [PATCH 021/318] Make the profile-filter.c test compatible with 32-bit
 systems

This addresses PR48930.

Differential Revision: https://reviews.llvm.org/D95658

(cherry picked from commit 0217f1c7a31ba44715bc083a60cddc2192ffed96)
---
 clang/test/CodeGen/profile-filter.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/clang/test/CodeGen/profile-filter.c b/clang/test/CodeGen/profile-filter.c
index 5415ff96cb14..dc5a31e872a1 100644
--- a/clang/test/CodeGen/profile-filter.c
+++ b/clang/test/CodeGen/profile-filter.c
@@ -28,11 +28,11 @@ unsigned i;
 // EXCLUDE: noprofile
 // EXCLUDE: @test1
 unsigned test1() {
-  // CHECK: %pgocount = load i64, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @__profc_test1, i64 0, i64 0), align 8
-  // FUNC: %pgocount = load i64, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @__profc_test1, i64 0, i64 0), align 8
-  // FILE: %pgocount = load i64, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @__profc_test1, i64 0, i64 0), align 8
-  // SECTION-NOT: %pgocount = load i64, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @__profc_test1, i64 0, i64 0), align 8
-  // EXCLUDE-NOT: %pgocount = load i64, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @__profc_test1, i64 0, i64 0), align 8
+  // CHECK: %pgocount = load i64, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @__profc_test1, i64 0, i64 0)
+  // FUNC: %pgocount = load i64, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @__profc_test1, i64 0, i64 0)
+  // FILE: %pgocount = load i64, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @__profc_test1, i64 0, i64 0)
+  // SECTION-NOT: %pgocount = load i64, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @__profc_test1, i64 0, i64 0)
+  // EXCLUDE-NOT: %pgocount = load i64, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @__profc_test1, i64 0, i64 0)
   return i + 1;
 }
 
@@ -47,10 +47,10 @@ unsigned test1() {
 // EXCLUDE-NOT: noprofile
 // EXCLUDE: @test2
 unsigned test2() {
-  // CHECK: %pgocount = load i64, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @__profc_test2, i64 0, i64 0), align 8
-  // FUNC-NOT: %pgocount = load i64, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @__profc_test2, i64 0, i64 0), align 8
-  // FILE: %pgocount = load i64, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @__profc_test2, i64 0, i64 0), align 8
-  // SECTION: %pgocount = load i64, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @__profc_test2, i64 0, i64 0), align 8
-  // EXCLUDE: %pgocount = load i64, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @__profc_test2, i64 0, i64 0), align 8
+  // CHECK: %pgocount = load i64, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @__profc_test2, i64 0, i64 0)
+  // FUNC-NOT: %pgocount = load i64, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @__profc_test2, i64 0, i64 0)
+  // FILE: %pgocount = load i64, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @__profc_test2, i64 0, i64 0)
+  // SECTION: %pgocount = load i64, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @__profc_test2, i64 0, i64 0)
+  // EXCLUDE: %pgocount = load i64, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @__profc_test2, i64 0, i64 0)
   return i - 1;
 }

From 07f8d437134c0b229104241a621db05013da0049 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micha=C5=82=20G=C3=B3rny?= <mgorny@moritz.systems>
Date: Fri, 29 Jan 2021 02:14:47 +0100
Subject: [PATCH 022/318] [clang-tidy] Fix linking tests to LLVMTestingSupport

LLVMTestingSupport is not part of libLLVM, and therefore can not
be linked to via LLVM_LINK_COMPONENTS.  Instead, it needs to be
specified explicitly to ensure that it is linked explicitly
even if LLVM_LINK_LLVM_DYLIB is used.  This is consistent with handling
in clangd.

Fixes PR#48931

Differential Revision: https://reviews.llvm.org/D95653

(cherry picked from commit 632545e8ce846ccaeca8df15a3dc5e36d01a1275)
---
 clang-tools-extra/unittests/clang-tidy/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clang-tools-extra/unittests/clang-tidy/CMakeLists.txt b/clang-tools-extra/unittests/clang-tidy/CMakeLists.txt
index be35b71d15cf..05d330dd8033 100644
--- a/clang-tools-extra/unittests/clang-tidy/CMakeLists.txt
+++ b/clang-tools-extra/unittests/clang-tidy/CMakeLists.txt
@@ -1,7 +1,6 @@
 set(LLVM_LINK_COMPONENTS
   FrontendOpenMP
   Support
-  TestingSupport
   )
 
 get_filename_component(CLANG_LINT_SOURCE_DIR
@@ -46,4 +45,5 @@ target_link_libraries(ClangTidyTests
   clangTidyObjCModule
   clangTidyReadabilityModule
   clangTidyUtils
+  LLVMTestingSupport
   )

From f54cf61ad8e1cc6592074ddd7ad07908623ead6b Mon Sep 17 00:00:00 2001
From: Shilei Tian <tianshilei1992@gmail.com>
Date: Wed, 27 Jan 2021 17:06:05 -0500
Subject: [PATCH 023/318] [OpenMP][NVPTX] Disable building NVPTX deviceRTL by
 default on a non-CUDA system

D95466 dropped CUDA to build NVPTX deviceRTL and enabled it by default.
However, the building requires some libraries that are not available on non-CUDA
system by default, which could break the compilation. This patch disabled the
build by default. It can be enabled with `LIBOMPTARGET_BUILD_NVPTX_BCLIB=ON`.

Reviewed By: kparzysz

Differential Revision: https://reviews.llvm.org/D95556

(cherry picked from commit fb12df4a8e33d759938057718273dfb434b2d9c4)
---
 openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt b/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt
index 4661bf08af1c..23efbba29d66 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt
+++ b/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt
@@ -10,6 +10,15 @@
 #
 ##===----------------------------------------------------------------------===##
 
+# By default we will not build NVPTX deviceRTL on a non-CUDA
+set(LIBOMPTARGET_BUILD_NVPTX_BCLIB FALSE CACHE BOOL
+  "Whether build NVPTX deviceRTL on non-CUDA system.")
+
+if (NOT (LIBOMPTARGET_DEP_CUDA_FOUND OR LIBOMPTARGET_BUILD_NVPTX_BCLIB))
+  libomptarget_say("Not building NVPTX deviceRTL by default on non-CUDA system.")
+  return()
+endif()
+
 # Check if we can create an LLVM bitcode implementation of the runtime library
 # that could be inlined in the user application. For that we need to find
 # a Clang compiler capable of compiling our CUDA files to LLVM bitcode and

From 07dc51637cc419cbd61383eb4e26713a8f931806 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Sat, 30 Jan 2021 13:30:48 +0000
Subject: [PATCH 024/318] [LoopUnswitch] Properly update MSSA if header has
 non-clobbering stores.

This patch fixes updating MemorySSA if the header contains memory
defs that do not clobber a duplicated instruction. We need to find the
first defining access outside the loop body and use that as defining
access of the duplicated instruction.

This fixes a crash caused by bee486851c1a.

(Cherry-picked on the 12.x release branch from
10c57268c074c3ad48f76da38fa2ba575ee3d1f9)
---
 llvm/lib/Transforms/Scalar/LoopUnswitch.cpp   | 10 ++-
 .../partial-unswitch-update-memoryssa.ll      | 76 +++++++++++++++++++
 .../LoopUnswitch/partial-unswitch.ll          | 36 ---------
 3 files changed, 83 insertions(+), 39 deletions(-)
 create mode 100644 llvm/test/Transforms/LoopUnswitch/partial-unswitch-update-memoryssa.ll

diff --git a/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp b/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp
index 18717394d384..822a786fc7c7 100644
--- a/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp
@@ -1114,12 +1114,16 @@ void LoopUnswitch::emitPreheaderBranchOnCondition(
 
         Loop *L = LI->getLoopFor(I->getParent());
         auto *DefiningAccess = MemA->getDefiningAccess();
-        // If the defining access is a MemoryPhi in the header, get the incoming
-        // value for the pre-header as defining access.
-        if (DefiningAccess->getBlock() == I->getParent()) {
+        // Get the first defining access before the loop.
+        while (L->contains(DefiningAccess->getBlock())) {
+          // If the defining access is a MemoryPhi, get the incoming
+          // value for the pre-header as defining access.
           if (auto *MemPhi = dyn_cast<MemoryPhi>(DefiningAccess)) {
             DefiningAccess =
                 MemPhi->getIncomingValueForBlock(L->getLoopPreheader());
+          } else {
+            DefiningAccess =
+                cast<MemoryDef>(DefiningAccess)->getDefiningAccess();
           }
         }
         MSSAU->createMemoryAccessInBB(New, DefiningAccess, New->getParent(),
diff --git a/llvm/test/Transforms/LoopUnswitch/partial-unswitch-update-memoryssa.ll b/llvm/test/Transforms/LoopUnswitch/partial-unswitch-update-memoryssa.ll
new file mode 100644
index 000000000000..ec1e8eeeb070
--- /dev/null
+++ b/llvm/test/Transforms/LoopUnswitch/partial-unswitch-update-memoryssa.ll
@@ -0,0 +1,76 @@
+; RUN: opt -loop-unswitch -verify-dom-info -verify-memoryssa -S -enable-new-pm=0 %s | FileCheck %s
+; RUN: opt -loop-unswitch -memssa-check-limit=3 -verify-dom-info -verify-memoryssa -S -enable-new-pm=0 %s | FileCheck %s
+
+declare void @clobber()
+
+; Check that MemorySSA updating can deal with a clobbering access of a
+; duplicated load being a MemoryPHI outside the loop.
+define void @partial_unswitch_memssa_update(i32* noalias %ptr, i1 %c) {
+; CHECK-LABEL: @partial_unswitch_memssa_update(
+; CHECK-LABEL: loop.ph:
+; CHECK-NEXT:    [[LV:%[a-z0-9]+]] = load i32, i32* %ptr, align 4
+; CHECK-NEXT:    [[C:%[a-z0-9]+]] = icmp eq i32 [[LV]], 0
+; CHECK-NEXT:    br i1 [[C]]
+entry:
+  br i1 %c, label %loop.ph, label %outside.clobber
+
+outside.clobber:
+  call void @clobber()
+  br label %loop.ph
+
+loop.ph:
+  br label %loop.header
+
+loop.header:
+  %lv = load i32, i32* %ptr, align 4
+  %hc = icmp eq i32 %lv, 0
+  br i1 %hc, label %if, label %then
+
+if:
+  br label %loop.latch
+
+then:
+  br label %loop.latch
+
+loop.latch:
+  br i1 true, label %loop.header, label %exit
+
+exit:
+  ret void
+}
+
+; Check that MemorySSA updating can deal with skipping defining accesses in the
+; loop body until it finds the first defining access outside the loop.
+define void @partial_unswitch_inloop_stores_beteween_outside_defining_access(i64* noalias %ptr, i16* noalias %src) {
+; CHECK-LABEL: @partial_unswitch_inloop_stores_beteween_outside_defining_access
+; CHECK-LABEL: entry:
+; CHECK-NEXT:    store i64 0, i64* %ptr, align 1
+; CHECK-NEXT:    store i64 1, i64* %ptr, align 1
+; CHECK-NEXT:    [[LV:%[a-z0-9]+]] = load i16, i16* %src, align 1
+; CHECK-NEXT:    [[C:%[a-z0-9]+]] = icmp eq i16 [[LV]], 0
+; CHECK-NEXT:    br i1 [[C]]
+;
+entry:
+  store i64 0, i64* %ptr, align 1
+  store i64 1, i64* %ptr, align 1
+  br label %loop
+
+loop:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ]
+  store i64 2, i64* %ptr, align 1
+  %lv = load i16, i16* %src, align 1
+  %invar.cond = icmp eq i16 %lv, 0
+  br i1 %invar.cond, label %noclobber, label %loop.latch
+
+noclobber:
+  br label %loop.latch
+
+loop.latch:
+  %iv.next = add i32 %iv, 1
+  %ec = icmp eq i32 %iv, 1000
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}
+
diff --git a/llvm/test/Transforms/LoopUnswitch/partial-unswitch.ll b/llvm/test/Transforms/LoopUnswitch/partial-unswitch.ll
index 9f0e5d6f6c35..96a6b0f4e2b5 100644
--- a/llvm/test/Transforms/LoopUnswitch/partial-unswitch.ll
+++ b/llvm/test/Transforms/LoopUnswitch/partial-unswitch.ll
@@ -575,42 +575,6 @@ exit:
   ret i32 10
 }
 
-; Check that MemorySSA updating can deal with a clobbering access of a
-; duplicated load being a MemoryPHI outside the loop.
-define void @partial_unswitch_memssa_update(i32* noalias %ptr, i1 %c) {
-; CHECK-LABEL: @partial_unswitch_memssa_update(
-; CHECK-LABEL: loop.ph:
-; CHECK-NEXT:    [[LV:%[a-z0-9]+]] = load i32, i32* %ptr, align 4
-; CHECK-NEXT:    [[C:%[a-z0-9]+]] = icmp eq i32 [[LV]], 0
-; CHECK-NEXT:    br i1 [[C]]
-entry:
-  br i1 %c, label %loop.ph, label %outside.clobber
-
-outside.clobber:
-  call void @clobber()
-  br label %loop.ph
-
-loop.ph:
-  br label %loop.header
-
-loop.header:
-  %lv = load i32, i32* %ptr, align 4
-  %hc = icmp eq i32 %lv, 0
-  br i1 %hc, label %if, label %then
-
-if:
-  br label %loop.latch
-
-then:
-  br label %loop.latch
-
-loop.latch:
-  br i1 true, label %loop.header, label %exit
-
-exit:
-  ret void
-}
-
 ; Make sure the duplicated instructions are moved to a preheader that always
 ; executes when the loop body also executes. Do not check the unswitched code,
 ; because it is already checked in the @partial_unswitch_true_successor test

From c5fd87eaddaad87b28530e5272b7cf0c788dc1f9 Mon Sep 17 00:00:00 2001
From: Tom Stellard <tstellar@redhat.com>
Date: Wed, 27 Jan 2021 03:09:20 +0000
Subject: [PATCH 025/318] workflows: Fix LLVM ABI checks to work for X.0.0
 releases

---
 .github/workflows/llvm-tests.yml | 84 +++++++++++++++++++++++++++-----
 1 file changed, 72 insertions(+), 12 deletions(-)

diff --git a/.github/workflows/llvm-tests.yml b/.github/workflows/llvm-tests.yml
index 675383407d64..1cffc3ef4d97 100644
--- a/.github/workflows/llvm-tests.yml
+++ b/.github/workflows/llvm-tests.yml
@@ -1,8 +1,5 @@
 name: LLVM Tests
 
-env:
-  release_major: 12
-
 on:
   push:
     branches:
@@ -42,7 +39,38 @@ jobs:
       with:
         cmake_args: -G Ninja -DCMAKE_BUILD_TYPE=Release
 
+  abi-dump-setup:
+    runs-on: ubuntu-latest
+    outputs:
+      BASELINE_REF: ${{ steps.vars.outputs.BASELINE_REF }}
+      ABI_HEADERS: ${{ steps.vars.outputs.ABI_HEADERS }}
+      BASELINE_VERSION_MAJOR: ${{ steps.vars.outputs.BASELINE_VERSION_MAJOR }}
+      LLVM_VERSION_MAJOR: ${{ steps.version.outputs.LLVM_VERSION_MAJOR }}
+      LLVM_VERSION_MINOR: ${{ steps.version.outputs.LLVM_VERSION_MINOR }}
+      LLVM_VERSION_PATCH: ${{ steps.version.outputs.LLVM_VERSION_PATCH }}
+    steps:
+      - name: Checkout source
+        uses: actions/checkout@v1
+        with:
+          fetch-depth: 1
+
+      - name: Get LLVM version
+        id: version
+        uses: tstellar/actions/get-llvm-version@get-version
+
+      - name: Setup Variables
+        id: vars
+        run: |
+          if [ ${{ steps.version.outputs.LLVM_VERSION_MINOR }} -ne 0 -o ${{ steps.version.outputs.LLVM_VERSION_PATCH }} -eq 0 ]; then
+            echo ::set-output name=BASELINE_VERSION_MAJOR::$(( ${{ steps.version.outputs.LLVM_VERSION_MAJOR }} - 1))
+            echo ::set-output name=ABI_HEADERS::llvm-c
+          else
+            echo ::set-output name=BASELINE_VERSION_MAJOR::${{ steps.version.outputs.LLVM_VERSION_MAJOR }}
+            echo ::set-output name=ABI_HEADERS::.
+          fi
+
   abi-dump:
+    needs: abi-dump-setup
     runs-on: ubuntu-latest
     strategy:
       matrix:
@@ -51,11 +79,11 @@ jobs:
           - build-latest
         include:
           - name: build-baseline
-            # FIXME: Referencing the env context does not work here
-            # ref: llvmorg-${{ env.release_major }}.0.0
-            ref: llvmorg-12.0.0
+            llvm_version_major: ${{ needs.abi-dump-setup.outputs.BASELINE_VERSION_MAJOR }}
+            ref: llvmorg-${{ needs.abi-dump-setup.outputs.BASELINE_VERSION_MAJOR }}.0.0
             repo: llvm/llvm-project
           - name: build-latest
+            llvm_version_major: ${{ needs.abi-dump-setup.outputs.LLVM_VERSION_MAJOR }}
             ref: ${{ github.sha }}
             repo: ${{ github.repository }}
     steps:
@@ -78,22 +106,44 @@ jobs:
         repo: ${{ matrix.repo }}
     - name: Configure
       run: |
-        mkdir build
-        cd build
-        cmake -G Ninja -DCMAKE_BUILD_TYPE=Debug -DLLVM_TARGETS_TO_BUILD="" -DLLVM_BUILD_LLVM_DYLIB=ON -DCMAKE_C_FLAGS_DEBUG="-g -Og" -DCMAKE_CXX_FLAGS_DEBUG="-g -Og" ../llvm
+        mkdir install
+        cmake -B build -G Ninja -DCMAKE_BUILD_TYPE=Debug -DLLVM_TARGETS_TO_BUILD="" -DLLVM_BUILD_LLVM_DYLIB=ON -DCMAKE_C_FLAGS_DEBUG="-g -Og" -DCMAKE_CXX_FLAGS_DEBUG="-g -Og" -DCMAKE_INSTALL_PREFIX=`pwd`/install llvm
     - name: Build
-      run: ninja -C build libLLVM-${{ env.release_major }}.so
+      # Need to run install-LLVM twice to ensure the symlink is installed (this is a bug).
+      run: |
+        ninja -C build install-LLVM
+        ninja -C build install-LLVM
+        ninja -C build install-llvm-headers
     - name: Dump ABI
-      run: abi-dumper -lver ${{ matrix.ref }} -skip-cxx -public-headers llvm/include -o ${{ matrix.ref }}.abi.tar.gz build/lib/libLLVM-${{ env.release_major }}.so
+      run: |
+        if [ "${{ needs.abi-dump-setup.outputs.ABI_HEADERS }}" = "llvm-c" ]; then
+          nm ./install/lib/libLLVM.so | awk "/T _LLVM/ || /T LLVM/ { print $3 }" | sort -u | sed -e "s/^_//g" | cut -d ' ' -f 3 > llvm.symbols
+          # Even though the -symbols-list option doesn't seem to filter out the symbols, I believe it speeds up processing, so I'm leaving it in.
+          export EXTRA_ARGS="-symbols-list llvm.symbols"
+        else
+          touch llvm.symbols
+        fi
+        abi-dumper $EXTRA_ARGS -lver ${{ matrix.ref }} -skip-cxx -public-headers ./install/include/${{ needs.abi-dump-setup.outputs.ABI_HEADERS }} -o ${{ matrix.ref }}.abi ./install/lib/libLLVM.so
+        # Remove symbol versioning from dumps, so we can compare across major versions.
+        sed -i 's/LLVM_${{ matrix.llvm_version_major }}/LLVM_NOVERSION/' ${{ matrix.ref }}.abi
+        tar -czf ${{ matrix.ref }}.abi.tar.gz ${{ matrix.ref }}.abi
     - name: Upload ABI file
       uses: actions/upload-artifact@v1
       with:
         name: ${{ matrix.name }}
         path: ${{ matrix.ref }}.abi.tar.gz
 
+    - name: Upload symbol list file
+      if: matrix.name == 'build-baseline'
+      uses: actions/upload-artifact@v1
+      with:
+        name: symbol-list
+        path: llvm.symbols
+
   abi-compare:
     runs-on: ubuntu-latest
     needs:
+      - abi-dump-setup
       - abi-dump
     steps:
       - name: Download baseline
@@ -104,10 +154,20 @@ jobs:
         uses: actions/download-artifact@v1
         with:
           name: build-latest
+      - name: Download symbol list
+        uses: actions/download-artifact@v1
+        with:
+          name: symbol-list
+
       - name: Install abi-compliance-checker
         run: sudo apt-get install abi-compliance-checker
       - name: Compare ABI
-        run: abi-compliance-checker -l libLLVM-${{ env.release_major}}.so -old build-baseline/*.tar.gz -new build-latest/*.tar.gz
+        run: |
+          if [ -s symbol-list/llvm.symbols ]; then
+            # This option doesn't seem to work with the ABI dumper, so passing it here.
+            export EXTRA_ARGS="-symbols-list symbol-list/llvm.symbols"
+          fi
+          abi-compliance-checker $EXTRA_ARGS -l libLLVM.so -old build-baseline/*.tar.gz -new build-latest/*.tar.gz || test "${{ needs.abi-dump-setup.outputs.ABI_HEADERS }}" = "llvm-c"
       - name: Upload ABI Comparison
         if: always()
         uses: actions/upload-artifact@v1

From b6d2402e319be00592908b2c9cb63fccdb481008 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20Storsj=C3=B6?= <martin@martin.st>
Date: Tue, 2 Feb 2021 15:08:17 +0200
Subject: [PATCH 026/318] [docs] Add release notes for things I've done for the
 12.x release branch.

---
 clang/docs/ReleaseNotes.rst |  3 +++
 lld/docs/ReleaseNotes.rst   | 20 ++++++++++++++++++--
 llvm/docs/ReleaseNotes.rst  | 19 +++++++++++++++++++
 3 files changed, 40 insertions(+), 2 deletions(-)

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 3001d6feb631..a34cd512ca59 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -150,6 +150,9 @@ Attribute Changes in Clang
 Windows Support
 ---------------
 
+- Implicitly add ``.exe`` suffix for MinGW targets, even when cross compiling.
+  (This matches a change from GCC 8.)
+
 C Language Changes in Clang
 ---------------------------
 
diff --git a/lld/docs/ReleaseNotes.rst b/lld/docs/ReleaseNotes.rst
index e0b17ca3e030..ea1403888eba 100644
--- a/lld/docs/ReleaseNotes.rst
+++ b/lld/docs/ReleaseNotes.rst
@@ -35,12 +35,28 @@ Breaking changes
 COFF Improvements
 -----------------
 
-* ...
+* Error out clearly if creating a DLL with too many exported symbols.
+  (`D86701 <https://reviews.llvm.org/D86701>`_)
 
 MinGW Improvements
 ------------------
 
-* ...
+* Enabled dynamicbase by default. (`D86654 <https://reviews.llvm.org/D86654>`_)
+
+* Tolerate mismatches between COMDAT section sizes with different amount of
+  padding (produced by binutils) by inspecting the aux section definition.
+  (`D86659 <https://reviews.llvm.org/D86659>`_)
+
+* Support setting the subsystem version via the subsystem argument.
+  (`D88804 <https://reviews.llvm.org/D88804>`_)
+
+* Implemented the GNU -wrap option.
+  (`D89004 <https://reviews.llvm.org/D89004>`_,
+  `D91689 <https://reviews.llvm.org/D91689>`_)
+
+* Handle the ``--demangle`` and ``--no-demangle`` options.
+  (`D93950 <https://reviews.llvm.org/D93950>`_)
+
 
 MachO Improvements
 ------------------
diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst
index de8431fe3908..f2eb53778406 100644
--- a/llvm/docs/ReleaseNotes.rst
+++ b/llvm/docs/ReleaseNotes.rst
@@ -92,6 +92,25 @@ Changes to TableGen
   uses the "`...`" range punctuation (e.g., ``{0...9}``). The hyphen syntax
   is deprecated.
 
+Changes to the AArch64 Backend
+--------------------------
+
+During this release ...
+
+* Lots of improvements to generation of Windows unwind data; the unwind
+  data is optimized and written in packed form where possible, reducing
+  the size of unwind data (pdata and xdata sections) by around 60%
+  compared with LLVM 11. The generation of prologs/epilogs is tweaked
+  when targeting Windows, to increase the chances of being able to use
+  the packed unwind info format.
+
+* Support for creating Windows unwind data using ``.seh_*`` assembler
+  directives.
+
+* Produce proper assembly output for the Windows target, including
+  ``:lo12:`` relocation specifiers, to allow the assembly output
+  to actually be assembled.
+
 Changes to the ARM Backend
 --------------------------
 

From 0db882a0f59afcd7f76d716ca2e04f2d6d92aa03 Mon Sep 17 00:00:00 2001
From: Tom Stellard <tstellar@redhat.com>
Date: Mon, 1 Feb 2021 10:48:29 -0800
Subject: [PATCH 027/318] workflows: Fix libclc tests

---
 .github/workflows/libclc-tests.yml | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/libclc-tests.yml b/.github/workflows/libclc-tests.yml
index 2f1eb2939ea2..188eecfc3b89 100644
--- a/.github/workflows/libclc-tests.yml
+++ b/.github/workflows/libclc-tests.yml
@@ -45,9 +45,9 @@ jobs:
         cmake_args: -G Ninja  -DLLVM_ENABLE_PROJECTS="clang" -DCMAKE_BUILD_TYPE=Release
         build_target: ""
     - name: Build and test libclc
+      # spirv targets require llvm-spirv, so skip building them until we figure out
+      # how to install this tool.
       run: |
-        mkdir libclc-build
-        cd libclc-build
-        cmake -G Ninja ../libclc -DLLVM_CONFIG=../build/bin/llvm-config
-        ninja
-        ninja test
+        cmake -G Ninja -S libclc -B libclc-build -DLLVM_CONFIG=`pwd`/build/bin/llvm-config -DLIBCLC_TARGETS_TO_BUILD="amdgcn--;amdgcn--amdhsa;r600--;nvptx--;nvptx64--;nvptx--nvidiacl;nvptx64--nvidiacl"
+        ninja -C libclc-build
+        ninja -C libclc-build test

From c0097c784179e6f927ed8ae6b28796faee2fea61 Mon Sep 17 00:00:00 2001
From: Atmn Patel <atmndp@gmail.com>
Date: Sun, 31 Jan 2021 19:18:41 -0500
Subject: [PATCH 028/318] [OpenMP][Libomptarget] Remove possible harmful copy
 constructor call for RTLsTy

From https://bugs.llvm.org/show_bug.cgi?id=48973, we know that
`std::call_once(PM->RTLs.initFlag, &RTLsTy::LoadRTLs, PM->RTLs)` causes compile
time problems in libstdc++v3 5.3.1. This is because there was a defect in the
standard regarding the `call_once` (LWG 2442). This was fixed in libstdc++ soon
thereafter, but there are likely other standard libraries where this will fail.

By matching this function call with the other one, we fix this bug.

Differential Revision: https://reviews.llvm.org/D95769
---
 openmp/libomptarget/src/interface.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/openmp/libomptarget/src/interface.cpp b/openmp/libomptarget/src/interface.cpp
index 239570935cb2..cf6d36960c75 100644
--- a/openmp/libomptarget/src/interface.cpp
+++ b/openmp/libomptarget/src/interface.cpp
@@ -94,7 +94,7 @@ EXTERN void __tgt_register_requires(int64_t flags) {
 /// adds a target shared library to the target execution image
 EXTERN void __tgt_register_lib(__tgt_bin_desc *desc) {
   TIMESCOPE();
-  std::call_once(PM->RTLs.initFlag, &RTLsTy::LoadRTLs, PM->RTLs);
+  std::call_once(PM->RTLs.initFlag, &RTLsTy::LoadRTLs, &PM->RTLs);
   for (auto &RTL : PM->RTLs.AllRTLs) {
     if (RTL.register_lib) {
       if ((*RTL.register_lib)(desc) != OFFLOAD_SUCCESS) {

From 162642bec0df760b27e66cfff046b40f1dfd2713 Mon Sep 17 00:00:00 2001
From: Juneyoung Lee <aqjune@gmail.com>
Date: Thu, 4 Feb 2021 00:07:04 +0900
Subject: [PATCH 029/318] Revert "[ConstantFold] Fold more operations to
 poison"

This reverts commit 53040a968dc2ff20931661e55f05da2ef8b964a0 due to its
bad interaction with select i1 -> and/or i1 transformation.

This fixes:
https://bugs.llvm.org/show_bug.cgi?id=49005
https://bugs.llvm.org/show_bug.cgi?id=48435

(cherry picked from commit 06829034ca64b8c83a5b20d8abe5ddbfe7af0004)
---
 clang/test/Frontend/fixed_point_unary.c       |  4 +-
 llvm/lib/IR/ConstantFold.cpp                  | 59 ++++++-------
 ...amdgpu-codegenprepare-fold-binop-select.ll |  2 +-
 .../Transforms/InstCombine/apint-shift.ll     |  2 +-
 .../canonicalize-ashr-shl-to-masking.ll       |  2 +-
 .../canonicalize-lshr-shl-to-masking.ll       |  2 +-
 .../canonicalize-shl-lshr-to-masking.ll       |  2 +-
 llvm/test/Transforms/InstCombine/icmp.ll      |  4 +-
 ...nput-masking-after-truncation-variant-a.ll |  4 +-
 ...nput-masking-after-truncation-variant-b.ll |  4 +-
 ...nput-masking-after-truncation-variant-c.ll |  4 +-
 ...nput-masking-after-truncation-variant-d.ll |  4 +-
 ...nput-masking-after-truncation-variant-e.ll |  4 +-
 ...dant-left-shift-input-masking-variant-a.ll |  4 +-
 ...dant-left-shift-input-masking-variant-b.ll |  4 +-
 ...dant-left-shift-input-masking-variant-c.ll |  4 +-
 ...dant-left-shift-input-masking-variant-d.ll |  4 +-
 ...dant-left-shift-input-masking-variant-e.ll |  4 +-
 .../InstCombine/select-of-bittest.ll          |  6 +-
 .../InstCombine/shift-add-inseltpoison.ll     | 12 +--
 llvm/test/Transforms/InstCombine/shift-add.ll | 12 +--
 .../ConstProp/InsertElement-inseltpoison.ll   |  2 +-
 .../InstSimplify/ConstProp/InsertElement.ll   |  2 +-
 .../Transforms/InstSimplify/ConstProp/cast.ll |  4 +-
 .../InstSimplify/ConstProp/poison.ll          |  4 +-
 .../InstSimplify/ConstProp/shift.ll           | 24 ++---
 .../vector-undef-elts-inseltpoison.ll         |  2 +-
 .../ConstProp/vector-undef-elts.ll            |  2 +-
 .../ConstProp/vscale-inseltpoison.ll          | 16 ++--
 .../InstSimplify/ConstProp/vscale.ll          | 16 ++--
 llvm/test/Transforms/InstSimplify/div.ll      | 39 +--------
 llvm/test/Transforms/InstSimplify/rem.ll      | 31 +------
 llvm/test/Transforms/InstSimplify/undef.ll    | 87 +++++++++----------
 llvm/test/Transforms/SROA/phi-gep.ll          |  2 +-
 llvm/test/Transforms/SROA/select-gep.ll       |  2 +-
 .../X86/insert-binop-inseltpoison.ll          |  4 +-
 .../X86/insert-binop-with-constant.ll         | 42 ++++-----
 .../VectorCombine/X86/insert-binop.ll         |  6 +-
 llvm/unittests/IR/ConstantsTest.cpp           | 25 +++---
 39 files changed, 199 insertions(+), 258 deletions(-)

diff --git a/clang/test/Frontend/fixed_point_unary.c b/clang/test/Frontend/fixed_point_unary.c
index 6ce760daba11..849e38a94bc4 100644
--- a/clang/test/Frontend/fixed_point_unary.c
+++ b/clang/test/Frontend/fixed_point_unary.c
@@ -90,7 +90,7 @@ void inc_usa() {
 // SIGNED-LABEL: @inc_uf(
 // SIGNED-NEXT:  entry:
 // SIGNED-NEXT:    [[TMP0:%.*]] = load i16, i16* @uf, align 2
-// SIGNED-NEXT:    [[TMP1:%.*]] = add i16 [[TMP0]], poison
+// SIGNED-NEXT:    [[TMP1:%.*]] = add i16 [[TMP0]], undef
 // SIGNED-NEXT:    store i16 [[TMP1]], i16* @uf, align 2
 // SIGNED-NEXT:    ret void
 //
@@ -271,7 +271,7 @@ void dec_usa() {
 // SIGNED-LABEL: @dec_uf(
 // SIGNED-NEXT:  entry:
 // SIGNED-NEXT:    [[TMP0:%.*]] = load i16, i16* @uf, align 2
-// SIGNED-NEXT:    [[TMP1:%.*]] = sub i16 [[TMP0]], poison
+// SIGNED-NEXT:    [[TMP1:%.*]] = sub i16 [[TMP0]], undef
 // SIGNED-NEXT:    store i16 [[TMP1]], i16* @uf, align 2
 // SIGNED-NEXT:    ret void
 //
diff --git a/llvm/lib/IR/ConstantFold.cpp b/llvm/lib/IR/ConstantFold.cpp
index 03cb108cc485..95dd55237e5f 100644
--- a/llvm/lib/IR/ConstantFold.cpp
+++ b/llvm/lib/IR/ConstantFold.cpp
@@ -630,7 +630,7 @@ Constant *llvm::ConstantFoldCastInstruction(unsigned opc, Constant *V,
           V.convertToInteger(IntVal, APFloat::rmTowardZero, &ignored)) {
         // Undefined behavior invoked - the destination type can't represent
         // the input constant.
-        return PoisonValue::get(DestTy);
+        return UndefValue::get(DestTy);
       }
       return ConstantInt::get(FPC->getContext(), IntVal);
     }
@@ -916,7 +916,7 @@ Constant *llvm::ConstantFoldInsertElementInstruction(Constant *Val,
 
   unsigned NumElts = ValTy->getNumElements();
   if (CIdx->uge(NumElts))
-    return PoisonValue::get(Val->getType());
+    return UndefValue::get(Val->getType());
 
   SmallVector<Constant*, 16> Result;
   Result.reserve(NumElts);
@@ -1151,21 +1151,23 @@ Constant *llvm::ConstantFoldBinaryInstruction(unsigned Opcode, Constant *C1,
     }
     case Instruction::SDiv:
     case Instruction::UDiv:
-      // X / undef -> poison
-      // X / 0 -> poison
-      if (match(C2, m_CombineOr(m_Undef(), m_Zero())))
-        return PoisonValue::get(C2->getType());
+      // X / undef -> undef
+      if (isa<UndefValue>(C2))
+        return C2;
+      // undef / 0 -> undef
       // undef / 1 -> undef
-      if (match(C2, m_One()))
+      if (match(C2, m_Zero()) || match(C2, m_One()))
         return C1;
       // undef / X -> 0       otherwise
       return Constant::getNullValue(C1->getType());
     case Instruction::URem:
     case Instruction::SRem:
-      // X % undef -> poison
-      // X % 0 -> poison
-      if (match(C2, m_CombineOr(m_Undef(), m_Zero())))
-        return PoisonValue::get(C2->getType());
+      // X % undef -> undef
+      if (match(C2, m_Undef()))
+        return C2;
+      // undef % 0 -> undef
+      if (match(C2, m_Zero()))
+        return C1;
       // undef % X -> 0       otherwise
       return Constant::getNullValue(C1->getType());
     case Instruction::Or:                          // X | undef -> -1
@@ -1173,28 +1175,28 @@ Constant *llvm::ConstantFoldBinaryInstruction(unsigned Opcode, Constant *C1,
         return C1;
       return Constant::getAllOnesValue(C1->getType()); // undef | X -> ~0
     case Instruction::LShr:
-      // X >>l undef -> poison
+      // X >>l undef -> undef
       if (isa<UndefValue>(C2))
-        return PoisonValue::get(C2->getType());
+        return C2;
       // undef >>l 0 -> undef
       if (match(C2, m_Zero()))
         return C1;
       // undef >>l X -> 0
       return Constant::getNullValue(C1->getType());
     case Instruction::AShr:
-      // X >>a undef -> poison
+      // X >>a undef -> undef
       if (isa<UndefValue>(C2))
-        return PoisonValue::get(C2->getType());
+        return C2;
       // undef >>a 0 -> undef
       if (match(C2, m_Zero()))
         return C1;
-      // TODO: undef >>a X -> poison if the shift is exact
+      // TODO: undef >>a X -> undef if the shift is exact
       // undef >>a X -> 0
       return Constant::getNullValue(C1->getType());
     case Instruction::Shl:
       // X << undef -> undef
       if (isa<UndefValue>(C2))
-        return PoisonValue::get(C2->getType());
+        return C2;
       // undef << 0 -> undef
       if (match(C2, m_Zero()))
         return C1;
@@ -1247,14 +1249,14 @@ Constant *llvm::ConstantFoldBinaryInstruction(unsigned Opcode, Constant *C1,
       if (CI2->isOne())
         return C1;                                            // X / 1 == X
       if (CI2->isZero())
-        return PoisonValue::get(CI2->getType());              // X / 0 == poison
+        return UndefValue::get(CI2->getType());               // X / 0 == undef
       break;
     case Instruction::URem:
     case Instruction::SRem:
       if (CI2->isOne())
         return Constant::getNullValue(CI2->getType());        // X % 1 == 0
       if (CI2->isZero())
-        return PoisonValue::get(CI2->getType());              // X % 0 == poison
+        return UndefValue::get(CI2->getType());               // X % 0 == undef
       break;
     case Instruction::And:
       if (CI2->isZero()) return C2;                           // X & 0 == 0
@@ -1368,7 +1370,7 @@ Constant *llvm::ConstantFoldBinaryInstruction(unsigned Opcode, Constant *C1,
       case Instruction::SDiv:
         assert(!CI2->isZero() && "Div by zero handled above");
         if (C2V.isAllOnesValue() && C1V.isMinSignedValue())
-          return PoisonValue::get(CI1->getType());   // MIN_INT / -1 -> poison
+          return UndefValue::get(CI1->getType());   // MIN_INT / -1 -> undef
         return ConstantInt::get(CI1->getContext(), C1V.sdiv(C2V));
       case Instruction::URem:
         assert(!CI2->isZero() && "Div by zero handled above");
@@ -1376,7 +1378,7 @@ Constant *llvm::ConstantFoldBinaryInstruction(unsigned Opcode, Constant *C1,
       case Instruction::SRem:
         assert(!CI2->isZero() && "Div by zero handled above");
         if (C2V.isAllOnesValue() && C1V.isMinSignedValue())
-          return PoisonValue::get(CI1->getType());   // MIN_INT % -1 -> poison
+          return UndefValue::get(CI1->getType());   // MIN_INT % -1 -> undef
         return ConstantInt::get(CI1->getContext(), C1V.srem(C2V));
       case Instruction::And:
         return ConstantInt::get(CI1->getContext(), C1V & C2V);
@@ -1387,15 +1389,15 @@ Constant *llvm::ConstantFoldBinaryInstruction(unsigned Opcode, Constant *C1,
       case Instruction::Shl:
         if (C2V.ult(C1V.getBitWidth()))
           return ConstantInt::get(CI1->getContext(), C1V.shl(C2V));
-        return PoisonValue::get(C1->getType()); // too big shift is poison
+        return UndefValue::get(C1->getType()); // too big shift is undef
       case Instruction::LShr:
         if (C2V.ult(C1V.getBitWidth()))
           return ConstantInt::get(CI1->getContext(), C1V.lshr(C2V));
-        return PoisonValue::get(C1->getType()); // too big shift is poison
+        return UndefValue::get(C1->getType()); // too big shift is undef
       case Instruction::AShr:
         if (C2V.ult(C1V.getBitWidth()))
           return ConstantInt::get(CI1->getContext(), C1V.ashr(C2V));
-        return PoisonValue::get(C1->getType()); // too big shift is poison
+        return UndefValue::get(C1->getType()); // too big shift is undef
       }
     }
 
@@ -1441,7 +1443,7 @@ Constant *llvm::ConstantFoldBinaryInstruction(unsigned Opcode, Constant *C1,
     // Fast path for splatted constants.
     if (Constant *C2Splat = C2->getSplatValue()) {
       if (Instruction::isIntDivRem(Opcode) && C2Splat->isNullValue())
-        return PoisonValue::get(VTy);
+        return UndefValue::get(VTy);
       if (Constant *C1Splat = C1->getSplatValue()) {
         return ConstantVector::getSplat(
             VTy->getElementCount(),
@@ -1458,9 +1460,9 @@ Constant *llvm::ConstantFoldBinaryInstruction(unsigned Opcode, Constant *C1,
         Constant *LHS = ConstantExpr::getExtractElement(C1, ExtractIdx);
         Constant *RHS = ConstantExpr::getExtractElement(C2, ExtractIdx);
 
-        // If any element of a divisor vector is zero, the whole op is poison.
+        // If any element of a divisor vector is zero, the whole op is undef.
         if (Instruction::isIntDivRem(Opcode) && RHS->isNullValue())
-          return PoisonValue::get(VTy);
+          return UndefValue::get(VTy);
 
         Result.push_back(ConstantExpr::get(Opcode, LHS, RHS));
       }
@@ -2343,8 +2345,7 @@ Constant *llvm::ConstantFoldGetElementPtr(Type *PointeeTy, Constant *C,
     return PoisonValue::get(GEPTy);
 
   if (isa<UndefValue>(C))
-    // If inbounds, we can choose an out-of-bounds pointer as a base pointer.
-    return InBounds ? PoisonValue::get(GEPTy) : UndefValue::get(GEPTy);
+    return UndefValue::get(GEPTy);
 
   Constant *Idx0 = cast<Constant>(Idxs[0]);
   if (Idxs.size() == 1 && (Idx0->isNullValue() || isa<UndefValue>(Idx0)))
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll
index e0037f0d8e45..bfe83c7a1285 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll
@@ -42,7 +42,7 @@ define i32 @select_sdiv_rhs_const_i32(i1 %cond) {
 
 define <2 x i32> @select_sdiv_lhs_const_v2i32(i1 %cond) {
 ; IR-LABEL: @select_sdiv_lhs_const_v2i32(
-; IR-NEXT:    [[OP:%.*]] = select i1 [[COND:%.*]], <2 x i32> <i32 666, i32 poison>, <2 x i32> <i32 555, i32 1428>
+; IR-NEXT:    [[OP:%.*]] = select i1 [[COND:%.*]], <2 x i32> <i32 666, i32 undef>, <2 x i32> <i32 555, i32 1428>
 ; IR-NEXT:    ret <2 x i32> [[OP]]
 ;
 ; GCN-LABEL: select_sdiv_lhs_const_v2i32:
diff --git a/llvm/test/Transforms/InstCombine/apint-shift.ll b/llvm/test/Transforms/InstCombine/apint-shift.ll
index 908aeac0cea2..5a351efccfcc 100644
--- a/llvm/test/Transforms/InstCombine/apint-shift.ll
+++ b/llvm/test/Transforms/InstCombine/apint-shift.ll
@@ -337,7 +337,7 @@ define <2 x i1> @test16vec_nonuniform(<2 x i84> %X) {
 
 define <2 x i1> @test16vec_undef(<2 x i84> %X) {
 ; CHECK-LABEL: @test16vec_undef(
-; CHECK-NEXT:    [[TMP1:%.*]] = and <2 x i84> [[X:%.*]], <i84 16, i84 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = and <2 x i84> [[X:%.*]], <i84 16, i84 undef>
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp ne <2 x i84> [[TMP1]], zeroinitializer
 ; CHECK-NEXT:    ret <2 x i1> [[CMP]]
 ;
diff --git a/llvm/test/Transforms/InstCombine/canonicalize-ashr-shl-to-masking.ll b/llvm/test/Transforms/InstCombine/canonicalize-ashr-shl-to-masking.ll
index 8d29372c3a72..ba0d32ee3768 100644
--- a/llvm/test/Transforms/InstCombine/canonicalize-ashr-shl-to-masking.ll
+++ b/llvm/test/Transforms/InstCombine/canonicalize-ashr-shl-to-masking.ll
@@ -418,7 +418,7 @@ define <3 x i8> @positive_sameconst_vec_undef1(<3 x i8> %x) {
 
 define <3 x i8> @positive_sameconst_vec_undef2(<3 x i8> %x) {
 ; CHECK-LABEL: @positive_sameconst_vec_undef2(
-; CHECK-NEXT:    [[RET:%.*]] = and <3 x i8> [[X:%.*]], <i8 -8, i8 poison, i8 -8>
+; CHECK-NEXT:    [[RET:%.*]] = and <3 x i8> [[X:%.*]], <i8 -8, i8 undef, i8 -8>
 ; CHECK-NEXT:    ret <3 x i8> [[RET]]
 ;
   %tmp0 = ashr <3 x i8> %x, <i8 3, i8 undef, i8 3>
diff --git a/llvm/test/Transforms/InstCombine/canonicalize-lshr-shl-to-masking.ll b/llvm/test/Transforms/InstCombine/canonicalize-lshr-shl-to-masking.ll
index 40bc4aaab21c..445f6406b3d2 100644
--- a/llvm/test/Transforms/InstCombine/canonicalize-lshr-shl-to-masking.ll
+++ b/llvm/test/Transforms/InstCombine/canonicalize-lshr-shl-to-masking.ll
@@ -418,7 +418,7 @@ define <3 x i8> @positive_sameconst_vec_undef1(<3 x i8> %x) {
 
 define <3 x i8> @positive_sameconst_vec_undef2(<3 x i8> %x) {
 ; CHECK-LABEL: @positive_sameconst_vec_undef2(
-; CHECK-NEXT:    [[RET:%.*]] = and <3 x i8> [[X:%.*]], <i8 -8, i8 poison, i8 -8>
+; CHECK-NEXT:    [[RET:%.*]] = and <3 x i8> [[X:%.*]], <i8 -8, i8 undef, i8 -8>
 ; CHECK-NEXT:    ret <3 x i8> [[RET]]
 ;
   %tmp0 = lshr <3 x i8> %x, <i8 3, i8 undef, i8 3>
diff --git a/llvm/test/Transforms/InstCombine/canonicalize-shl-lshr-to-masking.ll b/llvm/test/Transforms/InstCombine/canonicalize-shl-lshr-to-masking.ll
index 45aa22aa808f..9de0b337de28 100644
--- a/llvm/test/Transforms/InstCombine/canonicalize-shl-lshr-to-masking.ll
+++ b/llvm/test/Transforms/InstCombine/canonicalize-shl-lshr-to-masking.ll
@@ -171,7 +171,7 @@ define <3 x i32> @positive_sameconst_vec_undef1(<3 x i32> %x) {
 
 define <3 x i32> @positive_sameconst_vec_undef2(<3 x i32> %x) {
 ; CHECK-LABEL: @positive_sameconst_vec_undef2(
-; CHECK-NEXT:    [[RET:%.*]] = and <3 x i32> [[X:%.*]], <i32 134217727, i32 poison, i32 134217727>
+; CHECK-NEXT:    [[RET:%.*]] = and <3 x i32> [[X:%.*]], <i32 134217727, i32 undef, i32 134217727>
 ; CHECK-NEXT:    ret <3 x i32> [[RET]]
 ;
   %tmp0 = shl <3 x i32> %x, <i32 5, i32 undef, i32 5>
diff --git a/llvm/test/Transforms/InstCombine/icmp.ll b/llvm/test/Transforms/InstCombine/icmp.ll
index b48466e678d8..5e6bed4e280f 100644
--- a/llvm/test/Transforms/InstCombine/icmp.ll
+++ b/llvm/test/Transforms/InstCombine/icmp.ll
@@ -2876,7 +2876,7 @@ define <2 x i1> @icmp_and_or_lshr_cst_vec_nonuniform(<2 x i32> %x) {
 
 define <2 x i1> @icmp_and_or_lshr_cst_vec_undef(<2 x i32> %x) {
 ; CHECK-LABEL: @icmp_and_or_lshr_cst_vec_undef(
-; CHECK-NEXT:    [[TMP1:%.*]] = and <2 x i32> [[X:%.*]], <i32 3, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = and <2 x i32> [[X:%.*]], <i32 3, i32 -1>
 ; CHECK-NEXT:    [[RET:%.*]] = icmp ne <2 x i32> [[TMP1]], zeroinitializer
 ; CHECK-NEXT:    ret <2 x i1> [[RET]]
 ;
@@ -2920,7 +2920,7 @@ define <2 x i1> @icmp_and_or_lshr_cst_vec_nonuniform_commute(<2 x i32> %xp) {
 define <2 x i1> @icmp_and_or_lshr_cst_vec_undef_commute(<2 x i32> %xp) {
 ; CHECK-LABEL: @icmp_and_or_lshr_cst_vec_undef_commute(
 ; CHECK-NEXT:    [[X:%.*]] = srem <2 x i32> [[XP:%.*]], <i32 42, i32 42>
-; CHECK-NEXT:    [[TMP1:%.*]] = and <2 x i32> [[X]], <i32 3, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = and <2 x i32> [[X]], <i32 3, i32 -1>
 ; CHECK-NEXT:    [[RET:%.*]] = icmp ne <2 x i32> [[TMP1]], zeroinitializer
 ; CHECK-NEXT:    ret <2 x i1> [[RET]]
 ;
diff --git a/llvm/test/Transforms/InstCombine/partally-redundant-left-shift-input-masking-after-truncation-variant-a.ll b/llvm/test/Transforms/InstCombine/partally-redundant-left-shift-input-masking-after-truncation-variant-a.ll
index e49c381fcd16..89c16a0949e8 100644
--- a/llvm/test/Transforms/InstCombine/partally-redundant-left-shift-input-masking-after-truncation-variant-a.ll
+++ b/llvm/test/Transforms/InstCombine/partally-redundant-left-shift-input-masking-after-truncation-variant-a.ll
@@ -103,7 +103,7 @@ define <8 x i32> @t2_vec_splat_undef(<8 x i64> %x, <8 x i32> %nbits) {
 ; CHECK-NEXT:    call void @use8xi32(<8 x i32> [[T4]])
 ; CHECK-NEXT:    [[TMP1:%.*]] = trunc <8 x i64> [[X:%.*]] to <8 x i32>
 ; CHECK-NEXT:    [[TMP2:%.*]] = shl <8 x i32> [[TMP1]], [[T4]]
-; CHECK-NEXT:    [[T7:%.*]] = and <8 x i32> [[TMP2]], <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 poison, i32 2147483647>
+; CHECK-NEXT:    [[T7:%.*]] = and <8 x i32> [[TMP2]], <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 undef, i32 2147483647>
 ; CHECK-NEXT:    ret <8 x i32> [[T7]]
 ;
   %t0 = add <8 x i32> %nbits, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 undef, i32 -1>
@@ -138,7 +138,7 @@ define <8 x i32> @t3_vec_nonsplat(<8 x i64> %x, <8 x i32> %nbits) {
 ; CHECK-NEXT:    call void @use8xi32(<8 x i32> [[T4]])
 ; CHECK-NEXT:    [[TMP1:%.*]] = trunc <8 x i64> [[X:%.*]] to <8 x i32>
 ; CHECK-NEXT:    [[TMP2:%.*]] = shl <8 x i32> [[TMP1]], [[T4]]
-; CHECK-NEXT:    [[T7:%.*]] = and <8 x i32> [[TMP2]], <i32 poison, i32 0, i32 1, i32 2147483647, i32 -1, i32 -1, i32 -1, i32 -1>
+; CHECK-NEXT:    [[T7:%.*]] = and <8 x i32> [[TMP2]], <i32 undef, i32 0, i32 1, i32 2147483647, i32 -1, i32 -1, i32 -1, i32 -1>
 ; CHECK-NEXT:    ret <8 x i32> [[T7]]
 ;
   %t0 = add <8 x i32> %nbits, <i32 -33, i32 -32, i32 -31, i32 -1, i32 0, i32 1, i32 31, i32 32>
diff --git a/llvm/test/Transforms/InstCombine/partally-redundant-left-shift-input-masking-after-truncation-variant-b.ll b/llvm/test/Transforms/InstCombine/partally-redundant-left-shift-input-masking-after-truncation-variant-b.ll
index 20f38deeb0d5..8aef637c6a74 100644
--- a/llvm/test/Transforms/InstCombine/partally-redundant-left-shift-input-masking-after-truncation-variant-b.ll
+++ b/llvm/test/Transforms/InstCombine/partally-redundant-left-shift-input-masking-after-truncation-variant-b.ll
@@ -103,7 +103,7 @@ define <8 x i32> @t2_vec_splat_undef(<8 x i64> %x, <8 x i32> %nbits) {
 ; CHECK-NEXT:    call void @use8xi32(<8 x i32> [[T4]])
 ; CHECK-NEXT:    [[TMP1:%.*]] = trunc <8 x i64> [[X:%.*]] to <8 x i32>
 ; CHECK-NEXT:    [[TMP2:%.*]] = shl <8 x i32> [[TMP1]], [[T4]]
-; CHECK-NEXT:    [[T7:%.*]] = and <8 x i32> [[TMP2]], <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 poison, i32 2147483647>
+; CHECK-NEXT:    [[T7:%.*]] = and <8 x i32> [[TMP2]], <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 undef, i32 2147483647>
 ; CHECK-NEXT:    ret <8 x i32> [[T7]]
 ;
   %t0 = add <8 x i32> %nbits, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 undef, i32 -1>
@@ -138,7 +138,7 @@ define <8 x i32> @t3_vec_nonsplat(<8 x i64> %x, <8 x i32> %nbits) {
 ; CHECK-NEXT:    call void @use8xi32(<8 x i32> [[T4]])
 ; CHECK-NEXT:    [[TMP1:%.*]] = trunc <8 x i64> [[X:%.*]] to <8 x i32>
 ; CHECK-NEXT:    [[TMP2:%.*]] = shl <8 x i32> [[TMP1]], [[T4]]
-; CHECK-NEXT:    [[T7:%.*]] = and <8 x i32> [[TMP2]], <i32 poison, i32 0, i32 1, i32 2147483647, i32 -1, i32 -1, i32 -1, i32 -1>
+; CHECK-NEXT:    [[T7:%.*]] = and <8 x i32> [[TMP2]], <i32 undef, i32 0, i32 1, i32 2147483647, i32 -1, i32 -1, i32 -1, i32 -1>
 ; CHECK-NEXT:    ret <8 x i32> [[T7]]
 ;
   %t0 = add <8 x i32> %nbits, <i32 -33, i32 -32, i32 -31, i32 -1, i32 0, i32 1, i32 31, i32 32>
diff --git a/llvm/test/Transforms/InstCombine/partally-redundant-left-shift-input-masking-after-truncation-variant-c.ll b/llvm/test/Transforms/InstCombine/partally-redundant-left-shift-input-masking-after-truncation-variant-c.ll
index 562280391c5e..61f25e6ca0b1 100644
--- a/llvm/test/Transforms/InstCombine/partally-redundant-left-shift-input-masking-after-truncation-variant-c.ll
+++ b/llvm/test/Transforms/InstCombine/partally-redundant-left-shift-input-masking-after-truncation-variant-c.ll
@@ -83,7 +83,7 @@ define <8 x i32> @t2_vec_splat_undef(<8 x i64> %x, <8 x i32> %nbits) {
 ; CHECK-NEXT:    call void @use8xi32(<8 x i32> [[T2]])
 ; CHECK-NEXT:    [[TMP1:%.*]] = trunc <8 x i64> [[X:%.*]] to <8 x i32>
 ; CHECK-NEXT:    [[TMP2:%.*]] = shl <8 x i32> [[TMP1]], [[T2]]
-; CHECK-NEXT:    [[T5:%.*]] = and <8 x i32> [[TMP2]], <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 poison, i32 2147483647>
+; CHECK-NEXT:    [[T5:%.*]] = and <8 x i32> [[TMP2]], <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 undef, i32 2147483647>
 ; CHECK-NEXT:    ret <8 x i32> [[T5]]
 ;
   %t0 = zext <8 x i32> %nbits to <8 x i64>
@@ -110,7 +110,7 @@ define <8 x i32> @t3_vec_nonsplat(<8 x i64> %x, <8 x i32> %nbits) {
 ; CHECK-NEXT:    call void @use8xi32(<8 x i32> [[T2]])
 ; CHECK-NEXT:    [[TMP1:%.*]] = trunc <8 x i64> [[X:%.*]] to <8 x i32>
 ; CHECK-NEXT:    [[TMP2:%.*]] = shl <8 x i32> [[TMP1]], [[T2]]
-; CHECK-NEXT:    [[T5:%.*]] = and <8 x i32> [[TMP2]], <i32 poison, i32 1, i32 2147483647, i32 -1, i32 -1, i32 -1, i32 poison, i32 poison>
+; CHECK-NEXT:    [[T5:%.*]] = and <8 x i32> [[TMP2]], <i32 undef, i32 1, i32 2147483647, i32 -1, i32 -1, i32 -1, i32 undef, i32 undef>
 ; CHECK-NEXT:    ret <8 x i32> [[T5]]
 ;
   %t0 = zext <8 x i32> %nbits to <8 x i64>
diff --git a/llvm/test/Transforms/InstCombine/partally-redundant-left-shift-input-masking-after-truncation-variant-d.ll b/llvm/test/Transforms/InstCombine/partally-redundant-left-shift-input-masking-after-truncation-variant-d.ll
index aa644e6264e4..077bb8296f3e 100644
--- a/llvm/test/Transforms/InstCombine/partally-redundant-left-shift-input-masking-after-truncation-variant-d.ll
+++ b/llvm/test/Transforms/InstCombine/partally-redundant-left-shift-input-masking-after-truncation-variant-d.ll
@@ -93,7 +93,7 @@ define <8 x i32> @t2_vec_splat_undef(<8 x i64> %x, <8 x i32> %nbits) {
 ; CHECK-NEXT:    call void @use8xi32(<8 x i32> [[T3]])
 ; CHECK-NEXT:    [[TMP1:%.*]] = trunc <8 x i64> [[X:%.*]] to <8 x i32>
 ; CHECK-NEXT:    [[TMP2:%.*]] = shl <8 x i32> [[TMP1]], [[T3]]
-; CHECK-NEXT:    [[T6:%.*]] = and <8 x i32> [[TMP2]], <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 poison, i32 2147483647>
+; CHECK-NEXT:    [[T6:%.*]] = and <8 x i32> [[TMP2]], <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 undef, i32 2147483647>
 ; CHECK-NEXT:    ret <8 x i32> [[T6]]
 ;
   %t0 = zext <8 x i32> %nbits to <8 x i64>
@@ -124,7 +124,7 @@ define <8 x i32> @t3_vec_nonsplat(<8 x i64> %x, <8 x i32> %nbits) {
 ; CHECK-NEXT:    call void @use8xi32(<8 x i32> [[T3]])
 ; CHECK-NEXT:    [[TMP1:%.*]] = trunc <8 x i64> [[X:%.*]] to <8 x i32>
 ; CHECK-NEXT:    [[TMP2:%.*]] = shl <8 x i32> [[TMP1]], [[T3]]
-; CHECK-NEXT:    [[T6:%.*]] = and <8 x i32> [[TMP2]], <i32 poison, i32 1, i32 2147483647, i32 -1, i32 -1, i32 -1, i32 poison, i32 poison>
+; CHECK-NEXT:    [[T6:%.*]] = and <8 x i32> [[TMP2]], <i32 undef, i32 1, i32 2147483647, i32 -1, i32 -1, i32 -1, i32 undef, i32 undef>
 ; CHECK-NEXT:    ret <8 x i32> [[T6]]
 ;
   %t0 = zext <8 x i32> %nbits to <8 x i64>
diff --git a/llvm/test/Transforms/InstCombine/partally-redundant-left-shift-input-masking-after-truncation-variant-e.ll b/llvm/test/Transforms/InstCombine/partally-redundant-left-shift-input-masking-after-truncation-variant-e.ll
index f2aa2894e27a..961ea5e48416 100644
--- a/llvm/test/Transforms/InstCombine/partally-redundant-left-shift-input-masking-after-truncation-variant-e.ll
+++ b/llvm/test/Transforms/InstCombine/partally-redundant-left-shift-input-masking-after-truncation-variant-e.ll
@@ -83,7 +83,7 @@ define <8 x i32> @t2_vec_splat_undef(<8 x i64> %x, <8 x i32> %nbits) {
 ; CHECK-NEXT:    call void @use8xi32(<8 x i32> [[T2]])
 ; CHECK-NEXT:    [[TMP1:%.*]] = trunc <8 x i64> [[X]] to <8 x i32>
 ; CHECK-NEXT:    [[TMP2:%.*]] = shl <8 x i32> [[TMP1]], [[T2]]
-; CHECK-NEXT:    [[T5:%.*]] = and <8 x i32> [[TMP2]], <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 poison, i32 2147483647>
+; CHECK-NEXT:    [[T5:%.*]] = and <8 x i32> [[TMP2]], <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 undef, i32 2147483647>
 ; CHECK-NEXT:    ret <8 x i32> [[T5]]
 ;
   %t0 = zext <8 x i32> %nbits to <8 x i64>
@@ -110,7 +110,7 @@ define <8 x i32> @t3_vec_nonsplat(<8 x i64> %x, <8 x i32> %nbits) {
 ; CHECK-NEXT:    call void @use8xi32(<8 x i32> [[T2]])
 ; CHECK-NEXT:    [[TMP1:%.*]] = trunc <8 x i64> [[X]] to <8 x i32>
 ; CHECK-NEXT:    [[TMP2:%.*]] = shl <8 x i32> [[TMP1]], [[T2]]
-; CHECK-NEXT:    [[T5:%.*]] = and <8 x i32> [[TMP2]], <i32 poison, i32 1, i32 2147483647, i32 -1, i32 -1, i32 -1, i32 poison, i32 poison>
+; CHECK-NEXT:    [[T5:%.*]] = and <8 x i32> [[TMP2]], <i32 undef, i32 1, i32 2147483647, i32 -1, i32 -1, i32 -1, i32 undef, i32 undef>
 ; CHECK-NEXT:    ret <8 x i32> [[T5]]
 ;
   %t0 = zext <8 x i32> %nbits to <8 x i64>
diff --git a/llvm/test/Transforms/InstCombine/partally-redundant-left-shift-input-masking-variant-a.ll b/llvm/test/Transforms/InstCombine/partally-redundant-left-shift-input-masking-variant-a.ll
index 882117fe3480..41a71aa98f40 100644
--- a/llvm/test/Transforms/InstCombine/partally-redundant-left-shift-input-masking-variant-a.ll
+++ b/llvm/test/Transforms/InstCombine/partally-redundant-left-shift-input-masking-variant-a.ll
@@ -82,7 +82,7 @@ define <8 x i32> @t1_vec_splat_undef(<8 x i32> %x, <8 x i32> %nbits) {
 ; CHECK-NEXT:    call void @use8xi32(<8 x i32> [[T2]])
 ; CHECK-NEXT:    call void @use8xi32(<8 x i32> [[T4]])
 ; CHECK-NEXT:    [[TMP1:%.*]] = shl <8 x i32> [[X:%.*]], [[T4]]
-; CHECK-NEXT:    [[T5:%.*]] = and <8 x i32> [[TMP1]], <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 poison, i32 2147483647>
+; CHECK-NEXT:    [[T5:%.*]] = and <8 x i32> [[TMP1]], <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 undef, i32 2147483647>
 ; CHECK-NEXT:    ret <8 x i32> [[T5]]
 ;
   %t0 = add <8 x i32> %nbits, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 undef, i32 -1>
@@ -109,7 +109,7 @@ define <8 x i32> @t2_vec_nonsplat(<8 x i32> %x, <8 x i32> %nbits) {
 ; CHECK-NEXT:    call void @use8xi32(<8 x i32> [[T2]])
 ; CHECK-NEXT:    call void @use8xi32(<8 x i32> [[T4]])
 ; CHECK-NEXT:    [[TMP1:%.*]] = shl <8 x i32> [[X:%.*]], [[T4]]
-; CHECK-NEXT:    [[T5:%.*]] = and <8 x i32> [[TMP1]], <i32 poison, i32 0, i32 1, i32 2147483647, i32 -1, i32 -1, i32 -1, i32 poison>
+; CHECK-NEXT:    [[T5:%.*]] = and <8 x i32> [[TMP1]], <i32 undef, i32 0, i32 1, i32 2147483647, i32 -1, i32 -1, i32 -1, i32 undef>
 ; CHECK-NEXT:    ret <8 x i32> [[T5]]
 ;
   %t0 = add <8 x i32> %nbits, <i32 -33, i32 -32, i32 -31, i32 -1, i32 0, i32 1, i32 31, i32 32>
diff --git a/llvm/test/Transforms/InstCombine/partally-redundant-left-shift-input-masking-variant-b.ll b/llvm/test/Transforms/InstCombine/partally-redundant-left-shift-input-masking-variant-b.ll
index e92875d79207..787135229148 100644
--- a/llvm/test/Transforms/InstCombine/partally-redundant-left-shift-input-masking-variant-b.ll
+++ b/llvm/test/Transforms/InstCombine/partally-redundant-left-shift-input-masking-variant-b.ll
@@ -82,7 +82,7 @@ define <8 x i32> @t1_vec_splat_undef(<8 x i32> %x, <8 x i32> %nbits) {
 ; CHECK-NEXT:    call void @use8xi32(<8 x i32> [[T2]])
 ; CHECK-NEXT:    call void @use8xi32(<8 x i32> [[T4]])
 ; CHECK-NEXT:    [[TMP1:%.*]] = shl <8 x i32> [[X:%.*]], [[T4]]
-; CHECK-NEXT:    [[T5:%.*]] = and <8 x i32> [[TMP1]], <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 poison, i32 2147483647>
+; CHECK-NEXT:    [[T5:%.*]] = and <8 x i32> [[TMP1]], <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 undef, i32 2147483647>
 ; CHECK-NEXT:    ret <8 x i32> [[T5]]
 ;
   %t0 = add <8 x i32> %nbits, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 undef, i32 -1>
@@ -109,7 +109,7 @@ define <8 x i32> @t2_vec_nonsplat(<8 x i32> %x, <8 x i32> %nbits) {
 ; CHECK-NEXT:    call void @use8xi32(<8 x i32> [[T2]])
 ; CHECK-NEXT:    call void @use8xi32(<8 x i32> [[T4]])
 ; CHECK-NEXT:    [[TMP1:%.*]] = shl <8 x i32> [[X:%.*]], [[T4]]
-; CHECK-NEXT:    [[T5:%.*]] = and <8 x i32> [[TMP1]], <i32 poison, i32 0, i32 1, i32 2147483647, i32 -1, i32 -1, i32 -1, i32 poison>
+; CHECK-NEXT:    [[T5:%.*]] = and <8 x i32> [[TMP1]], <i32 undef, i32 0, i32 1, i32 2147483647, i32 -1, i32 -1, i32 -1, i32 undef>
 ; CHECK-NEXT:    ret <8 x i32> [[T5]]
 ;
   %t0 = add <8 x i32> %nbits, <i32 -33, i32 -32, i32 -31, i32 -1, i32 0, i32 1, i32 31, i32 32>
diff --git a/llvm/test/Transforms/InstCombine/partally-redundant-left-shift-input-masking-variant-c.ll b/llvm/test/Transforms/InstCombine/partally-redundant-left-shift-input-masking-variant-c.ll
index b8066cef2b40..c0959d9e1ac6 100644
--- a/llvm/test/Transforms/InstCombine/partally-redundant-left-shift-input-masking-variant-c.ll
+++ b/llvm/test/Transforms/InstCombine/partally-redundant-left-shift-input-masking-variant-c.ll
@@ -62,7 +62,7 @@ define <8 x i32> @t1_vec_splat_undef(<8 x i32> %x, <8 x i32> %nbits) {
 ; CHECK-NEXT:    call void @use8xi32(<8 x i32> [[T0]])
 ; CHECK-NEXT:    call void @use8xi32(<8 x i32> [[T2]])
 ; CHECK-NEXT:    [[TMP1:%.*]] = shl <8 x i32> [[X:%.*]], [[T2]]
-; CHECK-NEXT:    [[T3:%.*]] = and <8 x i32> [[TMP1]], <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 poison, i32 2147483647>
+; CHECK-NEXT:    [[T3:%.*]] = and <8 x i32> [[TMP1]], <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 undef, i32 2147483647>
 ; CHECK-NEXT:    ret <8 x i32> [[T3]]
 ;
   %t0 = lshr <8 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 undef, i32 -1>, %nbits
@@ -81,7 +81,7 @@ define <8 x i32> @t1_vec_nonsplat(<8 x i32> %x, <8 x i32> %nbits) {
 ; CHECK-NEXT:    call void @use8xi32(<8 x i32> [[T0]])
 ; CHECK-NEXT:    call void @use8xi32(<8 x i32> [[T2]])
 ; CHECK-NEXT:    [[TMP1:%.*]] = shl <8 x i32> [[X:%.*]], [[T2]]
-; CHECK-NEXT:    [[T3:%.*]] = and <8 x i32> [[TMP1]], <i32 poison, i32 1, i32 2147483647, i32 -1, i32 -1, i32 -1, i32 -1, i32 poison>
+; CHECK-NEXT:    [[T3:%.*]] = and <8 x i32> [[TMP1]], <i32 undef, i32 1, i32 2147483647, i32 -1, i32 -1, i32 -1, i32 -1, i32 undef>
 ; CHECK-NEXT:    ret <8 x i32> [[T3]]
 ;
   %t0 = lshr <8 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, %nbits
diff --git a/llvm/test/Transforms/InstCombine/partally-redundant-left-shift-input-masking-variant-d.ll b/llvm/test/Transforms/InstCombine/partally-redundant-left-shift-input-masking-variant-d.ll
index 20b322c0b647..5e0f0be2b1ad 100644
--- a/llvm/test/Transforms/InstCombine/partally-redundant-left-shift-input-masking-variant-d.ll
+++ b/llvm/test/Transforms/InstCombine/partally-redundant-left-shift-input-masking-variant-d.ll
@@ -72,7 +72,7 @@ define <8 x i32> @t2_vec_splat_undef(<8 x i32> %x, <8 x i32> %nbits) {
 ; CHECK-NEXT:    call void @use8xi32(<8 x i32> [[T1]])
 ; CHECK-NEXT:    call void @use8xi32(<8 x i32> [[T3]])
 ; CHECK-NEXT:    [[TMP1:%.*]] = shl <8 x i32> [[X:%.*]], [[T3]]
-; CHECK-NEXT:    [[T4:%.*]] = and <8 x i32> [[TMP1]], <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 poison, i32 2147483647>
+; CHECK-NEXT:    [[T4:%.*]] = and <8 x i32> [[TMP1]], <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 undef, i32 2147483647>
 ; CHECK-NEXT:    ret <8 x i32> [[T4]]
 ;
   %t0 = shl <8 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 undef, i32 -1>, %nbits
@@ -95,7 +95,7 @@ define <8 x i32> @t2_vec_nonsplat(<8 x i32> %x, <8 x i32> %nbits) {
 ; CHECK-NEXT:    call void @use8xi32(<8 x i32> [[T1]])
 ; CHECK-NEXT:    call void @use8xi32(<8 x i32> [[T3]])
 ; CHECK-NEXT:    [[TMP1:%.*]] = shl <8 x i32> [[X:%.*]], [[T3]]
-; CHECK-NEXT:    [[T4:%.*]] = and <8 x i32> [[TMP1]], <i32 poison, i32 1, i32 2147483647, i32 -1, i32 -1, i32 -1, i32 -1, i32 poison>
+; CHECK-NEXT:    [[T4:%.*]] = and <8 x i32> [[TMP1]], <i32 undef, i32 1, i32 2147483647, i32 -1, i32 -1, i32 -1, i32 -1, i32 undef>
 ; CHECK-NEXT:    ret <8 x i32> [[T4]]
 ;
   %t0 = shl <8 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, %nbits
diff --git a/llvm/test/Transforms/InstCombine/partally-redundant-left-shift-input-masking-variant-e.ll b/llvm/test/Transforms/InstCombine/partally-redundant-left-shift-input-masking-variant-e.ll
index 46f5b0c2f213..2e335f0083c1 100644
--- a/llvm/test/Transforms/InstCombine/partally-redundant-left-shift-input-masking-variant-e.ll
+++ b/llvm/test/Transforms/InstCombine/partally-redundant-left-shift-input-masking-variant-e.ll
@@ -62,7 +62,7 @@ define <8 x i32> @t1_vec_splat_undef(<8 x i32> %x, <8 x i32> %nbits) {
 ; CHECK-NEXT:    call void @use8xi32(<8 x i32> [[T0]])
 ; CHECK-NEXT:    call void @use8xi32(<8 x i32> [[T2]])
 ; CHECK-NEXT:    [[TMP1:%.*]] = shl <8 x i32> [[X]], [[T2]]
-; CHECK-NEXT:    [[T3:%.*]] = and <8 x i32> [[TMP1]], <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 poison, i32 2147483647>
+; CHECK-NEXT:    [[T3:%.*]] = and <8 x i32> [[TMP1]], <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 undef, i32 2147483647>
 ; CHECK-NEXT:    ret <8 x i32> [[T3]]
 ;
   %t0 = shl <8 x i32> %x, %nbits
@@ -81,7 +81,7 @@ define <8 x i32> @t1_vec_nonsplat(<8 x i32> %x, <8 x i32> %nbits) {
 ; CHECK-NEXT:    call void @use8xi32(<8 x i32> [[T0]])
 ; CHECK-NEXT:    call void @use8xi32(<8 x i32> [[T2]])
 ; CHECK-NEXT:    [[TMP1:%.*]] = shl <8 x i32> [[X]], [[T2]]
-; CHECK-NEXT:    [[T3:%.*]] = and <8 x i32> [[TMP1]], <i32 poison, i32 1, i32 2147483647, i32 -1, i32 -1, i32 -1, i32 -1, i32 poison>
+; CHECK-NEXT:    [[T3:%.*]] = and <8 x i32> [[TMP1]], <i32 undef, i32 1, i32 2147483647, i32 -1, i32 -1, i32 -1, i32 -1, i32 undef>
 ; CHECK-NEXT:    ret <8 x i32> [[T3]]
 ;
   %t0 = shl <8 x i32> %x, %nbits
diff --git a/llvm/test/Transforms/InstCombine/select-of-bittest.ll b/llvm/test/Transforms/InstCombine/select-of-bittest.ll
index c85bcba82e97..d9bef00b2f78 100644
--- a/llvm/test/Transforms/InstCombine/select-of-bittest.ll
+++ b/llvm/test/Transforms/InstCombine/select-of-bittest.ll
@@ -82,7 +82,7 @@ define <2 x i32> @and_lshr_and_vec_v2(<2 x i32> %arg) {
 
 define <3 x i32> @and_lshr_and_vec_undef(<3 x i32> %arg) {
 ; CHECK-LABEL: @and_lshr_and_vec_undef(
-; CHECK-NEXT:    [[TMP1:%.*]] = and <3 x i32> [[ARG:%.*]], <i32 3, i32 poison, i32 3>
+; CHECK-NEXT:    [[TMP1:%.*]] = and <3 x i32> [[ARG:%.*]], <i32 3, i32 undef, i32 3>
 ; CHECK-NEXT:    [[TMP2:%.*]] = icmp ne <3 x i32> [[TMP1]], zeroinitializer
 ; CHECK-NEXT:    [[TMP4:%.*]] = zext <3 x i1> [[TMP2]] to <3 x i32>
 ; CHECK-NEXT:    ret <3 x i32> [[TMP4]]
@@ -91,7 +91,6 @@ define <3 x i32> @and_lshr_and_vec_undef(<3 x i32> %arg) {
   %tmp1 = icmp eq <3 x i32> %tmp, <i32 0, i32 undef, i32 0>
   %tmp2 = lshr <3 x i32> %arg, <i32 1, i32 undef, i32 1>
   %tmp3 = and <3 x i32> %tmp2, <i32 1, i32 undef, i32 1>
-  ; The second element of %tmp4 is poison because it is (undef ? poison : undef).
   %tmp4 = select <3 x i1> %tmp1, <3 x i32> %tmp3, <3 x i32> <i32 1, i32 undef, i32 1>
   ret <3 x i32> %tmp4
 }
@@ -223,7 +222,7 @@ define <2 x i32> @f_var0_vec(<2 x i32> %arg, <2 x i32> %arg1) {
 
 define <3 x i32> @f_var0_vec_undef(<3 x i32> %arg, <3 x i32> %arg1) {
 ; CHECK-LABEL: @f_var0_vec_undef(
-; CHECK-NEXT:    [[TMP1:%.*]] = or <3 x i32> [[ARG1:%.*]], <i32 2, i32 poison, i32 2>
+; CHECK-NEXT:    [[TMP1:%.*]] = or <3 x i32> [[ARG1:%.*]], <i32 2, i32 undef, i32 2>
 ; CHECK-NEXT:    [[TMP2:%.*]] = and <3 x i32> [[TMP1]], [[ARG:%.*]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne <3 x i32> [[TMP2]], zeroinitializer
 ; CHECK-NEXT:    [[TMP5:%.*]] = zext <3 x i1> [[TMP3]] to <3 x i32>
@@ -233,7 +232,6 @@ define <3 x i32> @f_var0_vec_undef(<3 x i32> %arg, <3 x i32> %arg1) {
   %tmp2 = icmp eq <3 x i32> %tmp, <i32 0, i32 undef, i32 0>
   %tmp3 = lshr <3 x i32> %arg, <i32 1, i32 undef, i32 1>
   %tmp4 = and <3 x i32> %tmp3, <i32 1, i32 undef, i32 1>
-  ; The second element of %tmp5 is poison because it is (undef ? poison : undef).
   %tmp5 = select <3 x i1> %tmp2, <3 x i32> %tmp4, <3 x i32> <i32 1, i32 undef, i32 1>
   ret <3 x i32> %tmp5
 }
diff --git a/llvm/test/Transforms/InstCombine/shift-add-inseltpoison.ll b/llvm/test/Transforms/InstCombine/shift-add-inseltpoison.ll
index 3232cdc49c0f..e968f13c40b0 100644
--- a/llvm/test/Transforms/InstCombine/shift-add-inseltpoison.ll
+++ b/llvm/test/Transforms/InstCombine/shift-add-inseltpoison.ll
@@ -40,7 +40,7 @@ define i32 @lshr_C1_add_A_C2_i32(i32 %A) {
 define <4 x i32> @shl_C1_add_A_C2_v4i32(<4 x i16> %A) {
 ; CHECK-LABEL: @shl_C1_add_A_C2_v4i32(
 ; CHECK-NEXT:    [[B:%.*]] = zext <4 x i16> [[A:%.*]] to <4 x i32>
-; CHECK-NEXT:    [[D:%.*]] = shl <4 x i32> <i32 6, i32 4, i32 poison, i32 -458752>, [[B]]
+; CHECK-NEXT:    [[D:%.*]] = shl <4 x i32> <i32 6, i32 4, i32 undef, i32 -458752>, [[B]]
 ; CHECK-NEXT:    ret <4 x i32> [[D]]
 ;
   %B = zext <4 x i16> %A to <4 x i32>
@@ -52,7 +52,7 @@ define <4 x i32> @shl_C1_add_A_C2_v4i32(<4 x i16> %A) {
 define <4 x i32> @ashr_C1_add_A_C2_v4i32(<4 x i32> %A) {
 ; CHECK-LABEL: @ashr_C1_add_A_C2_v4i32(
 ; CHECK-NEXT:    [[B:%.*]] = and <4 x i32> [[A:%.*]], <i32 0, i32 15, i32 255, i32 65535>
-; CHECK-NEXT:    [[D:%.*]] = ashr <4 x i32> <i32 6, i32 1, i32 poison, i32 -1>, [[B]]
+; CHECK-NEXT:    [[D:%.*]] = ashr <4 x i32> <i32 6, i32 1, i32 undef, i32 -1>, [[B]]
 ; CHECK-NEXT:    ret <4 x i32> [[D]]
 ;
   %B = and <4 x i32> %A, <i32 0, i32 15, i32 255, i32 65535>
@@ -64,7 +64,7 @@ define <4 x i32> @ashr_C1_add_A_C2_v4i32(<4 x i32> %A) {
 define <4 x i32> @lshr_C1_add_A_C2_v4i32(<4 x i32> %A) {
 ; CHECK-LABEL: @lshr_C1_add_A_C2_v4i32(
 ; CHECK-NEXT:    [[B:%.*]] = and <4 x i32> [[A:%.*]], <i32 0, i32 15, i32 255, i32 65535>
-; CHECK-NEXT:    [[D:%.*]] = lshr <4 x i32> <i32 6, i32 1, i32 poison, i32 65535>, [[B]]
+; CHECK-NEXT:    [[D:%.*]] = lshr <4 x i32> <i32 6, i32 1, i32 undef, i32 65535>, [[B]]
 ; CHECK-NEXT:    ret <4 x i32> [[D]]
 ;
   %B = and <4 x i32> %A, <i32 0, i32 15, i32 255, i32 65535>
@@ -78,7 +78,7 @@ define <4 x i32> @shl_C1_add_A_C2_v4i32_splat(i16 %I) {
 ; CHECK-NEXT:    [[A:%.*]] = zext i16 [[I:%.*]] to i32
 ; CHECK-NEXT:    [[B:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i32 0
 ; CHECK-NEXT:    [[C:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[E:%.*]] = shl <4 x i32> <i32 6, i32 4, i32 poison, i32 -458752>, [[C]]
+; CHECK-NEXT:    [[E:%.*]] = shl <4 x i32> <i32 6, i32 4, i32 undef, i32 -458752>, [[C]]
 ; CHECK-NEXT:    ret <4 x i32> [[E]]
 ;
   %A = zext i16 %I to i32
@@ -94,7 +94,7 @@ define <4 x i32> @ashr_C1_add_A_C2_v4i32_splat(i16 %I) {
 ; CHECK-NEXT:    [[A:%.*]] = zext i16 [[I:%.*]] to i32
 ; CHECK-NEXT:    [[B:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i32 0
 ; CHECK-NEXT:    [[C:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[E:%.*]] = ashr <4 x i32> <i32 6, i32 1, i32 poison, i32 -1>, [[C]]
+; CHECK-NEXT:    [[E:%.*]] = ashr <4 x i32> <i32 6, i32 1, i32 undef, i32 -1>, [[C]]
 ; CHECK-NEXT:    ret <4 x i32> [[E]]
 ;
   %A = zext i16 %I to i32
@@ -110,7 +110,7 @@ define <4 x i32> @lshr_C1_add_A_C2_v4i32_splat(i16 %I) {
 ; CHECK-NEXT:    [[A:%.*]] = zext i16 [[I:%.*]] to i32
 ; CHECK-NEXT:    [[B:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i32 0
 ; CHECK-NEXT:    [[C:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[E:%.*]] = lshr <4 x i32> <i32 6, i32 1, i32 poison, i32 65535>, [[C]]
+; CHECK-NEXT:    [[E:%.*]] = lshr <4 x i32> <i32 6, i32 1, i32 undef, i32 65535>, [[C]]
 ; CHECK-NEXT:    ret <4 x i32> [[E]]
 ;
   %A = zext i16 %I to i32
diff --git a/llvm/test/Transforms/InstCombine/shift-add.ll b/llvm/test/Transforms/InstCombine/shift-add.ll
index eea8b7a074d7..e227274f4930 100644
--- a/llvm/test/Transforms/InstCombine/shift-add.ll
+++ b/llvm/test/Transforms/InstCombine/shift-add.ll
@@ -40,7 +40,7 @@ define i32 @lshr_C1_add_A_C2_i32(i32 %A) {
 define <4 x i32> @shl_C1_add_A_C2_v4i32(<4 x i16> %A) {
 ; CHECK-LABEL: @shl_C1_add_A_C2_v4i32(
 ; CHECK-NEXT:    [[B:%.*]] = zext <4 x i16> [[A:%.*]] to <4 x i32>
-; CHECK-NEXT:    [[D:%.*]] = shl <4 x i32> <i32 6, i32 4, i32 poison, i32 -458752>, [[B]]
+; CHECK-NEXT:    [[D:%.*]] = shl <4 x i32> <i32 6, i32 4, i32 undef, i32 -458752>, [[B]]
 ; CHECK-NEXT:    ret <4 x i32> [[D]]
 ;
   %B = zext <4 x i16> %A to <4 x i32>
@@ -52,7 +52,7 @@ define <4 x i32> @shl_C1_add_A_C2_v4i32(<4 x i16> %A) {
 define <4 x i32> @ashr_C1_add_A_C2_v4i32(<4 x i32> %A) {
 ; CHECK-LABEL: @ashr_C1_add_A_C2_v4i32(
 ; CHECK-NEXT:    [[B:%.*]] = and <4 x i32> [[A:%.*]], <i32 0, i32 15, i32 255, i32 65535>
-; CHECK-NEXT:    [[D:%.*]] = ashr <4 x i32> <i32 6, i32 1, i32 poison, i32 -1>, [[B]]
+; CHECK-NEXT:    [[D:%.*]] = ashr <4 x i32> <i32 6, i32 1, i32 undef, i32 -1>, [[B]]
 ; CHECK-NEXT:    ret <4 x i32> [[D]]
 ;
   %B = and <4 x i32> %A, <i32 0, i32 15, i32 255, i32 65535>
@@ -64,7 +64,7 @@ define <4 x i32> @ashr_C1_add_A_C2_v4i32(<4 x i32> %A) {
 define <4 x i32> @lshr_C1_add_A_C2_v4i32(<4 x i32> %A) {
 ; CHECK-LABEL: @lshr_C1_add_A_C2_v4i32(
 ; CHECK-NEXT:    [[B:%.*]] = and <4 x i32> [[A:%.*]], <i32 0, i32 15, i32 255, i32 65535>
-; CHECK-NEXT:    [[D:%.*]] = lshr <4 x i32> <i32 6, i32 1, i32 poison, i32 65535>, [[B]]
+; CHECK-NEXT:    [[D:%.*]] = lshr <4 x i32> <i32 6, i32 1, i32 undef, i32 65535>, [[B]]
 ; CHECK-NEXT:    ret <4 x i32> [[D]]
 ;
   %B = and <4 x i32> %A, <i32 0, i32 15, i32 255, i32 65535>
@@ -78,7 +78,7 @@ define <4 x i32> @shl_C1_add_A_C2_v4i32_splat(i16 %I) {
 ; CHECK-NEXT:    [[A:%.*]] = zext i16 [[I:%.*]] to i32
 ; CHECK-NEXT:    [[B:%.*]] = insertelement <4 x i32> undef, i32 [[A]], i32 0
 ; CHECK-NEXT:    [[C:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> undef, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[E:%.*]] = shl <4 x i32> <i32 6, i32 4, i32 poison, i32 -458752>, [[C]]
+; CHECK-NEXT:    [[E:%.*]] = shl <4 x i32> <i32 6, i32 4, i32 undef, i32 -458752>, [[C]]
 ; CHECK-NEXT:    ret <4 x i32> [[E]]
 ;
   %A = zext i16 %I to i32
@@ -94,7 +94,7 @@ define <4 x i32> @ashr_C1_add_A_C2_v4i32_splat(i16 %I) {
 ; CHECK-NEXT:    [[A:%.*]] = zext i16 [[I:%.*]] to i32
 ; CHECK-NEXT:    [[B:%.*]] = insertelement <4 x i32> undef, i32 [[A]], i32 0
 ; CHECK-NEXT:    [[C:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> undef, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[E:%.*]] = ashr <4 x i32> <i32 6, i32 1, i32 poison, i32 -1>, [[C]]
+; CHECK-NEXT:    [[E:%.*]] = ashr <4 x i32> <i32 6, i32 1, i32 undef, i32 -1>, [[C]]
 ; CHECK-NEXT:    ret <4 x i32> [[E]]
 ;
   %A = zext i16 %I to i32
@@ -110,7 +110,7 @@ define <4 x i32> @lshr_C1_add_A_C2_v4i32_splat(i16 %I) {
 ; CHECK-NEXT:    [[A:%.*]] = zext i16 [[I:%.*]] to i32
 ; CHECK-NEXT:    [[B:%.*]] = insertelement <4 x i32> undef, i32 [[A]], i32 0
 ; CHECK-NEXT:    [[C:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> undef, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[E:%.*]] = lshr <4 x i32> <i32 6, i32 1, i32 poison, i32 65535>, [[C]]
+; CHECK-NEXT:    [[E:%.*]] = lshr <4 x i32> <i32 6, i32 1, i32 undef, i32 65535>, [[C]]
 ; CHECK-NEXT:    ret <4 x i32> [[E]]
 ;
   %A = zext i16 %I to i32
diff --git a/llvm/test/Transforms/InstSimplify/ConstProp/InsertElement-inseltpoison.ll b/llvm/test/Transforms/InstSimplify/ConstProp/InsertElement-inseltpoison.ll
index 54b862c8514a..197e7cc0ac75 100644
--- a/llvm/test/Transforms/InstSimplify/ConstProp/InsertElement-inseltpoison.ll
+++ b/llvm/test/Transforms/InstSimplify/ConstProp/InsertElement-inseltpoison.ll
@@ -25,7 +25,7 @@ define <4 x i64> @insertelement() {
 
 define <4 x i64> @insertelement_undef() {
 ; CHECK-LABEL: @insertelement_undef(
-; CHECK-NEXT:    ret <4 x i64> poison
+; CHECK-NEXT:    ret <4 x i64> undef
 ;
   %vec1 = insertelement <4 x i64> poison, i64 -1, i32 0
   %vec2 = insertelement <4 x i64> %vec1, i64 -2, i32 1
diff --git a/llvm/test/Transforms/InstSimplify/ConstProp/InsertElement.ll b/llvm/test/Transforms/InstSimplify/ConstProp/InsertElement.ll
index 127c1692b5b8..a9a27a5df01f 100644
--- a/llvm/test/Transforms/InstSimplify/ConstProp/InsertElement.ll
+++ b/llvm/test/Transforms/InstSimplify/ConstProp/InsertElement.ll
@@ -25,7 +25,7 @@ define <4 x i64> @insertelement() {
 
 define <4 x i64> @insertelement_undef() {
 ; CHECK-LABEL: @insertelement_undef(
-; CHECK-NEXT:    ret <4 x i64> poison
+; CHECK-NEXT:    ret <4 x i64> undef
 ;
   %vec1 = insertelement <4 x i64> undef, i64 -1, i32 0
   %vec2 = insertelement <4 x i64> %vec1, i64 -2, i32 1
diff --git a/llvm/test/Transforms/InstSimplify/ConstProp/cast.ll b/llvm/test/Transforms/InstSimplify/ConstProp/cast.ll
index 1136151f7157..adf5e4b68a1b 100644
--- a/llvm/test/Transforms/InstSimplify/ConstProp/cast.ll
+++ b/llvm/test/Transforms/InstSimplify/ConstProp/cast.ll
@@ -5,7 +5,7 @@
 
 define i8 @overflow_fptosi() {
 ; CHECK-LABEL: @overflow_fptosi(
-; CHECK-NEXT:    ret i8 poison
+; CHECK-NEXT:    ret i8 undef
 ;
   %i = fptosi double 1.56e+02 to i8
   ret i8 %i
@@ -13,7 +13,7 @@ define i8 @overflow_fptosi() {
 
 define i8 @overflow_fptoui() {
 ; CHECK-LABEL: @overflow_fptoui(
-; CHECK-NEXT:    ret i8 poison
+; CHECK-NEXT:    ret i8 undef
 ;
   %i = fptoui double 2.56e+02 to i8
   ret i8 %i
diff --git a/llvm/test/Transforms/InstSimplify/ConstProp/poison.ll b/llvm/test/Transforms/InstSimplify/ConstProp/poison.ll
index f3fe29ff57ba..ea34bb4699e6 100644
--- a/llvm/test/Transforms/InstSimplify/ConstProp/poison.ll
+++ b/llvm/test/Transforms/InstSimplify/ConstProp/poison.ll
@@ -104,14 +104,14 @@ define void @vec_aggr_ops() {
 
 define void @other_ops(i8 %x) {
 ; CHECK-LABEL: @other_ops(
-; CHECK-NEXT:    call void (...) @use(i1 poison, i1 poison, i8 poison, i8 poison, i8* poison, i8* poison)
+; CHECK-NEXT:    call void (...) @use(i1 poison, i1 poison, i8 poison, i8 poison, i8* poison)
 ; CHECK-NEXT:    ret void
 ;
   %i1 = icmp eq i8 poison, 1
   %i2 = fcmp oeq float poison, 1.0
   %i3 = select i1 poison, i8 1, i8 2
   %i4 = select i1 true, i8 poison, i8 %x
-  call void (...) @use(i1 %i1, i1 %i2, i8 %i3, i8 %i4, i8* getelementptr (i8, i8* poison, i64 1), i8* getelementptr inbounds (i8, i8* undef, i64 1))
+  call void (...) @use(i1 %i1, i1 %i2, i8 %i3, i8 %i4, i8* getelementptr (i8, i8* poison, i64 1))
   ret void
 }
 
diff --git a/llvm/test/Transforms/InstSimplify/ConstProp/shift.ll b/llvm/test/Transforms/InstSimplify/ConstProp/shift.ll
index a7a60e562117..3e64513533ff 100644
--- a/llvm/test/Transforms/InstSimplify/ConstProp/shift.ll
+++ b/llvm/test/Transforms/InstSimplify/ConstProp/shift.ll
@@ -3,15 +3,15 @@
 ; CHECK-LABEL: shift_undef_64
 define void @shift_undef_64(i64* %p) {
   %r1 = lshr i64 -1, 4294967296 ; 2^32
-  ; CHECK: store i64 poison
+  ; CHECK: store i64 undef
   store i64 %r1, i64* %p
 
   %r2 = ashr i64 -1, 4294967297 ; 2^32 + 1
-  ; CHECK: store i64 poison
+  ; CHECK: store i64 undef
   store i64 %r2, i64* %p
 
   %r3 = shl i64 -1, 4294967298 ; 2^32 + 2
-  ; CHECK: store i64 poison
+  ; CHECK: store i64 undef
   store i64 %r3, i64* %p
 
   ret void
@@ -20,15 +20,15 @@ define void @shift_undef_64(i64* %p) {
 ; CHECK-LABEL: shift_undef_65
 define void @shift_undef_65(i65* %p) {
   %r1 = lshr i65 2, 18446744073709551617
-  ; CHECK: store i65 poison
+  ; CHECK: store i65 undef
   store i65 %r1, i65* %p
 
   %r2 = ashr i65 4, 18446744073709551617
-  ; CHECK: store i65 poison
+  ; CHECK: store i65 undef
   store i65 %r2, i65* %p
 
   %r3 = shl i65 1, 18446744073709551617
-  ; CHECK: store i65 poison
+  ; CHECK: store i65 undef
   store i65 %r3, i65* %p
 
   ret void
@@ -37,15 +37,15 @@ define void @shift_undef_65(i65* %p) {
 ; CHECK-LABEL: shift_undef_256
 define void @shift_undef_256(i256* %p) {
   %r1 = lshr i256 2, 18446744073709551617
-  ; CHECK: store i256 poison
+  ; CHECK: store i256 undef
   store i256 %r1, i256* %p
 
   %r2 = ashr i256 4, 18446744073709551618
-  ; CHECK: store i256 poison
+  ; CHECK: store i256 undef
   store i256 %r2, i256* %p
 
   %r3 = shl i256 1, 18446744073709551619
-  ; CHECK: store i256 poison
+  ; CHECK: store i256 undef
   store i256 %r3, i256* %p
 
   ret void
@@ -54,15 +54,15 @@ define void @shift_undef_256(i256* %p) {
 ; CHECK-LABEL: shift_undef_511
 define void @shift_undef_511(i511* %p) {
   %r1 = lshr i511 -1, 1208925819614629174706276 ; 2^80 + 100
-  ; CHECK: store i511 poison
+  ; CHECK: store i511 undef
   store i511 %r1, i511* %p
 
   %r2 = ashr i511 -2, 1208925819614629174706200
-  ; CHECK: store i511 poison
+  ; CHECK: store i511 undef
   store i511 %r2, i511* %p
 
   %r3 = shl i511 -3, 1208925819614629174706180
-  ; CHECK: store i511 poison
+  ; CHECK: store i511 undef
   store i511 %r3, i511* %p
 
   ret void
diff --git a/llvm/test/Transforms/InstSimplify/ConstProp/vector-undef-elts-inseltpoison.ll b/llvm/test/Transforms/InstSimplify/ConstProp/vector-undef-elts-inseltpoison.ll
index 6ce03dd2e0f0..2762291d7954 100644
--- a/llvm/test/Transforms/InstSimplify/ConstProp/vector-undef-elts-inseltpoison.ll
+++ b/llvm/test/Transforms/InstSimplify/ConstProp/vector-undef-elts-inseltpoison.ll
@@ -5,7 +5,7 @@
 
 define <3 x i8> @shl() {
 ; CHECK-LABEL: @shl(
-; CHECK-NEXT:    ret <3 x i8> <i8 poison, i8 0, i8 0>
+; CHECK-NEXT:    ret <3 x i8> <i8 undef, i8 0, i8 0>
 ;
   %c = shl <3 x i8> undef, <i8 undef, i8 4, i8 1>
   ret <3 x i8> %c
diff --git a/llvm/test/Transforms/InstSimplify/ConstProp/vector-undef-elts.ll b/llvm/test/Transforms/InstSimplify/ConstProp/vector-undef-elts.ll
index 99cc2527d12e..5d0f484bc3fd 100644
--- a/llvm/test/Transforms/InstSimplify/ConstProp/vector-undef-elts.ll
+++ b/llvm/test/Transforms/InstSimplify/ConstProp/vector-undef-elts.ll
@@ -5,7 +5,7 @@
 
 define <3 x i8> @shl() {
 ; CHECK-LABEL: @shl(
-; CHECK-NEXT:    ret <3 x i8> <i8 poison, i8 0, i8 0>
+; CHECK-NEXT:    ret <3 x i8> <i8 undef, i8 0, i8 0>
 ;
   %c = shl <3 x i8> undef, <i8 undef, i8 4, i8 1>
   ret <3 x i8> %c
diff --git a/llvm/test/Transforms/InstSimplify/ConstProp/vscale-inseltpoison.ll b/llvm/test/Transforms/InstSimplify/ConstProp/vscale-inseltpoison.ll
index 9689887be69b..ee19e617748b 100644
--- a/llvm/test/Transforms/InstSimplify/ConstProp/vscale-inseltpoison.ll
+++ b/llvm/test/Transforms/InstSimplify/ConstProp/vscale-inseltpoison.ll
@@ -75,7 +75,7 @@ define <vscale x 4 x float> @fmul() {
 
 define <vscale x 4 x i32> @udiv() {
 ; CHECK-LABEL: @udiv(
-; CHECK-NEXT:    ret <vscale x 4 x i32> poison
+; CHECK-NEXT:    ret <vscale x 4 x i32> undef
 ;
   %r = udiv <vscale x 4 x i32> undef, undef
   ret <vscale x 4 x i32> %r
@@ -83,7 +83,7 @@ define <vscale x 4 x i32> @udiv() {
 
 define <vscale x 4 x i32> @udiv_splat_zero() {
 ; CHECK-LABEL: @udiv_splat_zero(
-; CHECK-NEXT:    ret <vscale x 4 x i32> poison
+; CHECK-NEXT:    ret <vscale x 4 x i32> undef
 ;
   %r = udiv <vscale x 4 x i32> zeroinitializer, zeroinitializer
   ret <vscale x 4 x i32> %r
@@ -91,7 +91,7 @@ define <vscale x 4 x i32> @udiv_splat_zero() {
 
 define <vscale x 4 x i32> @sdiv() {
 ; CHECK-LABEL: @sdiv(
-; CHECK-NEXT:    ret <vscale x 4 x i32> poison
+; CHECK-NEXT:    ret <vscale x 4 x i32> undef
 ;
   %r = sdiv <vscale x 4 x i32> undef, undef
   ret <vscale x 4 x i32> %r
@@ -107,7 +107,7 @@ define <vscale x 4 x float> @fdiv() {
 
 define <vscale x 4 x i32> @urem() {
 ; CHECK-LABEL: @urem(
-; CHECK-NEXT:    ret <vscale x 4 x i32> poison
+; CHECK-NEXT:    ret <vscale x 4 x i32> undef
 ;
   %r = urem <vscale x 4 x i32> undef, undef
   ret <vscale x 4 x i32> %r
@@ -115,7 +115,7 @@ define <vscale x 4 x i32> @urem() {
 
 define <vscale x 4 x i32> @srem() {
 ; CHECK-LABEL: @srem(
-; CHECK-NEXT:    ret <vscale x 4 x i32> poison
+; CHECK-NEXT:    ret <vscale x 4 x i32> undef
 ;
   %r = srem <vscale x 4 x i32> undef, undef
   ret <vscale x 4 x i32> %r
@@ -135,7 +135,7 @@ define <vscale x 4 x float> @frem() {
 
 define <vscale x 4 x i32> @shl() {
 ; CHECK-LABEL: @shl(
-; CHECK-NEXT:    ret <vscale x 4 x i32> poison
+; CHECK-NEXT:    ret <vscale x 4 x i32> undef
 ;
   %r = shl <vscale x 4 x i32> undef, undef
   ret <vscale x 4 x i32> %r
@@ -143,7 +143,7 @@ define <vscale x 4 x i32> @shl() {
 
 define <vscale x 4 x i32> @lshr() {
 ; CHECK-LABEL: @lshr(
-; CHECK-NEXT:    ret <vscale x 4 x i32> poison
+; CHECK-NEXT:    ret <vscale x 4 x i32> undef
 ;
   %r = lshr <vscale x 4 x i32> undef, undef
   ret <vscale x 4 x i32> %r
@@ -151,7 +151,7 @@ define <vscale x 4 x i32> @lshr() {
 
 define <vscale x 4 x i32> @ashr() {
 ; CHECK-LABEL: @ashr(
-; CHECK-NEXT:    ret <vscale x 4 x i32> poison
+; CHECK-NEXT:    ret <vscale x 4 x i32> undef
 ;
   %r = ashr <vscale x 4 x i32> undef, undef
   ret <vscale x 4 x i32> %r
diff --git a/llvm/test/Transforms/InstSimplify/ConstProp/vscale.ll b/llvm/test/Transforms/InstSimplify/ConstProp/vscale.ll
index 048e8840ffd8..66e4c93e1968 100644
--- a/llvm/test/Transforms/InstSimplify/ConstProp/vscale.ll
+++ b/llvm/test/Transforms/InstSimplify/ConstProp/vscale.ll
@@ -75,7 +75,7 @@ define <vscale x 4 x float> @fmul() {
 
 define <vscale x 4 x i32> @udiv() {
 ; CHECK-LABEL: @udiv(
-; CHECK-NEXT:    ret <vscale x 4 x i32> poison
+; CHECK-NEXT:    ret <vscale x 4 x i32> undef
 ;
   %r = udiv <vscale x 4 x i32> undef, undef
   ret <vscale x 4 x i32> %r
@@ -83,7 +83,7 @@ define <vscale x 4 x i32> @udiv() {
 
 define <vscale x 4 x i32> @udiv_splat_zero() {
 ; CHECK-LABEL: @udiv_splat_zero(
-; CHECK-NEXT:    ret <vscale x 4 x i32> poison
+; CHECK-NEXT:    ret <vscale x 4 x i32> undef
 ;
   %r = udiv <vscale x 4 x i32> zeroinitializer, zeroinitializer
   ret <vscale x 4 x i32> %r
@@ -91,7 +91,7 @@ define <vscale x 4 x i32> @udiv_splat_zero() {
 
 define <vscale x 4 x i32> @sdiv() {
 ; CHECK-LABEL: @sdiv(
-; CHECK-NEXT:    ret <vscale x 4 x i32> poison
+; CHECK-NEXT:    ret <vscale x 4 x i32> undef
 ;
   %r = sdiv <vscale x 4 x i32> undef, undef
   ret <vscale x 4 x i32> %r
@@ -107,7 +107,7 @@ define <vscale x 4 x float> @fdiv() {
 
 define <vscale x 4 x i32> @urem() {
 ; CHECK-LABEL: @urem(
-; CHECK-NEXT:    ret <vscale x 4 x i32> poison
+; CHECK-NEXT:    ret <vscale x 4 x i32> undef
 ;
   %r = urem <vscale x 4 x i32> undef, undef
   ret <vscale x 4 x i32> %r
@@ -115,7 +115,7 @@ define <vscale x 4 x i32> @urem() {
 
 define <vscale x 4 x i32> @srem() {
 ; CHECK-LABEL: @srem(
-; CHECK-NEXT:    ret <vscale x 4 x i32> poison
+; CHECK-NEXT:    ret <vscale x 4 x i32> undef
 ;
   %r = srem <vscale x 4 x i32> undef, undef
   ret <vscale x 4 x i32> %r
@@ -135,7 +135,7 @@ define <vscale x 4 x float> @frem() {
 
 define <vscale x 4 x i32> @shl() {
 ; CHECK-LABEL: @shl(
-; CHECK-NEXT:    ret <vscale x 4 x i32> poison
+; CHECK-NEXT:    ret <vscale x 4 x i32> undef
 ;
   %r = shl <vscale x 4 x i32> undef, undef
   ret <vscale x 4 x i32> %r
@@ -143,7 +143,7 @@ define <vscale x 4 x i32> @shl() {
 
 define <vscale x 4 x i32> @lshr() {
 ; CHECK-LABEL: @lshr(
-; CHECK-NEXT:    ret <vscale x 4 x i32> poison
+; CHECK-NEXT:    ret <vscale x 4 x i32> undef
 ;
   %r = lshr <vscale x 4 x i32> undef, undef
   ret <vscale x 4 x i32> %r
@@ -151,7 +151,7 @@ define <vscale x 4 x i32> @lshr() {
 
 define <vscale x 4 x i32> @ashr() {
 ; CHECK-LABEL: @ashr(
-; CHECK-NEXT:    ret <vscale x 4 x i32> poison
+; CHECK-NEXT:    ret <vscale x 4 x i32> undef
 ;
   %r = ashr <vscale x 4 x i32> undef, undef
   ret <vscale x 4 x i32> %r
diff --git a/llvm/test/Transforms/InstSimplify/div.ll b/llvm/test/Transforms/InstSimplify/div.ll
index 7c8efc27d3aa..5a3e6e8f7daa 100644
--- a/llvm/test/Transforms/InstSimplify/div.ll
+++ b/llvm/test/Transforms/InstSimplify/div.ll
@@ -25,11 +25,11 @@ define <2 x i32> @zero_dividend_vector_undef_elt(<2 x i32> %A) {
   ret <2 x i32> %B
 }
 
-; Division-by-zero is poison. UB in any vector lane means the whole op is poison.
+; Division-by-zero is undef. UB in any vector lane means the whole op is undef.
 
 define <2 x i8> @sdiv_zero_elt_vec_constfold(<2 x i8> %x) {
 ; CHECK-LABEL: @sdiv_zero_elt_vec_constfold(
-; CHECK-NEXT:    ret <2 x i8> poison
+; CHECK-NEXT:    ret <2 x i8> undef
 ;
   %div = sdiv <2 x i8> <i8 1, i8 2>, <i8 0, i8 -42>
   ret <2 x i8> %div
@@ -37,7 +37,7 @@ define <2 x i8> @sdiv_zero_elt_vec_constfold(<2 x i8> %x) {
 
 define <2 x i8> @udiv_zero_elt_vec_constfold(<2 x i8> %x) {
 ; CHECK-LABEL: @udiv_zero_elt_vec_constfold(
-; CHECK-NEXT:    ret <2 x i8> poison
+; CHECK-NEXT:    ret <2 x i8> undef
 ;
   %div = udiv <2 x i8> <i8 1, i8 2>, <i8 42, i8 0>
   ret <2 x i8> %div
@@ -193,37 +193,4 @@ define i32 @div1() {
   ret i32 %urem
 }
 
-define i8 @sdiv_minusone_divisor() {
-; CHECK-LABEL: @sdiv_minusone_divisor(
-; CHECK-NEXT:    ret i8 poison
-;
-  %v = sdiv i8 -128, -1
-  ret i8 %v
-}
-
-define i32 @poison(i32 %x) {
-; CHECK-LABEL: @poison(
-; CHECK-NEXT:    ret i32 poison
-;
-  %v = udiv i32 %x, poison
-  ret i32 %v
-}
-
-; TODO: this should be poison
-define i32 @poison2(i32 %x) {
-; CHECK-LABEL: @poison2(
-; CHECK-NEXT:    ret i32 0
-;
-  %v = udiv i32 poison, %x
-  ret i32 %v
-}
-
-define <2 x i32> @poison3(<2 x i32> %x) {
-; CHECK-LABEL: @poison3(
-; CHECK-NEXT:    ret <2 x i32> poison
-;
-  %v = udiv <2 x i32> %x, <i32 poison, i32 1>
-  ret <2 x i32> %v
-}
-
 !0 = !{i32 0, i32 3}
diff --git a/llvm/test/Transforms/InstSimplify/rem.ll b/llvm/test/Transforms/InstSimplify/rem.ll
index 6aaeb5c70d00..6ccb6474ce44 100644
--- a/llvm/test/Transforms/InstSimplify/rem.ll
+++ b/llvm/test/Transforms/InstSimplify/rem.ll
@@ -25,11 +25,11 @@ define <2 x i32> @zero_dividend_vector_undef_elt(<2 x i32> %A) {
   ret <2 x i32> %B
 }
 
-; Division-by-zero is poison. UB in any vector lane means the whole op is poison.
+; Division-by-zero is undef. UB in any vector lane means the whole op is undef.
 
 define <2 x i8> @srem_zero_elt_vec_constfold(<2 x i8> %x) {
 ; CHECK-LABEL: @srem_zero_elt_vec_constfold(
-; CHECK-NEXT:    ret <2 x i8> poison
+; CHECK-NEXT:    ret <2 x i8> undef
 ;
   %rem = srem <2 x i8> <i8 1, i8 2>, <i8 0, i8 -42>
   ret <2 x i8> %rem
@@ -37,7 +37,7 @@ define <2 x i8> @srem_zero_elt_vec_constfold(<2 x i8> %x) {
 
 define <2 x i8> @urem_zero_elt_vec_constfold(<2 x i8> %x) {
 ; CHECK-LABEL: @urem_zero_elt_vec_constfold(
-; CHECK-NEXT:    ret <2 x i8> poison
+; CHECK-NEXT:    ret <2 x i8> undef
 ;
   %rem = urem <2 x i8> <i8 1, i8 2>, <i8 42, i8 0>
   ret <2 x i8> %rem
@@ -325,28 +325,3 @@ define <2 x i32> @srem_with_sext_bool_divisor_vec(<2 x i1> %x, <2 x i32> %y) {
   ret <2 x i32> %r
 }
 
-define i8 @srem_minusone_divisor() {
-; CHECK-LABEL: @srem_minusone_divisor(
-; CHECK-NEXT:    ret i8 poison
-;
-  %v = srem i8 -128, -1
-  ret i8 %v
-}
-
-define i32 @poison(i32 %x) {
-; CHECK-LABEL: @poison(
-; CHECK-NEXT:    ret i32 poison
-;
-  %v = urem i32 %x, poison
-  ret i32 %v
-}
-
-; TODO: this should be poison
-
-define i32 @poison2(i32 %x) {
-; CHECK-LABEL: @poison2(
-; CHECK-NEXT:    ret i32 0
-;
-  %v = urem i32 poison, %x
-  ret i32 %v
-}
diff --git a/llvm/test/Transforms/InstSimplify/undef.ll b/llvm/test/Transforms/InstSimplify/undef.ll
index d09dc43da091..fe1f412d3d37 100644
--- a/llvm/test/Transforms/InstSimplify/undef.ll
+++ b/llvm/test/Transforms/InstSimplify/undef.ll
@@ -1,9 +1,8 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -instsimplify -S < %s | FileCheck %s
 
 define i64 @test0() {
 ; CHECK-LABEL: @test0(
-; CHECK-NEXT:    ret i64 undef
+; CHECK:         ret i64 undef
 ;
   %r = mul i64 undef, undef
   ret i64 %r
@@ -11,7 +10,7 @@ define i64 @test0() {
 
 define i64 @test1() {
 ; CHECK-LABEL: @test1(
-; CHECK-NEXT:    ret i64 undef
+; CHECK:         ret i64 undef
 ;
   %r = mul i64 3, undef
   ret i64 %r
@@ -19,7 +18,7 @@ define i64 @test1() {
 
 define i64 @test2() {
 ; CHECK-LABEL: @test2(
-; CHECK-NEXT:    ret i64 undef
+; CHECK:         ret i64 undef
 ;
   %r = mul i64 undef, 3
   ret i64 %r
@@ -27,7 +26,7 @@ define i64 @test2() {
 
 define i64 @test3() {
 ; CHECK-LABEL: @test3(
-; CHECK-NEXT:    ret i64 0
+; CHECK:         ret i64 0
 ;
   %r = mul i64 undef, 6
   ret i64 %r
@@ -35,7 +34,7 @@ define i64 @test3() {
 
 define i64 @test4() {
 ; CHECK-LABEL: @test4(
-; CHECK-NEXT:    ret i64 0
+; CHECK:         ret i64 0
 ;
   %r = mul i64 6, undef
   ret i64 %r
@@ -43,7 +42,7 @@ define i64 @test4() {
 
 define i64 @test5() {
 ; CHECK-LABEL: @test5(
-; CHECK-NEXT:    ret i64 undef
+; CHECK:         ret i64 undef
 ;
   %r = and i64 undef, undef
   ret i64 %r
@@ -51,7 +50,7 @@ define i64 @test5() {
 
 define i64 @test6() {
 ; CHECK-LABEL: @test6(
-; CHECK-NEXT:    ret i64 undef
+; CHECK:         ret i64 undef
 ;
   %r = or i64 undef, undef
   ret i64 %r
@@ -59,7 +58,7 @@ define i64 @test6() {
 
 define i64 @test7() {
 ; CHECK-LABEL: @test7(
-; CHECK-NEXT:    ret i64 undef
+; CHECK:         ret i64 undef
 ;
   %r = udiv i64 undef, 1
   ret i64 %r
@@ -67,7 +66,7 @@ define i64 @test7() {
 
 define i64 @test8() {
 ; CHECK-LABEL: @test8(
-; CHECK-NEXT:    ret i64 undef
+; CHECK:         ret i64 undef
 ;
   %r = sdiv i64 undef, 1
   ret i64 %r
@@ -75,7 +74,7 @@ define i64 @test8() {
 
 define i64 @test9() {
 ; CHECK-LABEL: @test9(
-; CHECK-NEXT:    ret i64 0
+; CHECK:         ret i64 0
 ;
   %r = urem i64 undef, 1
   ret i64 %r
@@ -83,7 +82,7 @@ define i64 @test9() {
 
 define i64 @test10() {
 ; CHECK-LABEL: @test10(
-; CHECK-NEXT:    ret i64 0
+; CHECK:         ret i64 0
 ;
   %r = srem i64 undef, 1
   ret i64 %r
@@ -91,7 +90,7 @@ define i64 @test10() {
 
 define i64 @test11() {
 ; CHECK-LABEL: @test11(
-; CHECK-NEXT:    ret i64 poison
+; CHECK:         ret i64 undef
 ;
   %r = shl i64 undef, undef
   ret i64 %r
@@ -99,7 +98,7 @@ define i64 @test11() {
 
 define i64 @test11b(i64 %a) {
 ; CHECK-LABEL: @test11b(
-; CHECK-NEXT:    ret i64 poison
+; CHECK:         ret i64 poison
 ;
   %r = shl i64 %a, undef
   ret i64 %r
@@ -107,7 +106,7 @@ define i64 @test11b(i64 %a) {
 
 define i64 @test12() {
 ; CHECK-LABEL: @test12(
-; CHECK-NEXT:    ret i64 poison
+; CHECK:         ret i64 undef
 ;
   %r = ashr i64 undef, undef
   ret i64 %r
@@ -115,7 +114,7 @@ define i64 @test12() {
 
 define i64 @test12b(i64 %a) {
 ; CHECK-LABEL: @test12b(
-; CHECK-NEXT:    ret i64 poison
+; CHECK:         ret i64 poison
 ;
   %r = ashr i64 %a, undef
   ret i64 %r
@@ -123,7 +122,7 @@ define i64 @test12b(i64 %a) {
 
 define i64 @test13() {
 ; CHECK-LABEL: @test13(
-; CHECK-NEXT:    ret i64 poison
+; CHECK:         ret i64 undef
 ;
   %r = lshr i64 undef, undef
   ret i64 %r
@@ -131,7 +130,7 @@ define i64 @test13() {
 
 define i64 @test13b(i64 %a) {
 ; CHECK-LABEL: @test13b(
-; CHECK-NEXT:    ret i64 poison
+; CHECK:         ret i64 poison
 ;
   %r = lshr i64 %a, undef
   ret i64 %r
@@ -139,7 +138,7 @@ define i64 @test13b(i64 %a) {
 
 define i1 @test14() {
 ; CHECK-LABEL: @test14(
-; CHECK-NEXT:    ret i1 undef
+; CHECK:         ret i1 undef
 ;
   %r = icmp slt i64 undef, undef
   ret i1 %r
@@ -147,7 +146,7 @@ define i1 @test14() {
 
 define i1 @test15() {
 ; CHECK-LABEL: @test15(
-; CHECK-NEXT:    ret i1 undef
+; CHECK:         ret i1 undef
 ;
   %r = icmp ult i64 undef, undef
   ret i1 %r
@@ -155,7 +154,7 @@ define i1 @test15() {
 
 define i64 @test16(i64 %a) {
 ; CHECK-LABEL: @test16(
-; CHECK-NEXT:    ret i64 undef
+; CHECK:         ret i64 undef
 ;
   %r = select i1 undef, i64 %a, i64 undef
   ret i64 %r
@@ -163,7 +162,7 @@ define i64 @test16(i64 %a) {
 
 define i64 @test17(i64 %a) {
 ; CHECK-LABEL: @test17(
-; CHECK-NEXT:    ret i64 undef
+; CHECK:         ret i64 undef
 ;
   %r = select i1 undef, i64 undef, i64 %a
   ret i64 %r
@@ -171,7 +170,7 @@ define i64 @test17(i64 %a) {
 
 define i64 @test18(i64 %a) {
 ; CHECK-LABEL: @test18(
-; CHECK-NEXT:    [[R:%.*]] = call i64 undef(i64 [[A:%.*]])
+; CHECK:         [[R:%.*]] = call i64 undef(i64 %a)
 ; CHECK-NEXT:    ret i64 poison
 ;
   %r = call i64 (i64) undef(i64 %a)
@@ -180,7 +179,7 @@ define i64 @test18(i64 %a) {
 
 define <4 x i8> @test19(<4 x i8> %a) {
 ; CHECK-LABEL: @test19(
-; CHECK-NEXT:    ret <4 x i8> poison
+; CHECK:         ret <4 x i8> poison
 ;
   %b = shl <4 x i8> %a, <i8 8, i8 9, i8 undef, i8 -1>
   ret <4 x i8> %b
@@ -188,7 +187,7 @@ define <4 x i8> @test19(<4 x i8> %a) {
 
 define i32 @test20(i32 %a) {
 ; CHECK-LABEL: @test20(
-; CHECK-NEXT:    ret i32 poison
+; CHECK:         ret i32 poison
 ;
   %b = udiv i32 %a, 0
   ret i32 %b
@@ -204,7 +203,7 @@ define <2 x i32> @test20vec(<2 x i32> %a) {
 
 define i32 @test21(i32 %a) {
 ; CHECK-LABEL: @test21(
-; CHECK-NEXT:    ret i32 poison
+; CHECK:         ret i32 poison
 ;
   %b = sdiv i32 %a, 0
   ret i32 %b
@@ -220,7 +219,7 @@ define <2 x i32> @test21vec(<2 x i32> %a) {
 
 define i32 @test22(i32 %a) {
 ; CHECK-LABEL: @test22(
-; CHECK-NEXT:    ret i32 undef
+; CHECK:         ret i32 undef
 ;
   %b = ashr exact i32 undef, %a
   ret i32 %b
@@ -228,7 +227,7 @@ define i32 @test22(i32 %a) {
 
 define i32 @test23(i32 %a) {
 ; CHECK-LABEL: @test23(
-; CHECK-NEXT:    ret i32 undef
+; CHECK:         ret i32 undef
 ;
   %b = lshr exact i32 undef, %a
   ret i32 %b
@@ -236,7 +235,7 @@ define i32 @test23(i32 %a) {
 
 define i32 @test24() {
 ; CHECK-LABEL: @test24(
-; CHECK-NEXT:    ret i32 poison
+; CHECK:         ret i32 undef
 ;
   %b = udiv i32 undef, 0
   ret i32 %b
@@ -244,7 +243,7 @@ define i32 @test24() {
 
 define i32 @test25() {
 ; CHECK-LABEL: @test25(
-; CHECK-NEXT:    ret i32 poison
+; CHECK:         ret i32 undef
 ;
   %b = lshr i32 0, undef
   ret i32 %b
@@ -252,7 +251,7 @@ define i32 @test25() {
 
 define i32 @test26() {
 ; CHECK-LABEL: @test26(
-; CHECK-NEXT:    ret i32 poison
+; CHECK:         ret i32 undef
 ;
   %b = ashr i32 0, undef
   ret i32 %b
@@ -260,7 +259,7 @@ define i32 @test26() {
 
 define i32 @test27() {
 ; CHECK-LABEL: @test27(
-; CHECK-NEXT:    ret i32 poison
+; CHECK:         ret i32 undef
 ;
   %b = shl i32 0, undef
   ret i32 %b
@@ -268,7 +267,7 @@ define i32 @test27() {
 
 define i32 @test28(i32 %a) {
 ; CHECK-LABEL: @test28(
-; CHECK-NEXT:    ret i32 undef
+; CHECK:         ret i32 undef
 ;
   %b = shl nsw i32 undef, %a
   ret i32 %b
@@ -276,7 +275,7 @@ define i32 @test28(i32 %a) {
 
 define i32 @test29(i32 %a) {
 ; CHECK-LABEL: @test29(
-; CHECK-NEXT:    ret i32 undef
+; CHECK:         ret i32 undef
 ;
   %b = shl nuw i32 undef, %a
   ret i32 %b
@@ -284,7 +283,7 @@ define i32 @test29(i32 %a) {
 
 define i32 @test30(i32 %a) {
 ; CHECK-LABEL: @test30(
-; CHECK-NEXT:    ret i32 undef
+; CHECK:         ret i32 undef
 ;
   %b = shl nsw nuw i32 undef, %a
   ret i32 %b
@@ -292,7 +291,7 @@ define i32 @test30(i32 %a) {
 
 define i32 @test31(i32 %a) {
 ; CHECK-LABEL: @test31(
-; CHECK-NEXT:    ret i32 0
+; CHECK:         ret i32 0
 ;
   %b = shl i32 undef, %a
   ret i32 %b
@@ -300,7 +299,7 @@ define i32 @test31(i32 %a) {
 
 define i32 @test32(i32 %a) {
 ; CHECK-LABEL: @test32(
-; CHECK-NEXT:    ret i32 undef
+; CHECK:         ret i32 undef
 ;
   %b = shl i32 undef, 0
   ret i32 %b
@@ -308,7 +307,7 @@ define i32 @test32(i32 %a) {
 
 define i32 @test33(i32 %a) {
 ; CHECK-LABEL: @test33(
-; CHECK-NEXT:    ret i32 undef
+; CHECK:         ret i32 undef
 ;
   %b = ashr i32 undef, 0
   ret i32 %b
@@ -316,7 +315,7 @@ define i32 @test33(i32 %a) {
 
 define i32 @test34(i32 %a) {
 ; CHECK-LABEL: @test34(
-; CHECK-NEXT:    ret i32 undef
+; CHECK:         ret i32 undef
 ;
   %b = lshr i32 undef, 0
   ret i32 %b
@@ -324,7 +323,7 @@ define i32 @test34(i32 %a) {
 
 define i32 @test35(<4 x i32> %V) {
 ; CHECK-LABEL: @test35(
-; CHECK-NEXT:    ret i32 poison
+; CHECK:         ret i32 poison
 ;
   %b = extractelement <4 x i32> %V, i32 4
   ret i32 %b
@@ -332,7 +331,7 @@ define i32 @test35(<4 x i32> %V) {
 
 define i32 @test36(i32 %V) {
 ; CHECK-LABEL: @test36(
-; CHECK-NEXT:    ret i32 undef
+; CHECK:         ret i32 undef
 ;
   %b = extractelement <4 x i32> undef, i32 %V
   ret i32 %b
@@ -340,7 +339,7 @@ define i32 @test36(i32 %V) {
 
 define i32 @test37() {
 ; CHECK-LABEL: @test37(
-; CHECK-NEXT:    ret i32 poison
+; CHECK:         ret i32 undef
 ;
   %b = udiv i32 undef, undef
   ret i32 %b
@@ -348,7 +347,7 @@ define i32 @test37() {
 
 define i32 @test38(i32 %a) {
 ; CHECK-LABEL: @test38(
-; CHECK-NEXT:    ret i32 poison
+; CHECK:         ret i32 poison
 ;
   %b = udiv i32 %a, undef
   ret i32 %b
@@ -356,7 +355,7 @@ define i32 @test38(i32 %a) {
 
 define i32 @test39() {
 ; CHECK-LABEL: @test39(
-; CHECK-NEXT:    ret i32 poison
+; CHECK:         ret i32 undef
 ;
   %b = udiv i32 0, undef
   ret i32 %b
diff --git a/llvm/test/Transforms/SROA/phi-gep.ll b/llvm/test/Transforms/SROA/phi-gep.ll
index 915ae546beda..6bf2a7718658 100644
--- a/llvm/test/Transforms/SROA/phi-gep.ll
+++ b/llvm/test/Transforms/SROA/phi-gep.ll
@@ -348,7 +348,7 @@ define void @test_sroa_gep_phi_select_same_block() {
 ; CHECK-NEXT:    [[PHI:%.*]] = phi %pair* [ [[ALLOCA]], [[ENTRY:%.*]] ], [ [[SELECT:%.*]], [[WHILE_BODY]] ]
 ; CHECK-NEXT:    [[SELECT]] = select i1 undef, %pair* [[PHI]], %pair* undef
 ; CHECK-NEXT:    [[PHI_SROA_GEP:%.*]] = getelementptr inbounds [[PAIR]], %pair* [[PHI]], i64 1
-; CHECK-NEXT:    [[SELECT_SROA_SEL:%.*]] = select i1 undef, %pair* [[PHI_SROA_GEP]], %pair* poison
+; CHECK-NEXT:    [[SELECT_SROA_SEL:%.*]] = select i1 undef, %pair* [[PHI_SROA_GEP]], %pair* undef
 ; CHECK-NEXT:    br i1 undef, label [[EXIT:%.*]], label [[WHILE_BODY]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    unreachable
diff --git a/llvm/test/Transforms/SROA/select-gep.ll b/llvm/test/Transforms/SROA/select-gep.ll
index f69cfeb410bd..93cb3420d0af 100644
--- a/llvm/test/Transforms/SROA/select-gep.ll
+++ b/llvm/test/Transforms/SROA/select-gep.ll
@@ -83,7 +83,7 @@ define i32 @test_sroa_select_gep_undef(i1 %cond) {
 ; CHECK-LABEL: @test_sroa_select_gep_undef(
 ; CHECK-NEXT:  bb:
 ; CHECK-NEXT:    [[A_SROA_0:%.*]] = alloca i32, align 4
-; CHECK-NEXT:    [[SELECT_SROA_SEL:%.*]] = select i1 [[COND:%.*]], i32* [[A_SROA_0]], i32* poison
+; CHECK-NEXT:    [[SELECT_SROA_SEL:%.*]] = select i1 [[COND:%.*]], i32* [[A_SROA_0]], i32* undef
 ; CHECK-NEXT:    [[LOAD:%.*]] = load i32, i32* [[SELECT_SROA_SEL]], align 4
 ; CHECK-NEXT:    ret i32 [[LOAD]]
 ;
diff --git a/llvm/test/Transforms/VectorCombine/X86/insert-binop-inseltpoison.ll b/llvm/test/Transforms/VectorCombine/X86/insert-binop-inseltpoison.ll
index 8a6b1e98c968..b9d82e9f81df 100644
--- a/llvm/test/Transforms/VectorCombine/X86/insert-binop-inseltpoison.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/insert-binop-inseltpoison.ll
@@ -128,7 +128,7 @@ define <2 x i64> @ins1_ins1_sdiv(i64 %x, i64 %y) {
 define <2 x i64> @ins1_ins1_udiv(i64 %x, i64 %y) {
 ; CHECK-LABEL: @ins1_ins1_udiv(
 ; CHECK-NEXT:    [[R_SCALAR:%.*]] = udiv i64 [[X:%.*]], [[Y:%.*]]
-; CHECK-NEXT:    [[R:%.*]] = insertelement <2 x i64> <i64 6, i64 poison>, i64 [[R_SCALAR]], i64 1
+; CHECK-NEXT:    [[R:%.*]] = insertelement <2 x i64> <i64 6, i64 undef>, i64 [[R_SCALAR]], i64 1
 ; CHECK-NEXT:    ret <2 x i64> [[R]]
 ;
   %i0 = insertelement <2 x i64> <i64 42, i64 undef>, i64 %x, i32 1
@@ -143,7 +143,7 @@ define <2 x i64> @ins1_ins1_udiv(i64 %x, i64 %y) {
 define <2 x i64> @ins1_ins1_urem(i64 %x, i64 %y) {
 ; CHECK-LABEL: @ins1_ins1_urem(
 ; CHECK-NEXT:    [[R_SCALAR:%.*]] = urem i64 [[X:%.*]], [[Y:%.*]]
-; CHECK-NEXT:    [[R:%.*]] = insertelement <2 x i64> <i64 poison, i64 0>, i64 [[R_SCALAR]], i64 1
+; CHECK-NEXT:    [[R:%.*]] = insertelement <2 x i64> <i64 undef, i64 0>, i64 [[R_SCALAR]], i64 1
 ; CHECK-NEXT:    ret <2 x i64> [[R]]
 ;
   %i0 = insertelement <2 x i64> <i64 42, i64 undef>, i64 %x, i64 1
diff --git a/llvm/test/Transforms/VectorCombine/X86/insert-binop-with-constant.ll b/llvm/test/Transforms/VectorCombine/X86/insert-binop-with-constant.ll
index 0637b5005683..a400e8f42907 100644
--- a/llvm/test/Transforms/VectorCombine/X86/insert-binop-with-constant.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/insert-binop-with-constant.ll
@@ -131,7 +131,7 @@ define <16 x i8> @mul_constant_multiuse(i8 %a0, <16 x i8> %a1) {
 define <2 x i64> @shl_constant_op0(i64 %x) {
 ; CHECK-LABEL: @shl_constant_op0(
 ; CHECK-NEXT:    [[BO_SCALAR:%.*]] = shl i64 2, [[X:%.*]]
-; CHECK-NEXT:    [[BO:%.*]] = insertelement <2 x i64> poison, i64 [[BO_SCALAR]], i64 1
+; CHECK-NEXT:    [[BO:%.*]] = insertelement <2 x i64> undef, i64 [[BO_SCALAR]], i64 1
 ; CHECK-NEXT:    ret <2 x i64> [[BO]]
 ;
   %ins = insertelement <2 x i64> undef, i64 %x, i32 1
@@ -142,7 +142,7 @@ define <2 x i64> @shl_constant_op0(i64 %x) {
 define <2 x i64> @shl_constant_op0_not_undef_lane(i64 %x) {
 ; CHECK-LABEL: @shl_constant_op0_not_undef_lane(
 ; CHECK-NEXT:    [[BO_SCALAR:%.*]] = shl i64 2, [[X:%.*]]
-; CHECK-NEXT:    [[BO:%.*]] = insertelement <2 x i64> poison, i64 [[BO_SCALAR]], i64 1
+; CHECK-NEXT:    [[BO:%.*]] = insertelement <2 x i64> undef, i64 [[BO_SCALAR]], i64 1
 ; CHECK-NEXT:    ret <2 x i64> [[BO]]
 ;
   %ins = insertelement <2 x i64> undef, i64 %x, i32 1
@@ -189,7 +189,7 @@ define <4 x i32> @shl_constant_op0_multiuse(i32 %a0, <4 x i32> %a1) {
 define <2 x i64> @shl_constant_op1(i64 %x) {
 ; CHECK-LABEL: @shl_constant_op1(
 ; CHECK-NEXT:    [[BO_SCALAR:%.*]] = shl nuw i64 [[X:%.*]], 5
-; CHECK-NEXT:    [[BO:%.*]] = insertelement <2 x i64> <i64 0, i64 poison>, i64 [[BO_SCALAR]], i64 0
+; CHECK-NEXT:    [[BO:%.*]] = insertelement <2 x i64> <i64 0, i64 undef>, i64 [[BO_SCALAR]], i64 0
 ; CHECK-NEXT:    ret <2 x i64> [[BO]]
 ;
   %ins = insertelement <2 x i64> undef, i64 %x, i32 0
@@ -224,7 +224,7 @@ define <2 x i64> @shl_constant_op1_load(i64* %p) {
 define <2 x i64> @ashr_constant_op0(i64 %x) {
 ; CHECK-LABEL: @ashr_constant_op0(
 ; CHECK-NEXT:    [[BO_SCALAR:%.*]] = ashr exact i64 2, [[X:%.*]]
-; CHECK-NEXT:    [[BO:%.*]] = insertelement <2 x i64> poison, i64 [[BO_SCALAR]], i64 1
+; CHECK-NEXT:    [[BO:%.*]] = insertelement <2 x i64> undef, i64 [[BO_SCALAR]], i64 1
 ; CHECK-NEXT:    ret <2 x i64> [[BO]]
 ;
   %ins = insertelement <2 x i64> undef, i64 %x, i32 1
@@ -235,7 +235,7 @@ define <2 x i64> @ashr_constant_op0(i64 %x) {
 define <2 x i64> @ashr_constant_op0_not_undef_lane(i64 %x) {
 ; CHECK-LABEL: @ashr_constant_op0_not_undef_lane(
 ; CHECK-NEXT:    [[BO_SCALAR:%.*]] = ashr exact i64 2, [[X:%.*]]
-; CHECK-NEXT:    [[BO:%.*]] = insertelement <2 x i64> poison, i64 [[BO_SCALAR]], i64 1
+; CHECK-NEXT:    [[BO:%.*]] = insertelement <2 x i64> undef, i64 [[BO_SCALAR]], i64 1
 ; CHECK-NEXT:    ret <2 x i64> [[BO]]
 ;
   %ins = insertelement <2 x i64> undef, i64 %x, i32 1
@@ -246,7 +246,7 @@ define <2 x i64> @ashr_constant_op0_not_undef_lane(i64 %x) {
 define <2 x i64> @ashr_constant_op1(i64 %x) {
 ; CHECK-LABEL: @ashr_constant_op1(
 ; CHECK-NEXT:    [[BO_SCALAR:%.*]] = ashr i64 [[X:%.*]], 5
-; CHECK-NEXT:    [[BO:%.*]] = insertelement <2 x i64> <i64 0, i64 poison>, i64 [[BO_SCALAR]], i64 0
+; CHECK-NEXT:    [[BO:%.*]] = insertelement <2 x i64> <i64 0, i64 undef>, i64 [[BO_SCALAR]], i64 0
 ; CHECK-NEXT:    ret <2 x i64> [[BO]]
 ;
   %ins = insertelement <2 x i64> undef, i64 %x, i32 0
@@ -268,7 +268,7 @@ define <2 x i64> @ashr_constant_op1_not_undef_lane(i64 %x) {
 define <2 x i64> @lshr_constant_op0(i64 %x) {
 ; CHECK-LABEL: @lshr_constant_op0(
 ; CHECK-NEXT:    [[BO_SCALAR:%.*]] = lshr i64 5, [[X:%.*]]
-; CHECK-NEXT:    [[BO:%.*]] = insertelement <2 x i64> poison, i64 [[BO_SCALAR]], i64 0
+; CHECK-NEXT:    [[BO:%.*]] = insertelement <2 x i64> undef, i64 [[BO_SCALAR]], i64 0
 ; CHECK-NEXT:    ret <2 x i64> [[BO]]
 ;
   %ins = insertelement <2 x i64> undef, i64 %x, i32 0
@@ -279,7 +279,7 @@ define <2 x i64> @lshr_constant_op0(i64 %x) {
 define <2 x i64> @lshr_constant_op0_not_undef_lane(i64 %x) {
 ; CHECK-LABEL: @lshr_constant_op0_not_undef_lane(
 ; CHECK-NEXT:    [[BO_SCALAR:%.*]] = lshr i64 5, [[X:%.*]]
-; CHECK-NEXT:    [[BO:%.*]] = insertelement <2 x i64> poison, i64 [[BO_SCALAR]], i64 0
+; CHECK-NEXT:    [[BO:%.*]] = insertelement <2 x i64> undef, i64 [[BO_SCALAR]], i64 0
 ; CHECK-NEXT:    ret <2 x i64> [[BO]]
 ;
   %ins = insertelement <2 x i64> undef, i64 %x, i32 0
@@ -290,7 +290,7 @@ define <2 x i64> @lshr_constant_op0_not_undef_lane(i64 %x) {
 define <2 x i64> @lshr_constant_op1(i64 %x) {
 ; CHECK-LABEL: @lshr_constant_op1(
 ; CHECK-NEXT:    [[BO_SCALAR:%.*]] = lshr exact i64 [[X:%.*]], 2
-; CHECK-NEXT:    [[BO:%.*]] = insertelement <2 x i64> <i64 poison, i64 0>, i64 [[BO_SCALAR]], i64 1
+; CHECK-NEXT:    [[BO:%.*]] = insertelement <2 x i64> <i64 undef, i64 0>, i64 [[BO_SCALAR]], i64 1
 ; CHECK-NEXT:    ret <2 x i64> [[BO]]
 ;
   %ins = insertelement <2 x i64> undef, i64 %x, i32 1
@@ -312,7 +312,7 @@ define <2 x i64> @lshr_constant_op1_not_undef_lane(i64 %x) {
 define <2 x i64> @urem_constant_op0(i64 %x) {
 ; CHECK-LABEL: @urem_constant_op0(
 ; CHECK-NEXT:    [[BO_SCALAR:%.*]] = urem i64 5, [[X:%.*]]
-; CHECK-NEXT:    [[BO:%.*]] = insertelement <2 x i64> poison, i64 [[BO_SCALAR]], i64 0
+; CHECK-NEXT:    [[BO:%.*]] = insertelement <2 x i64> undef, i64 [[BO_SCALAR]], i64 0
 ; CHECK-NEXT:    ret <2 x i64> [[BO]]
 ;
   %ins = insertelement <2 x i64> undef, i64 %x, i32 0
@@ -323,7 +323,7 @@ define <2 x i64> @urem_constant_op0(i64 %x) {
 define <2 x i64> @urem_constant_op0_not_undef_lane(i64 %x) {
 ; CHECK-LABEL: @urem_constant_op0_not_undef_lane(
 ; CHECK-NEXT:    [[BO_SCALAR:%.*]] = urem i64 5, [[X:%.*]]
-; CHECK-NEXT:    [[BO:%.*]] = insertelement <2 x i64> poison, i64 [[BO_SCALAR]], i64 0
+; CHECK-NEXT:    [[BO:%.*]] = insertelement <2 x i64> undef, i64 [[BO_SCALAR]], i64 0
 ; CHECK-NEXT:    ret <2 x i64> [[BO]]
 ;
   %ins = insertelement <2 x i64> undef, i64 %x, i32 0
@@ -334,7 +334,7 @@ define <2 x i64> @urem_constant_op0_not_undef_lane(i64 %x) {
 define <2 x i64> @urem_constant_op1(i64 %x) {
 ; CHECK-LABEL: @urem_constant_op1(
 ; CHECK-NEXT:    [[BO_SCALAR:%.*]] = urem i64 [[X:%.*]], 2
-; CHECK-NEXT:    [[BO:%.*]] = insertelement <2 x i64> <i64 poison, i64 0>, i64 [[BO_SCALAR]], i64 1
+; CHECK-NEXT:    [[BO:%.*]] = insertelement <2 x i64> <i64 undef, i64 0>, i64 [[BO_SCALAR]], i64 1
 ; CHECK-NEXT:    ret <2 x i64> [[BO]]
 ;
   %ins = insertelement <2 x i64> undef, i64 %x, i32 1
@@ -356,7 +356,7 @@ define <2 x i64> @urem_constant_op1_not_undef_lane(i64 %x) {
 define <2 x i64> @srem_constant_op0(i64 %x) {
 ; CHECK-LABEL: @srem_constant_op0(
 ; CHECK-NEXT:    [[BO_SCALAR:%.*]] = srem i64 5, [[X:%.*]]
-; CHECK-NEXT:    [[BO:%.*]] = insertelement <2 x i64> poison, i64 [[BO_SCALAR]], i64 0
+; CHECK-NEXT:    [[BO:%.*]] = insertelement <2 x i64> undef, i64 [[BO_SCALAR]], i64 0
 ; CHECK-NEXT:    ret <2 x i64> [[BO]]
 ;
   %ins = insertelement <2 x i64> undef, i64 %x, i32 0
@@ -367,7 +367,7 @@ define <2 x i64> @srem_constant_op0(i64 %x) {
 define <2 x i64> @srem_constant_op0_not_undef_lane(i64 %x) {
 ; CHECK-LABEL: @srem_constant_op0_not_undef_lane(
 ; CHECK-NEXT:    [[BO_SCALAR:%.*]] = srem i64 5, [[X:%.*]]
-; CHECK-NEXT:    [[BO:%.*]] = insertelement <2 x i64> poison, i64 [[BO_SCALAR]], i64 0
+; CHECK-NEXT:    [[BO:%.*]] = insertelement <2 x i64> undef, i64 [[BO_SCALAR]], i64 0
 ; CHECK-NEXT:    ret <2 x i64> [[BO]]
 ;
   %ins = insertelement <2 x i64> undef, i64 %x, i32 0
@@ -378,7 +378,7 @@ define <2 x i64> @srem_constant_op0_not_undef_lane(i64 %x) {
 define <2 x i64> @srem_constant_op1(i64 %x) {
 ; CHECK-LABEL: @srem_constant_op1(
 ; CHECK-NEXT:    [[BO_SCALAR:%.*]] = srem i64 [[X:%.*]], 2
-; CHECK-NEXT:    [[BO:%.*]] = insertelement <2 x i64> <i64 poison, i64 0>, i64 [[BO_SCALAR]], i64 1
+; CHECK-NEXT:    [[BO:%.*]] = insertelement <2 x i64> <i64 undef, i64 0>, i64 [[BO_SCALAR]], i64 1
 ; CHECK-NEXT:    ret <2 x i64> [[BO]]
 ;
   %ins = insertelement <2 x i64> undef, i64 %x, i32 1
@@ -400,7 +400,7 @@ define <2 x i64> @srem_constant_op1_not_undef_lane(i64 %x) {
 define <2 x i64> @udiv_constant_op0(i64 %x) {
 ; CHECK-LABEL: @udiv_constant_op0(
 ; CHECK-NEXT:    [[BO_SCALAR:%.*]] = udiv exact i64 5, [[X:%.*]]
-; CHECK-NEXT:    [[BO:%.*]] = insertelement <2 x i64> poison, i64 [[BO_SCALAR]], i64 0
+; CHECK-NEXT:    [[BO:%.*]] = insertelement <2 x i64> undef, i64 [[BO_SCALAR]], i64 0
 ; CHECK-NEXT:    ret <2 x i64> [[BO]]
 ;
   %ins = insertelement <2 x i64> undef, i64 %x, i32 0
@@ -411,7 +411,7 @@ define <2 x i64> @udiv_constant_op0(i64 %x) {
 define <2 x i64> @udiv_constant_op0_not_undef_lane(i64 %x) {
 ; CHECK-LABEL: @udiv_constant_op0_not_undef_lane(
 ; CHECK-NEXT:    [[BO_SCALAR:%.*]] = udiv exact i64 5, [[X:%.*]]
-; CHECK-NEXT:    [[BO:%.*]] = insertelement <2 x i64> poison, i64 [[BO_SCALAR]], i64 0
+; CHECK-NEXT:    [[BO:%.*]] = insertelement <2 x i64> undef, i64 [[BO_SCALAR]], i64 0
 ; CHECK-NEXT:    ret <2 x i64> [[BO]]
 ;
   %ins = insertelement <2 x i64> undef, i64 %x, i32 0
@@ -422,7 +422,7 @@ define <2 x i64> @udiv_constant_op0_not_undef_lane(i64 %x) {
 define <2 x i64> @udiv_constant_op1(i64 %x) {
 ; CHECK-LABEL: @udiv_constant_op1(
 ; CHECK-NEXT:    [[BO_SCALAR:%.*]] = udiv i64 [[X:%.*]], 2
-; CHECK-NEXT:    [[BO:%.*]] = insertelement <2 x i64> <i64 poison, i64 0>, i64 [[BO_SCALAR]], i64 1
+; CHECK-NEXT:    [[BO:%.*]] = insertelement <2 x i64> <i64 undef, i64 0>, i64 [[BO_SCALAR]], i64 1
 ; CHECK-NEXT:    ret <2 x i64> [[BO]]
 ;
   %ins = insertelement <2 x i64> undef, i64 %x, i32 1
@@ -444,7 +444,7 @@ define <2 x i64> @udiv_constant_op1_not_undef_lane(i64 %x) {
 define <2 x i64> @sdiv_constant_op0(i64 %x) {
 ; CHECK-LABEL: @sdiv_constant_op0(
 ; CHECK-NEXT:    [[BO_SCALAR:%.*]] = sdiv i64 5, [[X:%.*]]
-; CHECK-NEXT:    [[BO:%.*]] = insertelement <2 x i64> poison, i64 [[BO_SCALAR]], i64 0
+; CHECK-NEXT:    [[BO:%.*]] = insertelement <2 x i64> undef, i64 [[BO_SCALAR]], i64 0
 ; CHECK-NEXT:    ret <2 x i64> [[BO]]
 ;
   %ins = insertelement <2 x i64> undef, i64 %x, i32 0
@@ -455,7 +455,7 @@ define <2 x i64> @sdiv_constant_op0(i64 %x) {
 define <2 x i64> @sdiv_constant_op0_not_undef_lane(i64 %x) {
 ; CHECK-LABEL: @sdiv_constant_op0_not_undef_lane(
 ; CHECK-NEXT:    [[BO_SCALAR:%.*]] = sdiv i64 5, [[X:%.*]]
-; CHECK-NEXT:    [[BO:%.*]] = insertelement <2 x i64> poison, i64 [[BO_SCALAR]], i64 0
+; CHECK-NEXT:    [[BO:%.*]] = insertelement <2 x i64> undef, i64 [[BO_SCALAR]], i64 0
 ; CHECK-NEXT:    ret <2 x i64> [[BO]]
 ;
   %ins = insertelement <2 x i64> undef, i64 %x, i32 0
@@ -466,7 +466,7 @@ define <2 x i64> @sdiv_constant_op0_not_undef_lane(i64 %x) {
 define <2 x i64> @sdiv_constant_op1(i64 %x) {
 ; CHECK-LABEL: @sdiv_constant_op1(
 ; CHECK-NEXT:    [[BO_SCALAR:%.*]] = sdiv exact i64 [[X:%.*]], 2
-; CHECK-NEXT:    [[BO:%.*]] = insertelement <2 x i64> <i64 poison, i64 0>, i64 [[BO_SCALAR]], i64 1
+; CHECK-NEXT:    [[BO:%.*]] = insertelement <2 x i64> <i64 undef, i64 0>, i64 [[BO_SCALAR]], i64 1
 ; CHECK-NEXT:    ret <2 x i64> [[BO]]
 ;
   %ins = insertelement <2 x i64> undef, i64 %x, i32 1
diff --git a/llvm/test/Transforms/VectorCombine/X86/insert-binop.ll b/llvm/test/Transforms/VectorCombine/X86/insert-binop.ll
index 4fd33cc7ef28..abebf4d809af 100644
--- a/llvm/test/Transforms/VectorCombine/X86/insert-binop.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/insert-binop.ll
@@ -53,7 +53,7 @@ define <2 x i64> @ins1_ins1_iterate(i64 %w, i64 %x, i64 %y, i64 %z) {
 ; CHECK-NEXT:    [[S0_SCALAR:%.*]] = sub i64 [[W:%.*]], [[X:%.*]]
 ; CHECK-NEXT:    [[S1_SCALAR:%.*]] = or i64 [[S0_SCALAR]], [[Y:%.*]]
 ; CHECK-NEXT:    [[S2_SCALAR:%.*]] = shl i64 [[Z:%.*]], [[S1_SCALAR]]
-; CHECK-NEXT:    [[S2:%.*]] = insertelement <2 x i64> poison, i64 [[S2_SCALAR]], i64 1
+; CHECK-NEXT:    [[S2:%.*]] = insertelement <2 x i64> undef, i64 [[S2_SCALAR]], i64 1
 ; CHECK-NEXT:    ret <2 x i64> [[S2]]
 ;
   %i0 = insertelement <2 x i64> undef, i64 %w, i64 1
@@ -128,7 +128,7 @@ define <2 x i64> @ins1_ins1_sdiv(i64 %x, i64 %y) {
 define <2 x i64> @ins1_ins1_udiv(i64 %x, i64 %y) {
 ; CHECK-LABEL: @ins1_ins1_udiv(
 ; CHECK-NEXT:    [[R_SCALAR:%.*]] = udiv i64 [[X:%.*]], [[Y:%.*]]
-; CHECK-NEXT:    [[R:%.*]] = insertelement <2 x i64> <i64 6, i64 poison>, i64 [[R_SCALAR]], i64 1
+; CHECK-NEXT:    [[R:%.*]] = insertelement <2 x i64> <i64 6, i64 undef>, i64 [[R_SCALAR]], i64 1
 ; CHECK-NEXT:    ret <2 x i64> [[R]]
 ;
   %i0 = insertelement <2 x i64> <i64 42, i64 undef>, i64 %x, i32 1
@@ -143,7 +143,7 @@ define <2 x i64> @ins1_ins1_udiv(i64 %x, i64 %y) {
 define <2 x i64> @ins1_ins1_urem(i64 %x, i64 %y) {
 ; CHECK-LABEL: @ins1_ins1_urem(
 ; CHECK-NEXT:    [[R_SCALAR:%.*]] = urem i64 [[X:%.*]], [[Y:%.*]]
-; CHECK-NEXT:    [[R:%.*]] = insertelement <2 x i64> <i64 poison, i64 0>, i64 [[R_SCALAR]], i64 1
+; CHECK-NEXT:    [[R:%.*]] = insertelement <2 x i64> <i64 undef, i64 0>, i64 [[R_SCALAR]], i64 1
 ; CHECK-NEXT:    ret <2 x i64> [[R]]
 ;
   %i0 = insertelement <2 x i64> <i64 42, i64 undef>, i64 %x, i64 1
diff --git a/llvm/unittests/IR/ConstantsTest.cpp b/llvm/unittests/IR/ConstantsTest.cpp
index 9eabc7c55638..96d3672647e8 100644
--- a/llvm/unittests/IR/ConstantsTest.cpp
+++ b/llvm/unittests/IR/ConstantsTest.cpp
@@ -27,7 +27,7 @@ TEST(ConstantsTest, Integer_i1) {
   Constant* Zero = ConstantInt::get(Int1, 0);
   Constant* NegOne = ConstantInt::get(Int1, static_cast<uint64_t>(-1), true);
   EXPECT_EQ(NegOne, ConstantInt::getSigned(Int1, -1));
-  Constant* Poison = PoisonValue::get(Int1);
+  Constant* Undef = UndefValue::get(Int1);
 
   // Input:  @b = constant i1 add(i1 1 , i1 1)
   // Output: @b = constant i1 false
@@ -53,21 +53,21 @@ TEST(ConstantsTest, Integer_i1) {
   // @g = constant i1 false
   EXPECT_EQ(Zero, ConstantExpr::getSub(One, One));
 
-  // @h = constant i1 shl(i1 1 , i1 1)  ; poison
-  // @h = constant i1 poison
-  EXPECT_EQ(Poison, ConstantExpr::getShl(One, One));
+  // @h = constant i1 shl(i1 1 , i1 1)  ; undefined
+  // @h = constant i1 undef
+  EXPECT_EQ(Undef, ConstantExpr::getShl(One, One));
 
   // @i = constant i1 shl(i1 1 , i1 0)
   // @i = constant i1 true
   EXPECT_EQ(One, ConstantExpr::getShl(One, Zero));
 
-  // @j = constant i1 lshr(i1 1, i1 1)  ; poison
-  // @j = constant i1 poison
-  EXPECT_EQ(Poison, ConstantExpr::getLShr(One, One));
+  // @j = constant i1 lshr(i1 1, i1 1)  ; undefined
+  // @j = constant i1 undef
+  EXPECT_EQ(Undef, ConstantExpr::getLShr(One, One));
 
-  // @m = constant i1 ashr(i1 1, i1 1)  ; poison
-  // @m = constant i1 poison
-  EXPECT_EQ(Poison, ConstantExpr::getAShr(One, One));
+  // @m = constant i1 ashr(i1 1, i1 1)  ; undefined
+  // @m = constant i1 undef
+  EXPECT_EQ(Undef, ConstantExpr::getAShr(One, One));
 
   // @n = constant i1 mul(i1 -1, i1 1)
   // @n = constant i1 true
@@ -218,6 +218,7 @@ TEST(ConstantsTest, AsInstructionsTest) {
   Constant *Elt = ConstantInt::get(Int16Ty, 2015);
   Constant *Poison16 = PoisonValue::get(Int16Ty);
   Constant *Undef64  = UndefValue::get(Int64Ty);
+  Constant *UndefV16 = UndefValue::get(P6->getType());
   Constant *PoisonV16 = PoisonValue::get(P6->getType());
 
   #define P0STR "ptrtoint (i32** @dummy to i32)"
@@ -294,8 +295,8 @@ TEST(ConstantsTest, AsInstructionsTest) {
 
   EXPECT_EQ(Elt, ConstantExpr::getExtractElement(
                  ConstantExpr::getInsertElement(P6, Elt, One), One));
-  EXPECT_EQ(PoisonV16, ConstantExpr::getInsertElement(P6, Elt, Two));
-  EXPECT_EQ(PoisonV16, ConstantExpr::getInsertElement(P6, Elt, Big));
+  EXPECT_EQ(UndefV16, ConstantExpr::getInsertElement(P6, Elt, Two));
+  EXPECT_EQ(UndefV16, ConstantExpr::getInsertElement(P6, Elt, Big));
   EXPECT_EQ(PoisonV16, ConstantExpr::getInsertElement(P6, Elt, Undef64));
 }
 

From 91f34dabb92d8446142b3c5777fa83e6bcbdfa7e Mon Sep 17 00:00:00 2001
From: Tom Stellard <tstellar@redhat.com>
Date: Tue, 2 Feb 2021 18:41:49 -0800
Subject: [PATCH 030/318] workflows: Re-enable lldb test on Mac OS X

---
 .github/workflows/lldb-tests.yml | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/lldb-tests.yml b/.github/workflows/lldb-tests.yml
index 93fddc2de8c6..68aec6036995 100644
--- a/.github/workflows/lldb-tests.yml
+++ b/.github/workflows/lldb-tests.yml
@@ -20,14 +20,16 @@ jobs:
   build_lldb:
     name: lldb build
     runs-on: ${{ matrix.os }}
+    # Workaround for build faliure on Mac OS X: llvm.org/PR46190, https://github.com/actions/virtual-environments/issues/2274
+    env:
+      CPLUS_INCLUDE_PATH: /usr/local/opt/llvm/include/c++/v1:/Library/Developer/CommandLineTools/SDKs/MacOSX10.15.sdk/usr/include
     strategy:
       fail-fast: false
       matrix:
         os:
           - ubuntu-latest
           - windows-latest
-          # macOS build disabled due to: llvm.org/PR46190
-          #- macOS-latest
+          - macOS-latest
     steps:
     - name: Setup Windows
       if: startsWith(matrix.os, 'windows')

From 872608926129a61489d484e15cb9186882578c73 Mon Sep 17 00:00:00 2001
From: Tom Stellard <tstellar@redhat.com>
Date: Wed, 3 Feb 2021 03:09:24 +0000
Subject: [PATCH 031/318] workflows: Fix actions repository name for llvm tests

---
 .github/workflows/llvm-tests.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/llvm-tests.yml b/.github/workflows/llvm-tests.yml
index 1cffc3ef4d97..1fcd67a10078 100644
--- a/.github/workflows/llvm-tests.yml
+++ b/.github/workflows/llvm-tests.yml
@@ -56,7 +56,7 @@ jobs:
 
       - name: Get LLVM version
         id: version
-        uses: tstellar/actions/get-llvm-version@get-version
+        uses: llvm/actions/get-llvm-version@main
 
       - name: Setup Variables
         id: vars

From 2a57ea296a4787828b52799564d7ddf02ec1c4f3 Mon Sep 17 00:00:00 2001
From: Tom Stellard <tstellar@redhat.com>
Date: Mon, 1 Feb 2021 13:05:19 +0000
Subject: [PATCH 032/318] workflows: Add job to check for ABI changes in
 libclang.so and libclang-cpp.so

---
 .github/workflows/libclang-abi-tests.yml | 132 +++++++++++++++++++++++
 1 file changed, 132 insertions(+)
 create mode 100644 .github/workflows/libclang-abi-tests.yml

diff --git a/.github/workflows/libclang-abi-tests.yml b/.github/workflows/libclang-abi-tests.yml
new file mode 100644
index 000000000000..5681c7c8166e
--- /dev/null
+++ b/.github/workflows/libclang-abi-tests.yml
@@ -0,0 +1,132 @@
+name: libclang ABI Tests
+
+on:
+  push:
+    branches:
+      - 'release/**'
+    paths:
+      - 'clang/**'
+      - '.github/workflows/libclang-abi-tests.yml'
+  pull_request:
+    paths:
+      - 'clang/**'
+      - '.github/workflows/libclang-abi-tests.yml'
+
+jobs:
+  abi-dump-setup:
+    runs-on: ubuntu-latest
+    outputs:
+      BASELINE_REF: ${{ steps.vars.outputs.BASELINE_REF }}
+      ABI_HEADERS: ${{ steps.vars.outputs.ABI_HEADERS }}
+      ABI_LIBS: ${{ steps.vars.outputs.ABI_LIBS }}
+      BASELINE_VERSION_MAJOR: ${{ steps.vars.outputs.BASELINE_VERSION_MAJOR }}
+      LLVM_VERSION_MAJOR: ${{ steps.version.outputs.LLVM_VERSION_MAJOR }}
+      LLVM_VERSION_MINOR: ${{ steps.version.outputs.LLVM_VERSION_MINOR }}
+      LLVM_VERSION_PATCH: ${{ steps.version.outputs.LLVM_VERSION_PATCH }}
+    steps:
+      - name: Checkout source
+        uses: actions/checkout@v1
+        with:
+          fetch-depth: 1
+
+      - name: Get LLVM version
+        id: version
+        uses: llvm/actions/get-llvm-version@main
+
+      - name: Setup Variables
+        id: vars
+        run: |
+          if [ ${{ steps.version.outputs.LLVM_VERSION_MINOR }} -ne 0 -o ${{ steps.version.outputs.LLVM_VERSION_PATCH }} -eq 0 ]; then
+            echo ::set-output name=BASELINE_VERSION_MAJOR::$(( ${{ steps.version.outputs.LLVM_VERSION_MAJOR }} - 1))
+            echo ::set-output name=ABI_HEADERS::clang-c
+            echo ::set-output name=ABI_LIBS::libclang.so
+          else
+            echo ::set-output name=BASELINE_VERSION_MAJOR::${{ steps.version.outputs.LLVM_VERSION_MAJOR }}
+            echo ::set-output name=ABI_HEADERS::.
+            echo ::set-output name=ABI_LIBS::libclang.so libclang-cpp.so
+          fi
+
+  abi-dump:
+    needs: abi-dump-setup
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        name:
+          - build-baseline
+          - build-latest
+        include:
+          - name: build-baseline
+            llvm_version_major: ${{ needs.abi-dump-setup.outputs.BASELINE_VERSION_MAJOR }}
+            ref: llvmorg-${{ needs.abi-dump-setup.outputs.BASELINE_VERSION_MAJOR }}.0.0
+            repo: llvm/llvm-project
+          - name: build-latest
+            llvm_version_major: ${{ needs.abi-dump-setup.outputs.LLVM_VERSION_MAJOR }}
+            ref: ${{ github.sha }}
+            repo: ${{ github.repository }}
+    steps:
+    - name: Install Ninja
+      uses: llvm/actions/install-ninja@main
+    - name: Install abi-compliance-checker
+      run: |
+        sudo apt-get install abi-dumper autoconf pkg-config
+    - name: Install universal-ctags
+      run: |
+        git clone https://github.com/universal-ctags/ctags.git
+        cd ctags
+        ./autogen.sh
+        ./configure
+        sudo make install
+    - name: Download source code
+      uses: llvm/actions/get-llvm-project-src@main
+      with:
+        ref: ${{ matrix.ref }}
+        repo: ${{ matrix.repo }}
+    - name: Configure
+      run: |
+        mkdir install
+        cmake -B build -S llvm -G Ninja -DLLVM_ENABLE_PROJECTS=clang -DCMAKE_BUILD_TYPE=Debug -DLLVM_TARGETS_TO_BUILD="" -DLLVM_BUILD_LLVM_DYLIB=ON -DLLVM_LINK_LLVM_DYLIB=ON -DCMAKE_C_FLAGS_DEBUG="-g -Og" -DCMAKE_CXX_FLAGS_DEBUG="-g -Og" -DCMAKE_INSTALL_PREFIX=`pwd`/install llvm
+    - name: Build
+      run: ninja -C build/ ${{ needs.abi-dump-setup.outputs.ABI_LIBS }} install-clang-headers
+    - name: Dump ABI
+      run: |
+        parallel abi-dumper -lver ${{ matrix.ref }} -skip-cxx -public-headers ./install/include/${{ needs.abi-dump-setup.outputs.ABI_HEADERS }} -o {}-${{ matrix.ref }}.abi ./build/lib/{} ::: ${{ needs.abi-dump-setup.outputs.ABI_LIBS }}
+        for lib in ${{ needs.abi-dump-setup.outputs.ABI_LIBS }}; do
+          # Remove symbol versioning from dumps, so we can compare across major versions.
+          sed -i 's/LLVM_${{ matrix.llvm_version_major }}/LLVM_NOVERSION/' $lib-${{ matrix.ref }}.abi
+          tar -czf $lib-${{ matrix.ref }}.abi.tar.gz $lib-${{ matrix.ref }}.abi
+        done
+    - name: Upload ABI file
+      uses: actions/upload-artifact@v2
+      with:
+        name: ${{ matrix.name }}
+        path: "*${{ matrix.ref }}.abi.tar.gz"
+
+  abi-compare:
+    runs-on: ubuntu-latest
+    needs:
+      - abi-dump-setup
+      - abi-dump
+    steps:
+      - name: Download baseline
+        uses: actions/download-artifact@v1
+        with:
+          name: build-baseline
+      - name: Download latest
+        uses: actions/download-artifact@v1
+        with:
+          name: build-latest
+
+      - name: Install abi-compliance-checker
+        run: sudo apt-get install abi-compliance-checker
+      - name: Compare ABI
+        run: |
+          for lib in ${{ needs.abi-dump-setup.outputs.ABI_LIBS }}; do
+            abi-compliance-checker -lib $lib -old build-baseline/$lib*.abi.tar.gz -new build-latest/$lib*.abi.tar.gz
+          done
+      - name: Upload ABI Comparison
+        if: always()
+        uses: actions/upload-artifact@v2
+        with:
+          name: compat-report-${{ github.sha }}
+          path: compat_reports/
+

From c1899cd5102dbdacd006fdb33db075319ccc933f Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Thu, 28 Jan 2021 11:21:21 +0000
Subject: [PATCH 033/318] [X86][AVX] Add PR48908 shuffle test case

(cherry picked from commit da8845fc3d3bb0b0e133f020931440511fa72723)
---
 .../X86/vector-shuffle-combining-avx.ll       | 151 ++++++++++++++++++
 1 file changed, 151 insertions(+)

diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll
index 979c365acfd7..3da83b25d363 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll
@@ -436,6 +436,157 @@ entry:
   unreachable
 }
 
+define void @PR48908(<4 x double> %v0, <4 x double> %v1, <4 x double> %v2, <4 x double>* noalias %out0, <4 x double>* noalias %out1, <4 x double>* noalias %out2) {
+; X86-AVX1-LABEL: PR48908:
+; X86-AVX1:       # %bb.0:
+; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-AVX1-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm1[2,3],ymm2[0,1]
+; X86-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm4
+; X86-AVX1-NEXT:    vshufpd {{.*#+}} ymm3 = ymm4[1],ymm3[0],ymm4[2],ymm3[3]
+; X86-AVX1-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm1[0,1],ymm0[0,1]
+; X86-AVX1-NEXT:    vperm2f128 {{.*#+}} ymm5 = ymm0[0,1],ymm2[0,1]
+; X86-AVX1-NEXT:    vshufpd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[2]
+; X86-AVX1-NEXT:    vmovapd %ymm4, (%edx)
+; X86-AVX1-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm3[2,3,0,1]
+; X86-AVX1-NEXT:    vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm0[2],ymm3[3]
+; X86-AVX1-NEXT:    vblendpd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3]
+; X86-AVX1-NEXT:    vmovapd %ymm3, (%ecx)
+; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; X86-AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
+; X86-AVX1-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3]
+; X86-AVX1-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[3]
+; X86-AVX1-NEXT:    vmovapd %ymm0, (%eax)
+; X86-AVX1-NEXT:    vzeroupper
+; X86-AVX1-NEXT:    retl
+;
+; X86-AVX2-LABEL: PR48908:
+; X86-AVX2:       # %bb.0:
+; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm3
+; X86-AVX2-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm1[2,3],ymm2[0,1]
+; X86-AVX2-NEXT:    vpermilpd {{.*#+}} xmm5 = xmm1[1,0]
+; X86-AVX2-NEXT:    vperm2f128 {{.*#+}} ymm6 = ymm0[0,1],ymm2[0,1]
+; X86-AVX2-NEXT:    vpermpd {{.*#+}} ymm3 = ymm3[0,2,2,1]
+; X86-AVX2-NEXT:    vblendpd {{.*#+}} ymm3 = ymm6[0],ymm3[1],ymm6[2],ymm3[3]
+; X86-AVX2-NEXT:    vmovapd %ymm3, (%edx)
+; X86-AVX2-NEXT:    vblendpd {{.*#+}} ymm3 = ymm5[0,1],ymm0[2],ymm5[3]
+; X86-AVX2-NEXT:    vpermpd {{.*#+}} ymm4 = ymm4[0,3,2,0]
+; X86-AVX2-NEXT:    vblendpd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3]
+; X86-AVX2-NEXT:    vmovapd %ymm3, (%ecx)
+; X86-AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; X86-AVX2-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
+; X86-AVX2-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3]
+; X86-AVX2-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[3]
+; X86-AVX2-NEXT:    vmovapd %ymm0, (%eax)
+; X86-AVX2-NEXT:    vzeroupper
+; X86-AVX2-NEXT:    retl
+;
+; X86-AVX512-LABEL: PR48908:
+; X86-AVX512:       # %bb.0:
+; X86-AVX512-NEXT:    # kill: def $ymm2 killed $ymm2 def $zmm2
+; X86-AVX512-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
+; X86-AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-AVX512-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm3
+; X86-AVX512-NEXT:    vshufpd {{.*#+}} ymm3 = ymm0[0],ymm3[1],ymm0[2],ymm3[2]
+; X86-AVX512-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm1[2,3],ymm2[0,1]
+; X86-AVX512-NEXT:    vshufpd {{.*#+}} ymm4 = ymm1[1],ymm4[0],ymm1[2],ymm4[3]
+; X86-AVX512-NEXT:    vmovapd {{.*#+}} ymm5 = [0,0,3,0,8,0,1,0]
+; X86-AVX512-NEXT:    vpermt2pd %zmm2, %zmm5, %zmm3
+; X86-AVX512-NEXT:    vmovapd %ymm3, (%edx)
+; X86-AVX512-NEXT:    vmovapd {{.*#+}} ymm3 = [0,0,3,0,10,0,1,0]
+; X86-AVX512-NEXT:    vpermt2pd %zmm0, %zmm3, %zmm4
+; X86-AVX512-NEXT:    vmovapd %ymm4, (%ecx)
+; X86-AVX512-NEXT:    vmovapd {{.*#+}} ymm3 = <3,0,11,0,u,u,u,u>
+; X86-AVX512-NEXT:    vpermi2pd %zmm1, %zmm0, %zmm3
+; X86-AVX512-NEXT:    vmovapd {{.*#+}} ymm0 = [2,0,8,0,9,0,3,0]
+; X86-AVX512-NEXT:    vpermi2pd %zmm3, %zmm2, %zmm0
+; X86-AVX512-NEXT:    vmovapd %ymm0, (%eax)
+; X86-AVX512-NEXT:    vzeroupper
+; X86-AVX512-NEXT:    retl
+;
+; X64-AVX1-LABEL: PR48908:
+; X64-AVX1:       # %bb.0:
+; X64-AVX1-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm1[2,3],ymm2[0,1]
+; X64-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm4
+; X64-AVX1-NEXT:    vshufpd {{.*#+}} ymm3 = ymm4[1],ymm3[0],ymm4[2],ymm3[3]
+; X64-AVX1-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm1[0,1],ymm0[0,1]
+; X64-AVX1-NEXT:    vperm2f128 {{.*#+}} ymm5 = ymm0[0,1],ymm2[0,1]
+; X64-AVX1-NEXT:    vshufpd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[2]
+; X64-AVX1-NEXT:    vmovapd %ymm4, (%rdi)
+; X64-AVX1-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm3[2,3,0,1]
+; X64-AVX1-NEXT:    vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm0[2],ymm3[3]
+; X64-AVX1-NEXT:    vblendpd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3]
+; X64-AVX1-NEXT:    vmovapd %ymm3, (%rsi)
+; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; X64-AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
+; X64-AVX1-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3]
+; X64-AVX1-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[3]
+; X64-AVX1-NEXT:    vmovapd %ymm0, (%rdx)
+; X64-AVX1-NEXT:    vzeroupper
+; X64-AVX1-NEXT:    retq
+;
+; X64-AVX2-LABEL: PR48908:
+; X64-AVX2:       # %bb.0:
+; X64-AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm3
+; X64-AVX2-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm1[2,3],ymm2[0,1]
+; X64-AVX2-NEXT:    vpermilpd {{.*#+}} xmm5 = xmm1[1,0]
+; X64-AVX2-NEXT:    vperm2f128 {{.*#+}} ymm6 = ymm0[0,1],ymm2[0,1]
+; X64-AVX2-NEXT:    vpermpd {{.*#+}} ymm3 = ymm3[0,2,2,1]
+; X64-AVX2-NEXT:    vblendpd {{.*#+}} ymm3 = ymm6[0],ymm3[1],ymm6[2],ymm3[3]
+; X64-AVX2-NEXT:    vmovapd %ymm3, (%rdi)
+; X64-AVX2-NEXT:    vblendpd {{.*#+}} ymm3 = ymm5[0,1],ymm0[2],ymm5[3]
+; X64-AVX2-NEXT:    vpermpd {{.*#+}} ymm4 = ymm4[0,3,2,0]
+; X64-AVX2-NEXT:    vblendpd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3]
+; X64-AVX2-NEXT:    vmovapd %ymm3, (%rsi)
+; X64-AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; X64-AVX2-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
+; X64-AVX2-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3]
+; X64-AVX2-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[3]
+; X64-AVX2-NEXT:    vmovapd %ymm0, (%rdx)
+; X64-AVX2-NEXT:    vzeroupper
+; X64-AVX2-NEXT:    retq
+;
+; X64-AVX512-LABEL: PR48908:
+; X64-AVX512:       # %bb.0:
+; X64-AVX512-NEXT:    # kill: def $ymm2 killed $ymm2 def $zmm2
+; X64-AVX512-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
+; X64-AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; X64-AVX512-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm3
+; X64-AVX512-NEXT:    vshufpd {{.*#+}} ymm3 = ymm0[0],ymm3[1],ymm0[2],ymm3[2]
+; X64-AVX512-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm1[2,3],ymm2[0,1]
+; X64-AVX512-NEXT:    vshufpd {{.*#+}} ymm4 = ymm1[1],ymm4[0],ymm1[2],ymm4[3]
+; X64-AVX512-NEXT:    vmovapd {{.*#+}} ymm5 = [0,3,8,1]
+; X64-AVX512-NEXT:    vpermt2pd %zmm2, %zmm5, %zmm3
+; X64-AVX512-NEXT:    vmovapd %ymm3, (%rdi)
+; X64-AVX512-NEXT:    vmovapd {{.*#+}} ymm3 = [0,3,10,1]
+; X64-AVX512-NEXT:    vpermt2pd %zmm0, %zmm3, %zmm4
+; X64-AVX512-NEXT:    vmovapd %ymm4, (%rsi)
+; X64-AVX512-NEXT:    vmovapd {{.*#+}} ymm3 = <3,11,u,u>
+; X64-AVX512-NEXT:    vpermi2pd %zmm1, %zmm0, %zmm3
+; X64-AVX512-NEXT:    vmovapd {{.*#+}} ymm0 = [2,8,9,3]
+; X64-AVX512-NEXT:    vpermi2pd %zmm3, %zmm2, %zmm0
+; X64-AVX512-NEXT:    vmovapd %ymm0, (%rdx)
+; X64-AVX512-NEXT:    vzeroupper
+; X64-AVX512-NEXT:    retq
+  %t0 = shufflevector <4 x double> %v0, <4 x double> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
+  %t1 = shufflevector <4 x double> %v1, <4 x double> %v2, <4 x i32> <i32 1, i32 2, i32 4, i32 5>
+  %r0 = shufflevector <4 x double> %t0, <4 x double> %t1, <4 x i32> <i32 0, i32 3, i32 6, i32 1>
+  store <4 x double> %r0, <4 x double>* %out0, align 32
+  %r1 = shufflevector <4 x double> %t0, <4 x double> %t1, <4 x i32> <i32 4, i32 7, i32 2, i32 5>
+  store <4 x double> %r1, <4 x double>* %out1, align 32
+  %t2 = shufflevector <4 x double> %v0, <4 x double> %v1, <4 x i32> <i32 3, i32 7, i32 undef, i32 undef>
+  %r2 = shufflevector <4 x double> %t2, <4 x double> %v2, <4 x i32> <i32 6, i32 0, i32 1, i32 7>
+  store <4 x double> %r2, <4 x double>* %out2, align 32
+  ret void
+}
+
 define <4 x i64> @concat_self_v4i64(<2 x i64> %x) {
 ; AVX1-LABEL: concat_self_v4i64:
 ; AVX1:       # %bb.0:

From 52a70a07e93c322ad137bce1a1ff2f1c9fdf6050 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Thu, 28 Jan 2021 12:11:31 +0000
Subject: [PATCH 034/318] [X86][AVX] canonicalizeLaneShuffleWithRepeatedOps -
 don't merge VPERMILPD ops with different low/high masks.

Unlike VPERMILPS, VPERMILPD can have non-repeating masks in each 128-bit subvector, we weren't accounting for this when folding vperm2f128(vpermilpd(x,c),vpermilpd(y,c)) -> vpermilpd(vperm2f128(x,y),c).

I'm intending to add support for this but wanted to get a minimal fix in first for merging into 12.xx.

Fixes PR48908

(cherry picked from commit 6663330bc8c84a75ea092272297b557bfc310380)
---
 llvm/lib/Target/X86/X86ISelLowering.cpp       |  9 ++++-
 .../X86/vector-shuffle-combining-avx.ll       | 40 ++++++++++---------
 2 files changed, 30 insertions(+), 19 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 0dd20235aa3c..6b816c710f98 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -36916,11 +36916,18 @@ static SDValue canonicalizeLaneShuffleWithRepeatedOps(SDValue V,
     Res = DAG.getNode(SrcOpc0, DL, SrcVT0, DAG.getBitcast(SrcVT0, Res));
     return DAG.getBitcast(VT, Res);
   }
+  case X86ISD::VPERMILPI:
+    // TODO: Handle v4f64 permutes with different low/high lane masks.
+    if (SrcVT0 == MVT::v4f64) {
+      uint64_t Mask = Src0.getConstantOperandVal(1);
+      if ((Mask & 0x3) != ((Mask >> 2) & 0x3))
+        break;
+    }
+    LLVM_FALLTHROUGH;
   case X86ISD::VSHLI:
   case X86ISD::VSRLI:
   case X86ISD::VSRAI:
   case X86ISD::PSHUFD:
-  case X86ISD::VPERMILPI:
     if (Src1.isUndef() || Src0.getOperand(1) == Src1.getOperand(1)) {
       SDValue LHS = DAG.getBitcast(VT, Src0.getOperand(0));
       SDValue RHS =
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll
index 3da83b25d363..1a1153d0e886 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll
@@ -442,16 +442,18 @@ define void @PR48908(<4 x double> %v0, <4 x double> %v1, <4 x double> %v2, <4 x
 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-AVX1-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm1[2,3],ymm2[0,1]
-; X86-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm4
-; X86-AVX1-NEXT:    vshufpd {{.*#+}} ymm3 = ymm4[1],ymm3[0],ymm4[2],ymm3[3]
-; X86-AVX1-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm1[0,1],ymm0[0,1]
+; X86-AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm3
+; X86-AVX1-NEXT:    vpermilpd {{.*#+}} ymm3 = ymm3[0,1,2,2]
+; X86-AVX1-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm1[2,3],ymm2[0,1]
+; X86-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm5
+; X86-AVX1-NEXT:    vshufpd {{.*#+}} ymm4 = ymm5[1],ymm4[0],ymm5[2],ymm4[3]
 ; X86-AVX1-NEXT:    vperm2f128 {{.*#+}} ymm5 = ymm0[0,1],ymm2[0,1]
-; X86-AVX1-NEXT:    vshufpd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[2]
-; X86-AVX1-NEXT:    vmovapd %ymm4, (%edx)
-; X86-AVX1-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm3[2,3,0,1]
-; X86-AVX1-NEXT:    vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm0[2],ymm3[3]
-; X86-AVX1-NEXT:    vblendpd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3]
+; X86-AVX1-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm3[2,3,0,1]
+; X86-AVX1-NEXT:    vblendpd {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2],ymm3[3]
+; X86-AVX1-NEXT:    vmovapd %ymm3, (%edx)
+; X86-AVX1-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm4[2,3,0,1]
+; X86-AVX1-NEXT:    vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm0[2],ymm4[3]
+; X86-AVX1-NEXT:    vblendpd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3]
 ; X86-AVX1-NEXT:    vmovapd %ymm3, (%ecx)
 ; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
 ; X86-AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
@@ -513,16 +515,18 @@ define void @PR48908(<4 x double> %v0, <4 x double> %v1, <4 x double> %v2, <4 x
 ;
 ; X64-AVX1-LABEL: PR48908:
 ; X64-AVX1:       # %bb.0:
-; X64-AVX1-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm1[2,3],ymm2[0,1]
-; X64-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm4
-; X64-AVX1-NEXT:    vshufpd {{.*#+}} ymm3 = ymm4[1],ymm3[0],ymm4[2],ymm3[3]
-; X64-AVX1-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm1[0,1],ymm0[0,1]
+; X64-AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm3
+; X64-AVX1-NEXT:    vpermilpd {{.*#+}} ymm3 = ymm3[0,1,2,2]
+; X64-AVX1-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm1[2,3],ymm2[0,1]
+; X64-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm5
+; X64-AVX1-NEXT:    vshufpd {{.*#+}} ymm4 = ymm5[1],ymm4[0],ymm5[2],ymm4[3]
 ; X64-AVX1-NEXT:    vperm2f128 {{.*#+}} ymm5 = ymm0[0,1],ymm2[0,1]
-; X64-AVX1-NEXT:    vshufpd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[2]
-; X64-AVX1-NEXT:    vmovapd %ymm4, (%rdi)
-; X64-AVX1-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm3[2,3,0,1]
-; X64-AVX1-NEXT:    vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm0[2],ymm3[3]
-; X64-AVX1-NEXT:    vblendpd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3]
+; X64-AVX1-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm3[2,3,0,1]
+; X64-AVX1-NEXT:    vblendpd {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2],ymm3[3]
+; X64-AVX1-NEXT:    vmovapd %ymm3, (%rdi)
+; X64-AVX1-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm4[2,3,0,1]
+; X64-AVX1-NEXT:    vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm0[2],ymm4[3]
+; X64-AVX1-NEXT:    vblendpd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3]
 ; X64-AVX1-NEXT:    vmovapd %ymm3, (%rsi)
 ; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
 ; X64-AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]

From 0564dd904bf7ef7758cb904ed8f7f2a1f915ef8d Mon Sep 17 00:00:00 2001
From: Tobias Hieta <tobias@plex.tv>
Date: Fri, 29 Jan 2021 08:44:56 +0100
Subject: [PATCH 035/318] [OpenMP] Fix python3 compatibility in openmp's
 lit.cfg

Differential Revision: https://reviews.llvm.org/D95669

(cherry picked from commit c3c02d0d5a313272f6d35926bdf678fc6b884c02)
---
 openmp/runtime/test/lit.cfg | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/openmp/runtime/test/lit.cfg b/openmp/runtime/test/lit.cfg
index 0d4a6107ff2b..c4e5fe1ea9e0 100644
--- a/openmp/runtime/test/lit.cfg
+++ b/openmp/runtime/test/lit.cfg
@@ -76,7 +76,7 @@ if config.operating_system == 'Darwin':
   cmd = subprocess.Popen(['xcrun', '--show-sdk-path'],
                          stdout=subprocess.PIPE, stderr=subprocess.PIPE)
   out, err = cmd.communicate()
-  out = out.strip()
+  out = out.strip().decode()
   res = cmd.wait()
   if res == 0 and out:
     config.test_flags += " -isysroot " + out

From e3658cefc5bc3538d05fc8ef058d83bcd24b785a Mon Sep 17 00:00:00 2001
From: "Kazushi (Jam) Marukawa" <marukawa@nec.com>
Date: Sat, 30 Jan 2021 12:34:06 +0900
Subject: [PATCH 036/318] [VE] Change inetger constants 32-bit friendly

Correct integer constants like `1UL << 63` to `UINT64_C(1) << 63` in
order to make them work on 32-bit machines.  Tested on both an i386
and x86_64 machines.

Reviewed By: mgorny

Differential Revision: https://reviews.llvm.org/D95724

(cherry picked from commit 4648098f97fa2a7c08c04632c70cf29293528812)
---
 llvm/lib/Target/VE/VE.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Target/VE/VE.h b/llvm/lib/Target/VE/VE.h
index a404f7ced70a..8c1fa840f19c 100644
--- a/llvm/lib/Target/VE/VE.h
+++ b/llvm/lib/Target/VE/VE.h
@@ -334,7 +334,7 @@ inline static bool isMImmVal(uint64_t Val) {
     return true;
   }
   // (m)1 patterns
-  return (Val & (1UL << 63)) && isShiftedMask_64(Val);
+  return (Val & (UINT64_C(1) << 63)) && isShiftedMask_64(Val);
 }
 
 inline static bool isMImm32Val(uint32_t Val) {
@@ -347,14 +347,14 @@ inline static bool isMImm32Val(uint32_t Val) {
     return true;
   }
   // (m)1 patterns
-  return (Val & (1 << 31)) && isShiftedMask_32(Val);
+  return (Val & (UINT32_C(1) << 31)) && isShiftedMask_32(Val);
 }
 
 /// val2MImm - Convert an integer immediate value to target MImm immediate.
 inline static uint64_t val2MImm(uint64_t Val) {
   if (Val == 0)
     return 0; // (0)1
-  if (Val & (1UL << 63))
+  if (Val & (UINT64_C(1) << 63))
     return countLeadingOnes(Val);       // (m)1
   return countLeadingZeros(Val) | 0x40; // (m)0
 }
@@ -364,8 +364,8 @@ inline static uint64_t mimm2Val(uint64_t Val) {
   if (Val == 0)
     return 0; // (0)1
   if ((Val & 0x40) == 0)
-    return (uint64_t)((1L << 63) >> (Val & 0x3f)); // (m)1
-  return ((uint64_t)(-1L) >> (Val & 0x3f));        // (m)0
+    return (uint64_t)((INT64_C(1) << 63) >> (Val & 0x3f)); // (m)1
+  return ((uint64_t)INT64_C(-1) >> (Val & 0x3f));          // (m)0
 }
 
 inline unsigned M0(unsigned Val) { return Val + 64; }

From b351efcae08a59c0cafa123a92b24c5f2300202b Mon Sep 17 00:00:00 2001
From: Nemanja Ivanovic <nemanja.i.ibm@gmail.com>
Date: Wed, 27 Jan 2021 23:08:39 -0600
Subject: [PATCH 037/318] [PowerPC] Do not emit XXSPLTI32DX for sub 64-bit
 constants

If the APInt returned by BuildVectorSDNode::isConstantSplat() is narrower than
64 bits, the result produced by XXSPLTI32DX is incorrect. The result returned
by the function appears to be incorrect and we'll investigate/fix it in a
follow-up commit. However, since this causes miscompiles, we must
temporarily disable emitting this instruction for such values.

(cherry picked from commit 54e570d94af995ff58287a8288389641910a8239)
---
 llvm/lib/Target/PowerPC/PPCISelLowering.cpp |  3 ++-
 llvm/test/CodeGen/PowerPC/p10-splatImm32.ll | 22 +++++++++++++++++++++
 2 files changed, 24 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 9215c17cb94b..663ee15db11e 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -8613,7 +8613,8 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
           PPCISD::XXSPLTI_SP_TO_DP, dl, MVT::v2f64,
           DAG.getTargetConstant(APSplatBits.getZExtValue(), dl, MVT::i32));
       return DAG.getBitcast(Op.getValueType(), SplatNode);
-    } else { // We may lose precision, so we have to use XXSPLTI32DX.
+    } else if (APSplatBits.getBitWidth() == 64) {
+      // We may lose precision, so we have to use XXSPLTI32DX.
 
       uint32_t Hi =
           (uint32_t)((APSplatBits.getZExtValue() & 0xFFFFFFFF00000000LL) >> 32);
diff --git a/llvm/test/CodeGen/PowerPC/p10-splatImm32.ll b/llvm/test/CodeGen/PowerPC/p10-splatImm32.ll
index 420a96dc1495..081cae729acf 100644
--- a/llvm/test/CodeGen/PowerPC/p10-splatImm32.ll
+++ b/llvm/test/CodeGen/PowerPC/p10-splatImm32.ll
@@ -100,3 +100,25 @@ define dso_local <8 x i16> @test_xxsplti32dx_9() {
 entry:
   ret <8 x i16> <i16 291, i16 undef, i16 undef, i16 364, i16 undef, i16 1, i16 173, i16 undef>
 }
+
+define dso_local <16 x i8> @test_xxsplti32dx_10() {
+; CHECK-LABEL: test_xxsplti32dx_10:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xxlxor vs34, vs34, vs34
+; CHECK-NEXT:    xxsplti32dx vs34, 0, 1207959552
+; CHECK-NEXT:    blr
+entry:
+  ret <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 72, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 72>
+}
+
+; FIXME: It appears that there is something wrong with the computation
+;        of the 64-bit constant to splat so we cannot emit xxsplti32dx for
+;        this test case for now.
+define dso_local <16 x i8> @constSplatBug() {
+; CHECK-LABEL: constSplatBug:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    plxv vs34, .LCPI10_0@PCREL(0), 1
+; CHECK-NEXT:    blr
+entry:
+  ret <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 71, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 71>
+}

From dfb763363bc560769605e37e96c1d13cb236223d Mon Sep 17 00:00:00 2001
From: Albion Fung <conanap@lep82435v.canlab.ibm.com>
Date: Thu, 28 Jan 2021 15:17:18 -0500
Subject: [PATCH 038/318] [PowerPC][Power10] Fix XXSPLI32DX not correctly
 exploiting specific cases

Some cases may be transformed into 32 bit splats before hitting the boolean statement, which may cause incorrect behaviour and provide XXSPLTI32DX with the incorrect values of splat. The condition was reversed so that the shortcut prevents this problem.

Differential Revision: https://reviews.llvm.org/D95634

(cherry picked from commit 2e470e03b49f1d79ebc315ca9d62a690a633c0cd)
---
 llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 11 +++++++----
 llvm/test/CodeGen/PowerPC/p10-splatImm32.ll | 16 ++--------------
 2 files changed, 9 insertions(+), 18 deletions(-)

diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 663ee15db11e..929a72ac687e 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -8604,16 +8604,19 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
 
   // If it is a splat of a double, check if we can shrink it to a 32 bit
   // non-denormal float which when converted back to double gives us the same
-  // double. This is to exploit the XXSPLTIDP instruction.+  // If we lose precision, we use XXSPLTI32DX.
+  // double. This is to exploit the XXSPLTIDP instruction.
+  // If we lose precision, we use XXSPLTI32DX.
   if (BVNIsConstantSplat && (SplatBitSize == 64) &&
       Subtarget.hasPrefixInstrs()) {
-    if (convertToNonDenormSingle(APSplatBits) &&
-        (Op->getValueType(0) == MVT::v2f64)) {
+    // Check the type first to short-circuit so we don't modify APSplatBits if
+    // this block isn't executed.
+    if ((Op->getValueType(0) == MVT::v2f64) &&
+        convertToNonDenormSingle(APSplatBits)) {
       SDValue SplatNode = DAG.getNode(
           PPCISD::XXSPLTI_SP_TO_DP, dl, MVT::v2f64,
           DAG.getTargetConstant(APSplatBits.getZExtValue(), dl, MVT::i32));
       return DAG.getBitcast(Op.getValueType(), SplatNode);
-    } else if (APSplatBits.getBitWidth() == 64) {
+    } else {
       // We may lose precision, so we have to use XXSPLTI32DX.
 
       uint32_t Hi =
diff --git a/llvm/test/CodeGen/PowerPC/p10-splatImm32.ll b/llvm/test/CodeGen/PowerPC/p10-splatImm32.ll
index 081cae729acf..ce4c2da24b0d 100644
--- a/llvm/test/CodeGen/PowerPC/p10-splatImm32.ll
+++ b/llvm/test/CodeGen/PowerPC/p10-splatImm32.ll
@@ -101,23 +101,11 @@ entry:
   ret <8 x i16> <i16 291, i16 undef, i16 undef, i16 364, i16 undef, i16 1, i16 173, i16 undef>
 }
 
-define dso_local <16 x i8> @test_xxsplti32dx_10() {
-; CHECK-LABEL: test_xxsplti32dx_10:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    xxlxor vs34, vs34, vs34
-; CHECK-NEXT:    xxsplti32dx vs34, 0, 1207959552
-; CHECK-NEXT:    blr
-entry:
-  ret <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 72, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 72>
-}
-
-; FIXME: It appears that there is something wrong with the computation
-;        of the 64-bit constant to splat so we cannot emit xxsplti32dx for
-;        this test case for now.
 define dso_local <16 x i8> @constSplatBug() {
 ; CHECK-LABEL: constSplatBug:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    plxv vs34, .LCPI10_0@PCREL(0), 1
+; CHECK-NEXT:    xxlxor vs34, vs34, vs34
+; CHECK-NEXT:    xxsplti32dx vs34, 0, 1191182336
 ; CHECK-NEXT:    blr
 entry:
   ret <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 71, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 71>

From 237b39a02f38b4903f39fef362d0f5e98e1de194 Mon Sep 17 00:00:00 2001
From: Hsiangkai Wang <kai.wang@sifive.com>
Date: Fri, 29 Jan 2021 21:59:49 +0800
Subject: [PATCH 039/318] [RISCV] Update the version number to v0.10 for
 vector.

v0.10 is tagged in V specification. Update the version to v0.10.

Differential Revision: https://reviews.llvm.org/D95680

(cherry picked from commit 282aca10aeb03bdaef0a8d4f3faa4c2ff236e527)
---
 clang/lib/Basic/Targets/RISCV.cpp              |  6 +++---
 clang/lib/Driver/ToolChains/Arch/RISCV.cpp     |  2 +-
 clang/test/Driver/riscv-arch.c                 |  6 +++---
 .../test/Preprocessor/riscv-target-features.c  | 18 +++++++++---------
 .../Target/RISCV/AsmParser/RISCVAsmParser.cpp  |  6 +++---
 .../RISCV/MCTargetDesc/RISCVTargetStreamer.cpp |  6 +++---
 llvm/lib/Target/RISCV/RISCVInstrInfoV.td       |  2 +-
 llvm/test/CodeGen/RISCV/attributes.ll          |  8 ++++----
 llvm/test/MC/RISCV/attribute-arch.s            |  8 ++++----
 9 files changed, 31 insertions(+), 31 deletions(-)

diff --git a/clang/lib/Basic/Targets/RISCV.cpp b/clang/lib/Basic/Targets/RISCV.cpp
index 0bf02e605740..786201ea340d 100644
--- a/clang/lib/Basic/Targets/RISCV.cpp
+++ b/clang/lib/Basic/Targets/RISCV.cpp
@@ -150,7 +150,7 @@ void RISCVTargetInfo::getTargetDefines(const LangOptions &Opts,
   }
 
   if (HasV) {
-    Builder.defineMacro("__riscv_v", "1000000");
+    Builder.defineMacro("__riscv_v", "10000");
     Builder.defineMacro("__riscv_vector");
   }
 
@@ -191,10 +191,10 @@ void RISCVTargetInfo::getTargetDefines(const LangOptions &Opts,
     Builder.defineMacro("__riscv_zfh", "1000");
 
   if (HasZvamo)
-    Builder.defineMacro("__riscv_zvamo", "1000000");
+    Builder.defineMacro("__riscv_zvamo", "10000");
 
   if (HasZvlsseg)
-    Builder.defineMacro("__riscv_zvlsseg", "1000000");
+    Builder.defineMacro("__riscv_zvlsseg", "10000");
 }
 
 /// Return true if has this feature, need to sync with handleTargetFeatures.
diff --git a/clang/lib/Driver/ToolChains/Arch/RISCV.cpp b/clang/lib/Driver/ToolChains/Arch/RISCV.cpp
index ffae47e5672e..c7f2a3ea5e02 100644
--- a/clang/lib/Driver/ToolChains/Arch/RISCV.cpp
+++ b/clang/lib/Driver/ToolChains/Arch/RISCV.cpp
@@ -63,7 +63,7 @@ isExperimentalExtension(StringRef Ext) {
       Ext == "zbr" || Ext == "zbs" || Ext == "zbt" || Ext == "zbproposedc")
     return RISCVExtensionVersion{"0", "93"};
   if (Ext == "v" || Ext == "zvamo" || Ext == "zvlsseg")
-    return RISCVExtensionVersion{"1", "0"};
+    return RISCVExtensionVersion{"0", "10"};
   if (Ext == "zfh")
     return RISCVExtensionVersion{"0", "1"};
   return None;
diff --git a/clang/test/Driver/riscv-arch.c b/clang/test/Driver/riscv-arch.c
index 3762a4aef1b3..cf148ca885d0 100644
--- a/clang/test/Driver/riscv-arch.c
+++ b/clang/test/Driver/riscv-arch.c
@@ -384,7 +384,7 @@
 // RV32-EXPERIMENTAL-V-BADVERS: error: invalid arch name 'rv32iv0p1'
 // RV32-EXPERIMENTAL-V-BADVERS: unsupported version number 0.1 for experimental extension
 
-// RUN: %clang -target riscv32-unknown-elf -march=rv32iv1p0 -menable-experimental-extensions -### %s -c 2>&1 | \
+// RUN: %clang -target riscv32-unknown-elf -march=rv32iv0p10 -menable-experimental-extensions -### %s -c 2>&1 | \
 // RUN:   FileCheck -check-prefix=RV32-EXPERIMENTAL-V-GOODVERS %s
 // RV32-EXPERIMENTAL-V-GOODVERS: "-target-feature" "+experimental-v"
 
@@ -412,7 +412,7 @@
 // RV32-EXPERIMENTAL-ZVAMO-BADVERS: error: invalid arch name 'rv32izvamo0p1'
 // RV32-EXPERIMENTAL-ZVAMO-BADVERS: unsupported version number 0.1 for experimental extension
 
-// RUN: %clang -target riscv32-unknown-elf -march=rv32izvamo1p0 -menable-experimental-extensions -### %s -c 2>&1 | \
+// RUN: %clang -target riscv32-unknown-elf -march=rv32izvamo0p10 -menable-experimental-extensions -### %s -c 2>&1 | \
 // RUN:   FileCheck -check-prefix=RV32-EXPERIMENTAL-ZVAMO-GOODVERS %s
 // RV32-EXPERIMENTAL-ZVAMO-GOODVERS: "-target-feature" "+experimental-zvamo"
 
@@ -431,6 +431,6 @@
 // RV32-EXPERIMENTAL-ZVLSSEG-BADVERS: error: invalid arch name 'rv32izvlsseg0p1'
 // RV32-EXPERIMENTAL-ZVLSSEG-BADVERS: unsupported version number 0.1 for experimental extension
 
-// RUN: %clang -target riscv32-unknown-elf -march=rv32izvlsseg1p0 -menable-experimental-extensions -### %s -c 2>&1 | \
+// RUN: %clang -target riscv32-unknown-elf -march=rv32izvlsseg0p10 -menable-experimental-extensions -### %s -c 2>&1 | \
 // RUN:   FileCheck -check-prefix=RV32-EXPERIMENTAL-ZVLSSEG-GOODVERS %s
 // RV32-EXPERIMENTAL-ZVLSSEG-GOODVERS: "-target-feature" "+experimental-zvlsseg"
diff --git a/clang/test/Preprocessor/riscv-target-features.c b/clang/test/Preprocessor/riscv-target-features.c
index 006395505246..88826bbd60b8 100644
--- a/clang/test/Preprocessor/riscv-target-features.c
+++ b/clang/test/Preprocessor/riscv-target-features.c
@@ -110,23 +110,23 @@
 // CHECK-DOUBLE-NOT: __riscv_float_abi_single
 
 // RUN: %clang -target riscv32-unknown-linux-gnu -menable-experimental-extensions \
-// RUN:   -march=rv32iv1p0 -x c -E -dM %s \
+// RUN:   -march=rv32iv0p10 -x c -E -dM %s \
 // RUN:   -o - | FileCheck --check-prefix=CHECK-V-EXT %s
 // RUN: %clang -target riscv64-unknown-linux-gnu -menable-experimental-extensions \
-// RUN:   -march=rv64iv1p0 -x c -E -dM %s \
+// RUN:   -march=rv64iv0p10 -x c -E -dM %s \
 // RUN:   -o - | FileCheck --check-prefix=CHECK-V-EXT %s
-// RUN: %clang -target riscv32-unknown-linux-gnu -menable-experimental-extensions -march=rv32izvamo1p0 -x c -E -dM %s \
+// RUN: %clang -target riscv32-unknown-linux-gnu -menable-experimental-extensions -march=rv32izvamo0p10 -x c -E -dM %s \
 // RUN: -o - | FileCheck --check-prefix=CHECK-V-EXT %s
-// RUN: %clang -target riscv64-unknown-linux-gnu -menable-experimental-extensions -march=rv32izvamo1p0 -x c -E -dM %s \
+// RUN: %clang -target riscv64-unknown-linux-gnu -menable-experimental-extensions -march=rv32izvamo0p10 -x c -E -dM %s \
 // RUN: -o - | FileCheck --check-prefix=CHECK-V-EXT %s
-// RUN: %clang -target riscv32-unknown-linux-gnu -menable-experimental-extensions -march=rv32izvlsseg1p0 -x c -E -dM %s \
+// RUN: %clang -target riscv32-unknown-linux-gnu -menable-experimental-extensions -march=rv32izvlsseg0p10 -x c -E -dM %s \
 // RUN: -o - | FileCheck --check-prefix=CHECK-V-EXT %s
-// RUN: %clang -target riscv64-unknown-linux-gnu -menable-experimental-extensions -march=rv32izvlsseg1p0 -x c -E -dM %s \
+// RUN: %clang -target riscv64-unknown-linux-gnu -menable-experimental-extensions -march=rv32izvlsseg0p10 -x c -E -dM %s \
 // RUN: -o - | FileCheck --check-prefix=CHECK-V-EXT %s
-// CHECK-V-EXT: __riscv_v 1000000
+// CHECK-V-EXT: __riscv_v 10000
 // CHECK-V-EXT: __riscv_vector 1
-// CHECK-V-EXT: __riscv_zvamo 1000000
-// CHECK-V-EXT: __riscv_zvlsseg 1000000
+// CHECK-V-EXT: __riscv_zvamo 10000
+// CHECK-V-EXT: __riscv_zvlsseg 10000
 
 // RUN: %clang -target riscv32-unknown-linux-gnu -menable-experimental-extensions -march=rv32izba0p93 -x c -E -dM %s \
 // RUN: -o - | FileCheck --check-prefix=CHECK-ZBA-EXT %s
diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
index e7e590153605..dcf7525d7458 100644
--- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
+++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
@@ -2126,7 +2126,7 @@ bool RISCVAsmParser::parseDirectiveAttribute() {
       if (getFeatureBits(RISCV::FeatureStdExtB))
         formalArchStr = (Twine(formalArchStr) + "_b0p93").str();
       if (getFeatureBits(RISCV::FeatureStdExtV))
-        formalArchStr = (Twine(formalArchStr) + "_v1p0").str();
+        formalArchStr = (Twine(formalArchStr) + "_v0p10").str();
       if (getFeatureBits(RISCV::FeatureExtZfh))
         formalArchStr = (Twine(formalArchStr) + "_zfh0p1").str();
       if (getFeatureBits(RISCV::FeatureExtZba))
@@ -2152,9 +2152,9 @@ bool RISCVAsmParser::parseDirectiveAttribute() {
       if (getFeatureBits(RISCV::FeatureExtZbt))
         formalArchStr = (Twine(formalArchStr) + "_zbt0p93").str();
       if (getFeatureBits(RISCV::FeatureExtZvamo))
-        formalArchStr = (Twine(formalArchStr) + "_zvamo1p0").str();
+        formalArchStr = (Twine(formalArchStr) + "_zvamo0p10").str();
       if (getFeatureBits(RISCV::FeatureStdExtZvlsseg))
-        formalArchStr = (Twine(formalArchStr) + "_zvlsseg1p0").str();
+        formalArchStr = (Twine(formalArchStr) + "_zvlsseg0p10").str();
 
       getTargetStreamer().emitTextAttribute(Tag, formalArchStr);
     }
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.cpp
index 72434a15bedb..13c4b84aa300 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.cpp
@@ -63,7 +63,7 @@ void RISCVTargetStreamer::emitTargetAttributes(const MCSubtargetInfo &STI) {
   if (STI.hasFeature(RISCV::FeatureStdExtB))
     Arch += "_b0p93";
   if (STI.hasFeature(RISCV::FeatureStdExtV))
-    Arch += "_v1p0";
+    Arch += "_v0p10";
   if (STI.hasFeature(RISCV::FeatureExtZfh))
     Arch += "_zfh0p1";
   if (STI.hasFeature(RISCV::FeatureExtZba))
@@ -89,9 +89,9 @@ void RISCVTargetStreamer::emitTargetAttributes(const MCSubtargetInfo &STI) {
   if (STI.hasFeature(RISCV::FeatureExtZbt))
     Arch += "_zbt0p93";
   if (STI.hasFeature(RISCV::FeatureExtZvamo))
-    Arch += "_zvamo1p0";
+    Arch += "_zvamo0p10";
   if (STI.hasFeature(RISCV::FeatureStdExtZvlsseg))
-    Arch += "_zvlsseg1p0";
+    Arch += "_zvlsseg0p10";
 
   emitTextAttribute(RISCVAttrs::ARCH, Arch);
 }
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoV.td b/llvm/lib/Target/RISCV/RISCVInstrInfoV.td
index 4f9e9cfbdb98..e02c9f8bcbe2 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoV.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoV.td
@@ -7,7 +7,7 @@
 //===----------------------------------------------------------------------===//
 ///
 /// This file describes the RISC-V instructions from the standard 'V' Vector
-/// extension, version 0.9.
+/// extension, version 0.10.
 /// This version is still experimental as the 'V' extension hasn't been
 /// ratified yet.
 ///
diff --git a/llvm/test/CodeGen/RISCV/attributes.ll b/llvm/test/CodeGen/RISCV/attributes.ll
index a0943d5d4293..c26a6d5b4a69 100644
--- a/llvm/test/CodeGen/RISCV/attributes.ll
+++ b/llvm/test/CodeGen/RISCV/attributes.ll
@@ -47,7 +47,7 @@
 ; RV32D: .attribute 5, "rv32i2p0_f2p0_d2p0"
 ; RV32C: .attribute 5, "rv32i2p0_c2p0"
 ; RV32B: .attribute 5, "rv32i2p0_b0p93_zba0p93_zbb0p93_zbc0p93_zbe0p93_zbf0p93_zbm0p93_zbp0p93_zbr0p93_zbs0p93_zbt0p93"
-; RV32V: .attribute 5, "rv32i2p0_v1p0_zvamo1p0_zvlsseg1p0"
+; RV32V: .attribute 5, "rv32i2p0_v0p10_zvamo0p10_zvlsseg0p10"
 ; RV32ZFH: .attribute 5, "rv32i2p0_f2p0_zfh0p1"
 ; RV32ZBA: .attribute 5, "rv32i2p0_zba0p93"
 ; RV32ZBB: .attribute 5, "rv32i2p0_zbb0p93"
@@ -60,7 +60,7 @@
 ; RV32ZBR: .attribute 5, "rv32i2p0_zbr0p93"
 ; RV32ZBS: .attribute 5, "rv32i2p0_zbs0p93"
 ; RV32ZBT: .attribute 5, "rv32i2p0_zbt0p93"
-; RV32COMBINED: .attribute 5, "rv32i2p0_f2p0_v1p0_zfh0p1_zbb0p93_zvamo1p0_zvlsseg1p0"
+; RV32COMBINED: .attribute 5, "rv32i2p0_f2p0_v0p10_zfh0p1_zbb0p93_zvamo0p10_zvlsseg0p10"
 
 ; RV64M: .attribute 5, "rv64i2p0_m2p0"
 ; RV64A: .attribute 5, "rv64i2p0_a2p0"
@@ -80,8 +80,8 @@
 ; RV64ZBR: .attribute 5, "rv64i2p0_zbr0p93"
 ; RV64ZBS: .attribute 5, "rv64i2p0_zbs0p93"
 ; RV64ZBT: .attribute 5, "rv64i2p0_zbt0p93"
-; RV64V: .attribute 5, "rv64i2p0_v1p0_zvamo1p0_zvlsseg1p0"
-; RV64COMBINED: .attribute 5, "rv64i2p0_f2p0_v1p0_zfh0p1_zbb0p93_zvamo1p0_zvlsseg1p0"
+; RV64V: .attribute 5, "rv64i2p0_v0p10_zvamo0p10_zvlsseg0p10"
+; RV64COMBINED: .attribute 5, "rv64i2p0_f2p0_v0p10_zfh0p1_zbb0p93_zvamo0p10_zvlsseg0p10"
 
 
 define i32 @addi(i32 %a) {
diff --git a/llvm/test/MC/RISCV/attribute-arch.s b/llvm/test/MC/RISCV/attribute-arch.s
index 66d7ad576382..51d0c6ace9e1 100644
--- a/llvm/test/MC/RISCV/attribute-arch.s
+++ b/llvm/test/MC/RISCV/attribute-arch.s
@@ -40,7 +40,7 @@
 # CHECK: attribute      5, "rv32i2p0_b0p93_zba0p93_zbb0p93_zbc0p93_zbe0p93_zbf0p93_zbm0p93_zbp0p93_zbr0p93_zbs0p93_zbt0p93"
 
 .attribute arch, "rv32iv"
-# CHECK: attribute      5, "rv32i2p0_v1p0"
+# CHECK: attribute      5, "rv32i2p0_v0p10"
 
 .attribute arch, "rv32izba"
 # CHECK: attribute      5, "rv32i2p0_zba0p93"
@@ -79,7 +79,7 @@
 # CHECK: attribute      5, "rv32i2p0_f2p0_zfh0p1"
 
 .attribute arch, "rv32ivzvamo_zvlsseg"
-# CHECK: attribute      5, "rv32i2p0_v1p0_zvamo1p0_zvlsseg1p0"
+# CHECK: attribute      5, "rv32i2p0_v0p10_zvamo0p10_zvlsseg0p10"
 
-.attribute arch, "rv32iv_zvamo1p0_zvlsseg"
-# CHECK: attribute      5, "rv32i2p0_v1p0_zvamo1p0_zvlsseg1p0"
+.attribute arch, "rv32iv_zvamo0p10_zvlsseg"
+# CHECK: attribute      5, "rv32i2p0_v0p10_zvamo0p10_zvlsseg0p10"

From c738c8aa9bf387cc960feca81bc5263e8c634e15 Mon Sep 17 00:00:00 2001
From: Hsiangkai Wang <kai.wang@sifive.com>
Date: Sat, 30 Jan 2021 07:54:41 +0800
Subject: [PATCH 040/318] [RISCV]  Update the version number to v0.10 for
 vector.

(cherry picked from commit 9847023660467a4469b5667bcf7a4c73a4780037)
---
 llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td    | 2 +-
 llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
index 06e4d053d5d7..9fdfc2727d86 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
@@ -7,7 +7,7 @@
 //===----------------------------------------------------------------------===//
 ///
 /// This file contains the required infrastructure to support code generation
-/// for the standard 'V' (Vector) extension, version 0.9.  This version is still
+/// for the standard 'V' (Vector) extension, version 0.10.  This version is still
 /// experimental as the 'V' extension hasn't been ratified yet.
 ///
 /// This file is included from RISCVInstrInfoV.td
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td
index aea3d0e17ccc..79a1e6ddc8a2 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td
@@ -8,7 +8,7 @@
 ///
 /// This file contains the required infrastructure and SDNode patterns to
 /// support code generation for the standard 'V' (Vector) extension, version
-/// 0.9.  This version is still experimental as the 'V' extension hasn't been
+/// 0.10.  This version is still experimental as the 'V' extension hasn't been
 /// ratified yet.
 ///
 /// This file is included from and depends upon RISCVInstrInfoVPseudos.td

From c5904f5c9d32e563e2898e1242d5818e488fe2ee Mon Sep 17 00:00:00 2001
From: Cullen Rhodes <cullen.rhodes@arm.com>
Date: Sat, 16 Jan 2021 16:08:40 +0000
Subject: [PATCH 041/318] [LV] Fix crash when computing max VF too early

D90687 introduced a crash:

  llvm::LoopVectorizationCostModel::computeMaxVF(llvm::ElementCount, unsigned int):
    Assertion `WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
    "No decisions should have been taken at this point"' failed.

when compiling the following C code:

  typedef struct {
  char a;
  } b;

  b *c;
  int d, e;

  int f() {
    int g = 0;
    for (; d; d++) {
      e = 0;
      for (; e < c[d].a; e++)
        g++;
    }
    return g;
  }

with:

  clang -Os -target hexagon -mhvx -fvectorize -mv67 testcase.c -S -o -

This occurred since prior to D90687 computeFeasibleMaxVF would only be
called in computeMaxVF when a scalar epilogue was allowed, but now it's
always called. This causes the assert above since computeFeasibleMaxVF
collects all viable VFs larger than the default MaxVF, and for each VF
calculates the register usage which results in analysis being done the
assert above guards against. This can occur in computeFeasibleMaxVF if
TTI.shouldMaximizeVectorBandwidth and this target hook is implemented in
the hexagon backend to always return true.

Reported by @iajbar.

Reviewed By: fhahn

Differential Revision: https://reviews.llvm.org/D94869

(cherry picked from commit 8cda227432f1c9ceb63b88802ed8136da97274f1)
---
 .../Transforms/Vectorize/LoopVectorize.cpp    |  7 ++---
 .../LoopVectorize/Hexagon/maximum-vf-crash.ll | 29 +++++++++++++++++++
 2 files changed, 32 insertions(+), 4 deletions(-)
 create mode 100644 llvm/test/Transforms/LoopVectorize/Hexagon/maximum-vf-crash.ll

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index ea0d7673edf6..47635dbdda02 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5504,11 +5504,9 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
     return None;
   }
 
-  ElementCount MaxVF = computeFeasibleMaxVF(TC, UserVF);
-
   switch (ScalarEpilogueStatus) {
   case CM_ScalarEpilogueAllowed:
-    return MaxVF;
+    return computeFeasibleMaxVF(TC, UserVF);
   case CM_ScalarEpilogueNotAllowedUsePredicate:
     LLVM_FALLTHROUGH;
   case CM_ScalarEpilogueNotNeededUsePredicate:
@@ -5546,7 +5544,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
       LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
                            "scalar epilogue instead.\n");
       ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
-      return MaxVF;
+      return computeFeasibleMaxVF(TC, UserVF);
     }
     return None;
   }
@@ -5563,6 +5561,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
     InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
   }
 
+  ElementCount MaxVF = computeFeasibleMaxVF(TC, UserVF);
   assert(!MaxVF.isScalable() &&
          "Scalable vectors do not yet support tail folding");
   assert((UserVF.isNonZero() || isPowerOf2_32(MaxVF.getFixedValue())) &&
diff --git a/llvm/test/Transforms/LoopVectorize/Hexagon/maximum-vf-crash.ll b/llvm/test/Transforms/LoopVectorize/Hexagon/maximum-vf-crash.ll
new file mode 100644
index 000000000000..5f8c5d329edf
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/Hexagon/maximum-vf-crash.ll
@@ -0,0 +1,29 @@
+; RUN: opt -march=hexagon -hexagon-autohvx -loop-vectorize -S < %s 2>&1 | FileCheck %s
+
+; Check that we don't crash.
+
+; CHECK-LABEL: @f
+; CHECK: vector.body
+
+target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048"
+target triple = "hexagon"
+
+; Function Attrs: optsize
+define i32 @f() #0 {
+entry:
+  br label %loop
+
+loop:
+  %g.016 = phi i32 [ 0, %entry ], [ %g.1.lcssa, %loop ]
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
+  %0 = load i8, i8* undef, align 1
+  %g.1.lcssa = add i32 %g.016, undef
+  %iv.next = add nsw i32 %iv, 1
+  %exitcond = icmp eq i32 %iv.next, 0
+  br i1 %exitcond, label %exit, label %loop
+
+exit:
+  ret i32 %g.1.lcssa
+}
+
+attributes #0 = { optsize "target-features"="+hvx-length128b" }

From b15f3fc5c71dc8a9db7e931e2922a065293e4a64 Mon Sep 17 00:00:00 2001
From: Andrew Ng <andrew.ng@sony.com>
Date: Wed, 27 Jan 2021 16:47:21 +0000
Subject: [PATCH 042/318] [X86] Fix disassembly of x86-64 GDTLS code sequence

For x86-64 the REX.w prefix takes precedence over any other size
override (i.e. 0x66). Therefore, for x86-64 when REX.w is present set
'hasOpSize' to false to ensure that any size override is ignored.

Fixes PR48901.

Differential Revision: https://reviews.llvm.org/D95682

(cherry picked from commit 94fedd266125a5425aa33e11332bf414f0b6dc35)
---
 .../X86/Disassembler/X86Disassembler.cpp      |  1 +
 llvm/test/MC/Disassembler/X86/x86-64.txt      | 10 +++++++---
 .../llvm-objdump/X86/disassemble-gdtls.s      | 19 +++++++++++++++++++
 3 files changed, 27 insertions(+), 3 deletions(-)
 create mode 100644 llvm/test/tools/llvm-objdump/X86/disassemble-gdtls.s

diff --git a/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp b/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp
index 05e482a6b66e..4e6d8e8e1a54 100644
--- a/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp
+++ b/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp
@@ -492,6 +492,7 @@ static int readPrefixes(struct InternalInstruction *insn) {
       insn->addressSize = (insn->hasAdSize ? 4 : 8);
       insn->displacementSize = 4;
       insn->immediateSize = 4;
+      insn->hasOpSize = false;
     } else {
       insn->registerSize = (insn->hasOpSize ? 2 : 4);
       insn->addressSize = (insn->hasAdSize ? 4 : 8);
diff --git a/llvm/test/MC/Disassembler/X86/x86-64.txt b/llvm/test/MC/Disassembler/X86/x86-64.txt
index d91ef2500d99..5e56d4c796e6 100644
--- a/llvm/test/MC/Disassembler/X86/x86-64.txt
+++ b/llvm/test/MC/Disassembler/X86/x86-64.txt
@@ -329,8 +329,10 @@
 # CHECK: callw 32767
 0x66 0xe8 0xff 0x7f
 
-# CHECK: callw 32767
-0x66 0x66 0x48 0xe8 0xff 0x7f
+# TODO: Should display data16 prefixes.
+# CHECK-NOT: data16
+# CHECK: callq 32767
+0x66 0x66 0x48 0xe8 0xff 0x7f 0x00 0x00
 
 # CHECK: jmp -32769
 0xe9 0xff 0x7f 0xff 0xff
@@ -338,8 +340,10 @@
 # CHECK: jmp 32767
 0x66 0xe9 0xff 0x7f
 
+# TODO: Should display data16 prefixes.
+# CHECK-NOT: data16
 # CHECK: jmp 32767
-0x66 0x66 0x48 0xe9 0xff 0x7f
+0x66 0x66 0x48 0xe9 0xff 0x7f 0x00 0x00
 
 # CHECK: jo -32769
 0x0f 0x80 0xff 0x7f 0xff 0xff
diff --git a/llvm/test/tools/llvm-objdump/X86/disassemble-gdtls.s b/llvm/test/tools/llvm-objdump/X86/disassemble-gdtls.s
new file mode 100644
index 000000000000..e913f5f6a345
--- /dev/null
+++ b/llvm/test/tools/llvm-objdump/X86/disassemble-gdtls.s
@@ -0,0 +1,19 @@
+# RUN: llvm-mc %s -filetype=obj -triple=x86_64 | llvm-objdump -d - | FileCheck %s
+
+# CHECK:      <PR48901>:
+# TODO: Should display data16 prefixes.
+# CHECK-NEXT: 0: 66 48 8d 3d 00 00 00 00       leaq    (%rip), %rdi  # 8 <PR48901+0x8>
+# CHECK-NEXT: 8: 66 66 48 e8 00 00 00 00       callq   0x10 <PR48901+0x10>
+# CHECK-EMPTY:
+
+PR48901:
+ data16
+ leaq   bar@TLSGD(%rip),%rdi
+ data16
+ data16
+ rex64
+ callq  __tls_get_addr@PLT
+
+.section .tdata,"awT",@progbits
+bar:
+.long 42

From e2d822c3bdf6388c6ef21f35745105aba064d16d Mon Sep 17 00:00:00 2001
From: Haowei Wu <haowei@google.com>
Date: Thu, 28 Jan 2021 14:13:20 -0800
Subject: [PATCH 043/318] [elfabi] Fix tests which failed on different
 timezones

This patch fixes elfabi tests on machines using a GMT+X timezone
settings.

Differential Revision: https://reviews.llvm.org/D95641

(cherry picked from commit 771b35965457ebd5faaed8a1c3d2bcefffe721a3)
---
 llvm/test/tools/llvm-elfabi/preserve-dates-stub.test | 4 ++--
 llvm/test/tools/llvm-elfabi/preserve-dates-tbe.test  | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/llvm/test/tools/llvm-elfabi/preserve-dates-stub.test b/llvm/test/tools/llvm-elfabi/preserve-dates-stub.test
index c399029e0337..9742a61aa281 100644
--- a/llvm/test/tools/llvm-elfabi/preserve-dates-stub.test
+++ b/llvm/test/tools/llvm-elfabi/preserve-dates-stub.test
@@ -1,9 +1,9 @@
 ## Test writing unchanged content to ELF Stub file with --write-if-changed flag.
 
 # RUN: llvm-elfabi %s --output-target=elf64-little %t
-# RUN: touch -m -t 197001010000 %t
+# RUN: env TZ=GMT touch -m -t 197001010000 %t
 # RUN: llvm-elfabi %s --output-target=elf64-little %t --write-if-changed
-# RUN: ls -l %t | FileCheck %s
+# RUN: env TZ=GMT ls -l %t | FileCheck %s
 
 --- !tapi-tbe
 TbeVersion: 1.0
diff --git a/llvm/test/tools/llvm-elfabi/preserve-dates-tbe.test b/llvm/test/tools/llvm-elfabi/preserve-dates-tbe.test
index 89cad7733eee..3ec190067c73 100644
--- a/llvm/test/tools/llvm-elfabi/preserve-dates-tbe.test
+++ b/llvm/test/tools/llvm-elfabi/preserve-dates-tbe.test
@@ -1,8 +1,8 @@
 ## Test writing unchanged content to TBE file with --write-if-changed flag.
 
 # RUN: llvm-elfabi --elf %p/Inputs/gnu_hash.so --emit-tbe=%t
-# RUN: touch -m -t 197001010000 %t
+# RUN: env TZ=GMT touch -m -t 197001010000 %t
 # RUN: llvm-elfabi --elf %p/Inputs/gnu_hash.so --emit-tbe=%t --write-if-changed
-# RUN: ls -l %t | FileCheck %s
+# RUN: env TZ=GMT ls -l %t | FileCheck %s
 
 # CHECK: {{[[:space:]]1970}}

From 12b6579b79dc21e9e54e74520ece0d571a640d4b Mon Sep 17 00:00:00 2001
From: Atmn Patel <atmndp@gmail.com>
Date: Wed, 27 Jan 2021 18:49:41 -0500
Subject: [PATCH 044/318] [OpenMP][Libomptarget] Fix conditional in CMake for
 remote plugin

The remote offloading plugin's CMakeLists was trying to build if its
flag was enabled even if it didn't find gRPC/protobuf. The conditional
was wrong, it's fixed by this.

Differential Revision: https://reviews.llvm.org/D95574

(cherry picked from commit 8a77056256d9970387595a5c729d894e3fe07131)
---
 openmp/libomptarget/plugins/remote/CMakeLists.txt | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/openmp/libomptarget/plugins/remote/CMakeLists.txt b/openmp/libomptarget/plugins/remote/CMakeLists.txt
index 1baa1125f44c..989c74642c66 100644
--- a/openmp/libomptarget/plugins/remote/CMakeLists.txt
+++ b/openmp/libomptarget/plugins/remote/CMakeLists.txt
@@ -42,12 +42,13 @@ if (Protobuf_FOUND AND gRPC_FOUND AND PROTOC AND GRPC_CPP_PLUGIN)
   set(GRPC_INCLUDE_DIR
       ${directory}
   )
+
+  set(RPC_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/include/)
+  set(RPC_SRC_DIR ${CMAKE_CURRENT_SOURCE_DIR}/lib/)
+  
+  add_subdirectory(src)
+  add_subdirectory(server)
 else()
   libomptarget_say("Not building remote offloading plugin: required libraries were not found.")
 endif()
 
-set(RPC_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/include/)
-set(RPC_SRC_DIR ${CMAKE_CURRENT_SOURCE_DIR}/lib/)
-
-add_subdirectory(src)
-add_subdirectory(server)

From 4d0874c72a0a3f53eb3084a1ea3ee4456ab6e004 Mon Sep 17 00:00:00 2001
From: Shilei Tian <tianshilei1992@gmail.com>
Date: Thu, 28 Jan 2021 08:13:28 -0500
Subject: [PATCH 045/318] [OpenMP][NVPTX] Added the missing -O1 when building
 NVPTX bitcode libraries

In the past `-O1` was used when building NVPTX bitcode libraries. After
we switched to OpenMP, `-O1` was missing by mistake, leading to a huge performance
regression.

Reviewed By: JonChesterfield

Differential Revision: https://reviews.llvm.org/D95545

(cherry picked from commit 5a64794bbad4010778406dfee7748e6080258dbf)
---
 .../libomptarget/deviceRTLs/nvptx/CMakeLists.txt | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt b/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt
index 23efbba29d66..eeda137ef120 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt
+++ b/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt
@@ -126,14 +126,14 @@ set(cuda_src_files
 )
 
 # Set flags for LLVM Bitcode compilation.
-set(bc_flags -S -x c++
-              -target nvptx64
-              -Xclang -emit-llvm-bc
-              -Xclang -aux-triple -Xclang ${aux_triple}
-              -fopenmp -fopenmp-cuda-mode -Xclang -fopenmp-is-device
-              -D__CUDACC__
-              -I${devicertl_base_directory}
-              -I${devicertl_nvptx_directory}/src)
+set(bc_flags -S -x c++ -O1 -std=c++14
+             -target nvptx64
+             -Xclang -emit-llvm-bc
+             -Xclang -aux-triple -Xclang ${aux_triple}
+             -fopenmp -fopenmp-cuda-mode -Xclang -fopenmp-is-device
+             -D__CUDACC__
+             -I${devicertl_base_directory}
+             -I${devicertl_nvptx_directory}/src)
 
 if(${LIBOMPTARGET_NVPTX_DEBUG})
   list(APPEND bc_flags -DOMPTARGET_NVPTX_DEBUG=-1)

From 5d926bb3c46848c704833e0f02884395609388a3 Mon Sep 17 00:00:00 2001
From: Shilei Tian <tianshilei1992@gmail.com>
Date: Thu, 28 Jan 2021 08:12:39 -0500
Subject: [PATCH 046/318] [OpenMP][deviceRTLs] Added
 `[[clang::loader_uninitialized]]` explicitly

`[[clang::loader_uninitialized]]` is in macro `SHARED` but it doesn't
work for array like `parallelLevel`, so the variable will be zero initialized.
There is also a similar issue for `omptarget_nvptx_device_State` which is in
global address space. Its c'tor is also generated, which was not in the past when
building the `deviceRTLs` with CUDA. In this patch, we added the attribute to
the two variables explicitly.

Reviewed By: jdoerfert

Differential Revision: https://reviews.llvm.org/D95550

(cherry picked from commit 19248d30e4ed5250fa84abbbd52fc7b835918a45)
---
 openmp/libomptarget/deviceRTLs/common/src/omp_data.cu | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/openmp/libomptarget/deviceRTLs/common/src/omp_data.cu b/openmp/libomptarget/deviceRTLs/common/src/omp_data.cu
index b91afd7476fe..4736d07108e0 100644
--- a/openmp/libomptarget/deviceRTLs/common/src/omp_data.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/omp_data.cu
@@ -25,7 +25,8 @@ DEVICE omptarget_device_environmentTy omptarget_device_environment;
 // global data holding OpenMP state information
 ////////////////////////////////////////////////////////////////////////////////
 
-DEVICE
+// OpenMP will try to call its ctor if we don't add the attribute explicitly
+[[clang::loader_uninitialized]] DEVICE
     omptarget_nvptx_Queue<omptarget_nvptx_ThreadPrivateContext, OMP_STATE_COUNT>
         omptarget_nvptx_device_State[MAX_SM];
 
@@ -33,7 +34,9 @@ DEVICE omptarget_nvptx_SimpleMemoryManager omptarget_nvptx_simpleMemoryManager;
 DEVICE uint32_t SHARED(usedMemIdx);
 DEVICE uint32_t SHARED(usedSlotIdx);
 
-DEVICE uint8_t parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE];
+// SHARED doesn't work with array so we add the attribute explicitly.
+[[clang::loader_uninitialized]] DEVICE uint8_t
+    parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE];
 #pragma omp allocate(parallelLevel) allocator(omp_pteam_mem_alloc)
 DEVICE uint16_t SHARED(threadLimit);
 DEVICE uint16_t SHARED(threadsInTeam);

From 255f7398845a7cfb47aef53e40b68057ec56839e Mon Sep 17 00:00:00 2001
From: Shilei Tian <tianshilei1992@gmail.com>
Date: Fri, 29 Jan 2021 13:12:47 -0500
Subject: [PATCH 047/318] [OpenMP][NFC] Added release note for new `deviceRTLs`
 and hidden helper task

Added release note for new `deviceRTLs` and hidden helper task for LLVM
12.

Reviewed By: jdoerfert

Differential Revision: https://reviews.llvm.org/D95584

(cherry picked from commit 7bc31018f71cac22b7060c49cefb6f3d0d2e2069)
---
 openmp/docs/ReleaseNotes.rst | 26 +++++++++++++++++++++++++-
 1 file changed, 25 insertions(+), 1 deletion(-)

diff --git a/openmp/docs/ReleaseNotes.rst b/openmp/docs/ReleaseNotes.rst
index 7f40d3c81510..cb3464ad84f0 100644
--- a/openmp/docs/ReleaseNotes.rst
+++ b/openmp/docs/ReleaseNotes.rst
@@ -7,7 +7,7 @@ OpenMP 12.0.0 Release Notes
    These are in-progress notes for the upcoming LLVM 12.0.0 release.
    Release notes for previous releases can be found on
    `the Download Page <https://releases.llvm.org/download.html>`_.
-   
+
 
 Introduction
 ============
@@ -44,3 +44,27 @@ Non-comprehensive list of changes in this release
   ``LIBOMPTARGET_INFO`` allows the user to request certain information from the
   ``libomptarget`` runtime using a 32-bit field. A full description of each
   environment variable is described :ref:`here <libopenmptarget_environment_vars>`.
+
+- ``target nowait`` was supported via hidden helper task, which is a task not
+  bound to any parallel region. A hidden helper team with a number of threads is
+  created when the first hidden helper task is encountered. The number of threads
+  can be configured via the environment variable
+  ``LIBOMP_NUM_HIDDEN_HELPER_THREADS``. By default it is 8. If
+  ``LIBOMP_NUM_HIDDEN_HELPER_THREADS=0``, hidden helper task is disabled and
+  falls back to a regular OpenMP task. It can also be disabled by setting the
+  environment variable ``LIBOMP_USE_HIDDEN_HELPER_TASK=OFF``.
+
+- ``deviceRTLs`` for NVPTX platform is CUDA free now. It is generally OpenMP code.
+  Target dependent parts are implemented with Clang/LLVM/NVVM intrinsics. CUDA
+  SDK is also dropped as a dependence to build the device runtime, which means
+  device runtime can also be built on a CUDA free system. However, it is
+  disabled by default. Set the CMake variable
+  ``LIBOMPTARGET_BUILD_NVPTX_BCLIB=ON`` to enable the build of NVPTX device
+  runtime on a CUDA free system. ``gcc-multilib`` and ``g++-multilib`` are
+  required. If CUDA is found, the device runtime will be built by default.
+
+  - Static NVPTX device runtime library (``libomptarget-nvptx.a``) was dropped.
+  A bitcode library is required to build an OpenMP program. If the library is
+  not found in the default path or any of the paths defined by ``LIBRARY_PATH``,
+  an error will be raised. User can also specify the path to the bitcode device
+  library via ``--libomptarget-nvptx-bc-path=``.

From 922e4149d16754b54ce225faa3e769d32937d7ad Mon Sep 17 00:00:00 2001
From: Joseph Huber <jhuber6@vols.utk.edu>
Date: Mon, 1 Feb 2021 10:31:09 -0500
Subject: [PATCH 048/318] [OpenMP] Fix seg fault in libomptarget when using
 Info with multiple threads

Summary:
One option for the LIBOMPTARGET_INFO environment variable is to print the current status of the device's data mappings. These are a shared resource among threads so this needs to be protected when using multiple streams.

Reviewed By: jdoerfert

Differential Revision: https://reviews.llvm.org/D95786

(cherry picked from commit fda48539988d2a1bdb6395799151e9090312a20b)
---
 openmp/libomptarget/src/interface.cpp | 4 ++--
 openmp/libomptarget/src/private.h     | 4 +++-
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/openmp/libomptarget/src/interface.cpp b/openmp/libomptarget/src/interface.cpp
index cf6d36960c75..01f3715d6bcc 100644
--- a/openmp/libomptarget/src/interface.cpp
+++ b/openmp/libomptarget/src/interface.cpp
@@ -58,7 +58,7 @@ static void HandleTargetOutcome(bool success, ident_t *loc = nullptr) {
   case tgt_mandatory:
     if (!success) {
       if (getInfoLevel() & OMP_INFOTYPE_DUMP_TABLE)
-        for (const auto &Device : PM->Devices)
+        for (auto &Device : PM->Devices)
           dumpTargetPointerMappings(loc, Device);
       else
         FAILURE_MESSAGE("Run with LIBOMPTARGET_DEBUG=%d to dump host-target "
@@ -76,7 +76,7 @@ static void HandleTargetOutcome(bool success, ident_t *loc = nullptr) {
           1, "failure of target construct while offloading is mandatory");
     } else {
       if (getInfoLevel() & OMP_INFOTYPE_DUMP_TABLE)
-        for (const auto &Device : PM->Devices)
+        for (auto &Device : PM->Devices)
           dumpTargetPointerMappings(loc, Device);
     }
     break;
diff --git a/openmp/libomptarget/src/private.h b/openmp/libomptarget/src/private.h
index fb6f681d3020..3b0e57dfe15e 100644
--- a/openmp/libomptarget/src/private.h
+++ b/openmp/libomptarget/src/private.h
@@ -99,7 +99,7 @@ int __kmpc_get_target_offload(void) __attribute__((weak));
 ////////////////////////////////////////////////////////////////////////////////
 /// dump a table of all the host-target pointer pairs on failure
 static inline void dumpTargetPointerMappings(const ident_t *Loc,
-                                             const DeviceTy &Device) {
+                                             DeviceTy &Device) {
   if (Device.HostDataToTargetMap.empty())
     return;
 
@@ -109,6 +109,7 @@ static inline void dumpTargetPointerMappings(const ident_t *Loc,
        Kernel.getFilename(), Kernel.getLine(), Kernel.getColumn());
   INFO(OMP_INFOTYPE_ALL, Device.DeviceID, "%-18s %-18s %s %s %s\n", "Host Ptr",
        "Target Ptr", "Size (B)", "RefCount", "Declaration");
+  Device.DataMapMtx.lock();
   for (const auto &HostTargetMap : Device.HostDataToTargetMap) {
     SourceInfo Info(HostTargetMap.HstPtrName);
     INFO(OMP_INFOTYPE_ALL, Device.DeviceID,
@@ -118,6 +119,7 @@ static inline void dumpTargetPointerMappings(const ident_t *Loc,
          HostTargetMap.getRefCount(), Info.getName(), Info.getFilename(),
          Info.getLine(), Info.getColumn());
   }
+  Device.DataMapMtx.unlock();
 }
 
 ////////////////////////////////////////////////////////////////////////////////

From 678c259d277135ef32861887a8ac8618deba5f24 Mon Sep 17 00:00:00 2001
From: Richard Smith <richard@metafoo.co.uk>
Date: Wed, 3 Feb 2021 14:57:19 -0800
Subject: [PATCH 049/318] PR44325 (and duplicates): don't issue
 -Wzero-as-null-pointer-constant when rewriting 'a < b' as '(a <=> b) < 0'.

It's pretty common for comparison category types to use a pointer or
pointer-to-member type as their '0' parameter.

(cherry picked from commit 1f06f41993b6363e6b2c4f22a13488a3e687f31b)
---
 clang/lib/Sema/Sema.cpp                       |  7 +++++++
 .../SemaCXX/cxx2a-three-way-comparison.cpp    | 20 ++++++++++++++++++-
 2 files changed, 26 insertions(+), 1 deletion(-)

diff --git a/clang/lib/Sema/Sema.cpp b/clang/lib/Sema/Sema.cpp
index 55cb3aee6194..cb5a84a31235 100644
--- a/clang/lib/Sema/Sema.cpp
+++ b/clang/lib/Sema/Sema.cpp
@@ -537,6 +537,13 @@ void Sema::diagnoseZeroToNullptrConversion(CastKind Kind, const Expr* E) {
   if (E->IgnoreParenImpCasts()->getType()->isNullPtrType())
     return;
 
+  // Don't diagnose the conversion from a 0 literal to a null pointer argument
+  // in a synthesized call to operator<=>.
+  if (!CodeSynthesisContexts.empty() &&
+      CodeSynthesisContexts.back().Kind ==
+          CodeSynthesisContext::RewritingOperatorAsSpaceship)
+    return;
+
   // If it is a macro from system header, and if the macro name is not "NULL",
   // do not warn.
   SourceLocation MaybeMacroLoc = E->getBeginLoc();
diff --git a/clang/test/SemaCXX/cxx2a-three-way-comparison.cpp b/clang/test/SemaCXX/cxx2a-three-way-comparison.cpp
index 353360e052bb..b94225274fff 100644
--- a/clang/test/SemaCXX/cxx2a-three-way-comparison.cpp
+++ b/clang/test/SemaCXX/cxx2a-three-way-comparison.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -std=c++2a -verify %s
+// RUN: %clang_cc1 -std=c++2a -verify %s -Wzero-as-null-pointer-constant
 
 // Keep this test before any declarations of operator<=>.
 namespace PR44786 {
@@ -40,3 +40,21 @@ namespace PR47893 {
   int &f(...);
   int &r = f(A(), A());
 }
+
+namespace PR44325 {
+  struct cmp_cat {};
+  bool operator<(cmp_cat, void*);
+  bool operator>(cmp_cat, int cmp_cat::*);
+
+  struct X {};
+  cmp_cat operator<=>(X, X);
+
+  bool b1 = X() < X(); // no warning
+  bool b2 = X() > X(); // no warning
+
+  // FIXME: It's not clear whether warning here is useful, but we can't really
+  // tell that this is a comparison category in general. This is probably OK,
+  // as comparisons against zero are only really intended for use in the
+  // implicit rewrite rules, not for explicit use by programs.
+  bool c = cmp_cat() < 0; // expected-warning {{zero as null pointer constant}}
+}

From 2a917b70e770e2d25d96f91beebf2a3e52bb9e66 Mon Sep 17 00:00:00 2001
From: Stephen Kelly <steveire@gmail.com>
Date: Wed, 3 Feb 2021 23:04:12 +0000
Subject: [PATCH 050/318] Extend release notes for AST Matchers changes

---
 clang/docs/ReleaseNotes.rst | 38 +++++++++++++++++++++++++++++++------
 1 file changed, 32 insertions(+), 6 deletions(-)

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index a34cd512ca59..9efd4c01f053 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -250,15 +250,41 @@ release of Clang. Users of the build system should adjust accordingly.
 AST Matchers
 ------------
 
-- The behavior of TK_IgnoreUnlessSpelledInSource with the traverse() matcher
-  has been changed to no longer match on template instantiations or on
+- The ``mapAnyOf()`` matcher was added. This allows convenient matching of
+  different AST nodes which have a compatible matcher API. For example,
+  ``mapAnyOf(ifStmt, forStmt).with(hasCondition(integerLiteral()))``
+  matches any ``IfStmt`` or ``ForStmt`` with a integer literal as the
+  condition.
+
+- The ``binaryOperation()`` matcher allows matching expressions which
+  appear like binary operators in the code, even if they are really
+  ``CXXOperatorCallExpr`` for example. It is based on the ``mapAnyOf()``
+  matcher functionality. The matcher API for the latter node has been
+  extended with ``hasLHS()`` etc to facilitate the abstraction.
+
+- Matcher API for ``CXXRewrittenBinaryOperator`` has been added. In addition
+  to explicit matching with the ``cxxRewrittenBinaryOperator()`` matcher, the
+  ``binaryOperation()`` matches on nodes of this type.
+
+- The behavior of ``TK_IgnoreUnlessSpelledInSource`` with the ``traverse()``
+  matcher has been changed to no longer match on template instantiations or on
   implicit nodes which are not spelled in the source.
 
-- The TK_IgnoreImplicitCastsAndParentheses traversal kind was removed. It
-  is recommended to use TK_IgnoreUnlessSpelledInSource instead.
+- The ``TK_IgnoreImplicitCastsAndParentheses`` traversal kind was removed. It
+  is recommended to use ``TK_IgnoreUnlessSpelledInSource`` instead.
 
-- The behavior of the forEach() matcher was changed to not internally ignore
-  implicit and parenthesis nodes.
+- The behavior of the ``forEach()`` matcher was changed to not internally
+  ignore implicit and parenthesis nodes.  This makes it consistent with
+  the ``has()`` matcher.  Uses of ``forEach()`` relying on the old behavior
+  can now use the  ``traverse()`` matcher or ``ignoringParenCasts()``.
+
+- Several AST Matchers have been changed to match based on the active
+  traversal mode.  For example, ``argumentCountIs()`` matches the number of
+  arguments written in the source, ignoring default arguments represented
+  by ``CXXDefaultArgExpr`` nodes.
+
+- Improvements in AST Matchers allow more matching of template declarations,
+  independent of their template instantations.
 
 clang-format
 ------------

From f5602e0bf31ab590da19fa357980a753dbfd666e Mon Sep 17 00:00:00 2001
From: Shilei Tian <tianshilei1992@gmail.com>
Date: Thu, 28 Jan 2021 07:24:19 -0500
Subject: [PATCH 051/318] [OpenMP] Disabled profiling in `libomp` by default to
 unblock link errors

Link error occurred when time profiling in libomp is enabled by default
because `libomp` is assumed to be a C library but the dependence on
`libLLVMSupport` for profiling is a C++ library. Currently the issue blocks all
OpenMP tests in Phabricator.

This patch set a new CMake option `OPENMP_ENABLE_LIBOMP_PROFILING` to
enable/disable the feature. By default it is disabled. Note that once time
profiling is enabled for `libomp`, it becomes a C++ library.

Reviewed By: jdoerfert

Differential Revision: https://reviews.llvm.org/D95585

(cherry picked from commit c571b168349fdf22d1dc8b920bcffa3d5161f0a2)
---
 openmp/CMakeLists.txt                 |  6 ++++++
 openmp/docs/design/Runtimes.rst       |  5 ++++-
 openmp/runtime/CMakeLists.txt         |  6 +++---
 openmp/runtime/src/CMakeLists.txt     | 12 +++++++++++-
 openmp/runtime/src/kmp_config.h.cmake |  4 ++--
 openmp/runtime/src/kmp_runtime.cpp    |  6 +++---
 6 files changed, 29 insertions(+), 10 deletions(-)

diff --git a/openmp/CMakeLists.txt b/openmp/CMakeLists.txt
index 67600bebdafb..4787d4b5a321 100644
--- a/openmp/CMakeLists.txt
+++ b/openmp/CMakeLists.txt
@@ -86,6 +86,12 @@ option(OPENMP_ENABLE_LIBOMPTARGET "Enable building libomptarget for offloading."
        ${ENABLE_LIBOMPTARGET})
 option(OPENMP_ENABLE_LIBOMPTARGET_PROFILING "Enable time profiling for libomptarget."
        ${ENABLE_LIBOMPTARGET})
+option(OPENMP_ENABLE_LIBOMP_PROFILING "Enable time profiling for libomp." OFF)
+
+# Build host runtime library, after LIBOMPTARGET variables are set since they are needed
+# to enable time profiling support in the OpenMP runtime.
+add_subdirectory(runtime)
+
 if (OPENMP_ENABLE_LIBOMPTARGET)
   # Check that the library can actually be built.
   if (APPLE OR WIN32)
diff --git a/openmp/docs/design/Runtimes.rst b/openmp/docs/design/Runtimes.rst
index 016b88ba324b..ad36e43eccdc 100644
--- a/openmp/docs/design/Runtimes.rst
+++ b/openmp/docs/design/Runtimes.rst
@@ -48,7 +48,10 @@ similar to Clang's ``-ftime-trace`` option. This generates a JSON file based on
 `Speedscope App`_. Building this feature depends on the `LLVM Support Library`_
 for time trace output. Using this library is enabled by default when building
 using the CMake option ``OPENMP_ENABLE_LIBOMPTARGET_PROFILING``. The output will
-be saved to the filename specified by the environment variable.
+be saved to the filename specified by the environment variable. For multi-threaded
+applications, profiling in ``libomp`` is also needed. Setting the CMake option
+``OPENMP_ENABLE_LIBOMP_PROFILING=ON`` to enable the feature. Note that this will
+turn ``libomp`` into a C++ library.
 
 .. _`Chrome Tracing`: https://www.chromium.org/developers/how-tos/trace-event-profiling-tool
 
diff --git a/openmp/runtime/CMakeLists.txt b/openmp/runtime/CMakeLists.txt
index 9fdd04f41646..8828ff8ef455 100644
--- a/openmp/runtime/CMakeLists.txt
+++ b/openmp/runtime/CMakeLists.txt
@@ -34,7 +34,6 @@ if(${OPENMP_STANDALONE_BUILD})
   # Should assertions be enabled?  They are on by default.
   set(LIBOMP_ENABLE_ASSERTIONS TRUE CACHE BOOL
     "enable assertions?")
-  set(LIBOMPTARGET_PROFILING_SUPPORT FALSE)
 else() # Part of LLVM build
   # Determine the native architecture from LLVM.
   string(TOLOWER "${LLVM_TARGET_ARCH}" LIBOMP_NATIVE_ARCH)
@@ -66,10 +65,11 @@ else() # Part of LLVM build
     libomp_get_architecture(LIBOMP_ARCH)
   endif ()
   set(LIBOMP_ENABLE_ASSERTIONS ${LLVM_ENABLE_ASSERTIONS})
-  # Time profiling support
-  set(LIBOMPTARGET_PROFILING_SUPPORT ${OPENMP_ENABLE_LIBOMPTARGET_PROFILING})
 endif()
 
+# Time profiling support
+set(LIBOMP_PROFILING_SUPPORT ${OPENMP_ENABLE_LIBOMP_PROFILING})
+
 # FUJITSU A64FX is a special processor because its cache line size is 256.
 # We need to pass this information into kmp_config.h.
 if(LIBOMP_ARCH STREQUAL "aarch64")
diff --git a/openmp/runtime/src/CMakeLists.txt b/openmp/runtime/src/CMakeLists.txt
index 2e927df84f5c..822f9ca2b825 100644
--- a/openmp/runtime/src/CMakeLists.txt
+++ b/openmp/runtime/src/CMakeLists.txt
@@ -50,6 +50,14 @@ if(${LIBOMP_USE_HWLOC})
   include_directories(${LIBOMP_HWLOC_INSTALL_DIR}/include)
 endif()
 
+# Building with time profiling support requires LLVM directory includes.
+if(LIBOMP_PROFILING_SUPPORT)
+  include_directories(
+    ${LLVM_MAIN_INCLUDE_DIR}
+    ${LLVM_INCLUDE_DIR}
+  )
+endif()
+
 # Getting correct source files to build library
 set(LIBOMP_CXXFILES)
 set(LIBOMP_ASMFILES)
@@ -135,7 +143,7 @@ libomp_get_ldflags(LIBOMP_CONFIGURED_LDFLAGS)
 
 libomp_get_libflags(LIBOMP_CONFIGURED_LIBFLAGS)
 # Build libomp library. Add LLVMSupport dependency if building in-tree with libomptarget profiling enabled.
-if(OPENMP_STANDALONE_BUILD OR (NOT OPENMP_ENABLE_LIBOMPTARGET_PROFILING))
+if(OPENMP_STANDALONE_BUILD OR (NOT OPENMP_ENABLE_LIBOMP_PROFILING))
   add_library(omp ${LIBOMP_LIBRARY_KIND} ${LIBOMP_SOURCE_FILES})
   # Linking command will include libraries in LIBOMP_CONFIGURED_LIBFLAGS
   target_link_libraries(omp ${LIBOMP_CONFIGURED_LIBFLAGS} ${CMAKE_DL_LIBS})
@@ -144,6 +152,8 @@ else()
     LINK_LIBS ${LIBOMP_CONFIGURED_LIBFLAGS} ${CMAKE_DL_LIBS}
     LINK_COMPONENTS Support
     )
+  # libomp must be a C++ library such that it can link libLLVMSupport
+  set(LIBOMP_LINKER_LANGUAGE CXX)
 endif()
 
 set_target_properties(omp PROPERTIES
diff --git a/openmp/runtime/src/kmp_config.h.cmake b/openmp/runtime/src/kmp_config.h.cmake
index 3d682c690fc7..f6aee7197ee8 100644
--- a/openmp/runtime/src/kmp_config.h.cmake
+++ b/openmp/runtime/src/kmp_config.h.cmake
@@ -44,8 +44,8 @@
 #define OMPT_DEBUG LIBOMP_OMPT_DEBUG
 #cmakedefine01 LIBOMP_OMPT_SUPPORT
 #define OMPT_SUPPORT LIBOMP_OMPT_SUPPORT
-#cmakedefine01 LIBOMPTARGET_PROFILING_SUPPORT
-#define OMPTARGET_PROFILING_SUPPORT LIBOMPTARGET_PROFILING_SUPPORT
+#cmakedefine01 LIBOMP_PROFILING_SUPPORT
+#define OMP_PROFILING_SUPPORT LIBOMP_PROFILING_SUPPORT
 #cmakedefine01 LIBOMP_OMPT_OPTIONAL
 #define OMPT_OPTIONAL LIBOMP_OMPT_OPTIONAL
 #cmakedefine01 LIBOMP_USE_ADAPTIVE_LOCKS
diff --git a/openmp/runtime/src/kmp_runtime.cpp b/openmp/runtime/src/kmp_runtime.cpp
index 4a0634d59cff..a6e32bd008e1 100644
--- a/openmp/runtime/src/kmp_runtime.cpp
+++ b/openmp/runtime/src/kmp_runtime.cpp
@@ -32,7 +32,7 @@
 #include "ompt-specific.h"
 #endif
 
-#if OMPTARGET_PROFILING_SUPPORT
+#if OMP_PROFILING_SUPPORT
 #include "llvm/Support/TimeProfiler.h"
 static char *ProfileTraceFile = nullptr;
 #endif
@@ -5740,7 +5740,7 @@ void __kmp_free_thread(kmp_info_t *this_th) {
 /* ------------------------------------------------------------------------ */
 
 void *__kmp_launch_thread(kmp_info_t *this_thr) {
-#if OMPTARGET_PROFILING_SUPPORT
+#if OMP_PROFILING_SUPPORT
   ProfileTraceFile = getenv("LIBOMPTARGET_PROFILE");
   // TODO: add a configuration option for time granularity
   if (ProfileTraceFile)
@@ -5848,7 +5848,7 @@ void *__kmp_launch_thread(kmp_info_t *this_thr) {
   KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid));
   KMP_MB();
 
-#if OMPTARGET_PROFILING_SUPPORT
+#if OMP_PROFILING_SUPPORT
   llvm::timeTraceProfilerFinishThread();
 #endif
   return this_thr;

From 7d096f9bb350429628c6befce8f94dba4bbc6db9 Mon Sep 17 00:00:00 2001
From: Hongtao Yu <hoy@fb.com>
Date: Wed, 27 Jan 2021 16:04:11 -0800
Subject: [PATCH 052/318] [CSSPGO] Support of CS profiles in extended binary
 format.

This change brings up support of context-sensitive profiles in the format of extended binary. Existing sample profile reader/writer/merger code is being tweaked to reflect the fact of bracketed input contexts, like (`[...]`). The paired brackets are also needed in extbinary profiles because we don't yet have an otherwise good way to tell calling contexts apart from regular function names since the context delimiter `@` can somehow serve as a part of the C++ mangled names.

Reviewed By: wmi, wenlei

Differential Revision: https://reviews.llvm.org/D95547

(cherry picked from commit 7e99bddfeaab2713a8bb6ca538da25b66e6efc59)
---
 llvm/include/llvm/ProfileData/SampleProf.h    | 19 ++--
 .../llvm/ProfileData/SampleProfReader.h       |  4 +
 llvm/lib/ProfileData/SampleProfReader.cpp     | 86 ++++++++++---------
 llvm/lib/ProfileData/SampleProfWriter.cpp     |  4 +-
 .../Transforms/IPO/SampleContextTracker.cpp   |  2 +-
 .../SampleProfile/profile-context-tracker.ll  |  4 +
 .../llvm-profdata/Inputs/cs-sample.proftext   | 36 ++++++++
 .../llvm-profdata/cs-sample-profile.test      |  4 +
 llvm/tools/llvm-profdata/llvm-profdata.cpp    |  2 +-
 llvm/tools/llvm-profgen/ProfileGenerator.cpp  |  2 +-
 10 files changed, 113 insertions(+), 50 deletions(-)
 create mode 100644 llvm/test/tools/llvm-profdata/Inputs/cs-sample.proftext
 create mode 100644 llvm/test/tools/llvm-profdata/cs-sample-profile.test

diff --git a/llvm/include/llvm/ProfileData/SampleProf.h b/llvm/include/llvm/ProfileData/SampleProf.h
index c45ace9e68c1..346bc4c81d86 100644
--- a/llvm/include/llvm/ProfileData/SampleProf.h
+++ b/llvm/include/llvm/ProfileData/SampleProf.h
@@ -439,9 +439,11 @@ class SampleContext {
   void clearState(ContextStateMask S) { State &= (uint32_t)~S; }
   bool hasContext() const { return State != UnknownContext; }
   bool isBaseContext() const { return CallingContext.empty(); }
-  StringRef getName() const { return Name; }
+  StringRef getNameWithoutContext() const { return Name; }
   StringRef getCallingContext() const { return CallingContext; }
-  StringRef getNameWithContext() const { return FullContext; }
+  StringRef getNameWithContext(bool WithBracket = false) const {
+    return WithBracket ? InputContext : FullContext;
+  }
 
 private:
   // Give a context string, decode and populate internal states like
@@ -449,6 +451,7 @@ class SampleContext {
   // `ContextStr`: `[main:3 @ _Z5funcAi:1 @ _Z8funcLeafi]`
   void setContext(StringRef ContextStr, ContextStateMask CState) {
     assert(!ContextStr.empty());
+    InputContext = ContextStr;
     // Note that `[]` wrapped input indicates a full context string, otherwise
     // it's treated as context-less function name only.
     bool HasContext = ContextStr.startswith("[");
@@ -480,6 +483,9 @@ class SampleContext {
     }
   }
 
+  // Input context string including bracketed calling context and leaf function
+  // name
+  StringRef InputContext;
   // Full context string including calling context and leaf function name
   StringRef FullContext;
   // Function name for the associated sample profile
@@ -676,7 +682,8 @@ class FunctionSamples {
     Name = Other.getName();
     if (!GUIDToFuncNameMap)
       GUIDToFuncNameMap = Other.GUIDToFuncNameMap;
-
+    if (Context.getNameWithContext(true).empty())
+      Context = Other.getContext();
     if (FunctionHash == 0) {
       // Set the function hash code for the target profile.
       FunctionHash = Other.getFunctionHash();
@@ -743,8 +750,10 @@ class FunctionSamples {
   StringRef getName() const { return Name; }
 
   /// Return function name with context.
-  StringRef getNameWithContext() const {
-    return FunctionSamples::ProfileIsCS ? Context.getNameWithContext() : Name;
+  StringRef getNameWithContext(bool WithBracket = false) const {
+    return FunctionSamples::ProfileIsCS
+               ? Context.getNameWithContext(WithBracket)
+               : Name;
   }
 
   /// Return the original function name.
diff --git a/llvm/include/llvm/ProfileData/SampleProfReader.h b/llvm/include/llvm/ProfileData/SampleProfReader.h
index 3f52a2f6163b..999e75eddffa 100644
--- a/llvm/include/llvm/ProfileData/SampleProfReader.h
+++ b/llvm/include/llvm/ProfileData/SampleProfReader.h
@@ -488,8 +488,12 @@ class SampleProfileReader {
   /// \brief Whether samples are collected based on pseudo probes.
   bool ProfileIsProbeBased = false;
 
+  /// Whether function profiles are context-sensitive.
   bool ProfileIsCS = false;
 
+  /// Number of context-sensitive profiles.
+  uint32_t CSProfileCount = 0;
+
   /// \brief The format of sample.
   SampleProfileFormat Format = SPF_None;
 };
diff --git a/llvm/lib/ProfileData/SampleProfReader.cpp b/llvm/lib/ProfileData/SampleProfReader.cpp
index c42931174bc0..c9f41687c356 100644
--- a/llvm/lib/ProfileData/SampleProfReader.cpp
+++ b/llvm/lib/ProfileData/SampleProfReader.cpp
@@ -222,8 +222,6 @@ std::error_code SampleProfileReaderText::readImpl() {
   sampleprof_error Result = sampleprof_error::success;
 
   InlineCallStack InlineStack;
-  int CSProfileCount = 0;
-  int RegularProfileCount = 0;
   uint32_t ProbeProfileCount = 0;
 
   // SeenMetadata tracks whether we have processed metadata for the current
@@ -257,11 +255,9 @@ std::error_code SampleProfileReaderText::readImpl() {
       SampleContext FContext(FName);
       if (FContext.hasContext())
         ++CSProfileCount;
-      else
-        ++RegularProfileCount;
       Profiles[FContext] = FunctionSamples();
       FunctionSamples &FProfile = Profiles[FContext];
-      FProfile.setName(FContext.getName());
+      FProfile.setName(FContext.getNameWithoutContext());
       FProfile.setContext(FContext);
       MergeResult(Result, FProfile.addTotalSamples(NumSamples));
       MergeResult(Result, FProfile.addHeadSamples(NumHeadSamples));
@@ -324,13 +320,14 @@ std::error_code SampleProfileReaderText::readImpl() {
     }
   }
 
-  assert((RegularProfileCount == 0 || CSProfileCount == 0) &&
+  assert((CSProfileCount == 0 || CSProfileCount == Profiles.size()) &&
          "Cannot have both context-sensitive and regular profile");
   ProfileIsCS = (CSProfileCount > 0);
   assert((ProbeProfileCount == 0 || ProbeProfileCount == Profiles.size()) &&
          "Cannot have both probe-based profiles and regular profiles");
   ProfileIsProbeBased = (ProbeProfileCount > 0);
   FunctionSamples::ProfileIsProbeBased = ProfileIsProbeBased;
+  FunctionSamples::ProfileIsCS = ProfileIsCS;
 
   if (Result == sampleprof_error::success)
     computeSummary();
@@ -546,12 +543,16 @@ SampleProfileReaderBinary::readFuncProfile(const uint8_t *Start) {
   if (std::error_code EC = FName.getError())
     return EC;
 
-  Profiles[*FName] = FunctionSamples();
-  FunctionSamples &FProfile = Profiles[*FName];
-  FProfile.setName(*FName);
-
+  SampleContext FContext(*FName);
+  Profiles[FContext] = FunctionSamples();
+  FunctionSamples &FProfile = Profiles[FContext];
+  FProfile.setName(FContext.getNameWithoutContext());
+  FProfile.setContext(FContext);
   FProfile.addHeadSamples(*NumHeadSamples);
 
+  if (FContext.hasContext())
+    CSProfileCount++;
+
   if (std::error_code EC = readProfile(FProfile))
     return EC;
   return sampleprof_error::success;
@@ -654,40 +655,44 @@ std::error_code SampleProfileReaderExtBinaryBase::readFuncProfiles() {
         return EC;
     }
     assert(Data == End && "More data is read than expected");
-    return sampleprof_error::success;
-  }
-
-  if (Remapper) {
-    for (auto Name : FuncsToUse) {
-      Remapper->insert(Name);
+  } else {
+    if (Remapper) {
+      for (auto Name : FuncsToUse) {
+        Remapper->insert(Name);
+      }
     }
-  }
 
-  if (useMD5()) {
-    for (auto Name : FuncsToUse) {
-      auto GUID = std::to_string(MD5Hash(Name));
-      auto iter = FuncOffsetTable.find(StringRef(GUID));
-      if (iter == FuncOffsetTable.end())
-        continue;
-      const uint8_t *FuncProfileAddr = Start + iter->second;
-      assert(FuncProfileAddr < End && "out of LBRProfile section");
-      if (std::error_code EC = readFuncProfile(FuncProfileAddr))
-        return EC;
-    }
-  } else {
-    for (auto NameOffset : FuncOffsetTable) {
-      auto FuncName = NameOffset.first;
-      if (!FuncsToUse.count(FuncName) &&
-          (!Remapper || !Remapper->exist(FuncName)))
-        continue;
-      const uint8_t *FuncProfileAddr = Start + NameOffset.second;
-      assert(FuncProfileAddr < End && "out of LBRProfile section");
-      if (std::error_code EC = readFuncProfile(FuncProfileAddr))
-        return EC;
+    if (useMD5()) {
+      for (auto Name : FuncsToUse) {
+        auto GUID = std::to_string(MD5Hash(Name));
+        auto iter = FuncOffsetTable.find(StringRef(GUID));
+        if (iter == FuncOffsetTable.end())
+          continue;
+        const uint8_t *FuncProfileAddr = Start + iter->second;
+        assert(FuncProfileAddr < End && "out of LBRProfile section");
+        if (std::error_code EC = readFuncProfile(FuncProfileAddr))
+          return EC;
+      }
+    } else {
+      for (auto NameOffset : FuncOffsetTable) {
+        SampleContext FContext(NameOffset.first);
+        auto FuncName = FContext.getNameWithoutContext();
+        if (!FuncsToUse.count(FuncName) &&
+            (!Remapper || !Remapper->exist(FuncName)))
+          continue;
+        const uint8_t *FuncProfileAddr = Start + NameOffset.second;
+        assert(FuncProfileAddr < End && "out of LBRProfile section");
+        if (std::error_code EC = readFuncProfile(FuncProfileAddr))
+          return EC;
+      }
     }
+    Data = End;
   }
 
-  Data = End;
+  assert((CSProfileCount == 0 || CSProfileCount == Profiles.size()) &&
+         "Cannot have both context-sensitive and regular profile");
+  ProfileIsCS = (CSProfileCount > 0);
+  FunctionSamples::ProfileIsCS = ProfileIsCS;
   return sampleprof_error::success;
 }
 
@@ -887,7 +892,8 @@ std::error_code SampleProfileReaderExtBinaryBase::readFuncMetadata() {
     if (std::error_code EC = Checksum.getError())
       return EC;
 
-    Profiles[*FName].setFunctionHash(*Checksum);
+    SampleContext FContext(*FName);
+    Profiles[FContext].setFunctionHash(*Checksum);
   }
   return sampleprof_error::success;
 }
diff --git a/llvm/lib/ProfileData/SampleProfWriter.cpp b/llvm/lib/ProfileData/SampleProfWriter.cpp
index 71dba6281f76..d3bc05e06fdf 100644
--- a/llvm/lib/ProfileData/SampleProfWriter.cpp
+++ b/llvm/lib/ProfileData/SampleProfWriter.cpp
@@ -147,7 +147,7 @@ std::error_code SampleProfileWriterExtBinaryBase::write(
 std::error_code
 SampleProfileWriterExtBinaryBase::writeSample(const FunctionSamples &S) {
   uint64_t Offset = OutputStream->tell();
-  StringRef Name = S.getName();
+  StringRef Name = S.getNameWithContext(true);
   FuncOffsetTable[Name] = Offset - SecLBRProfileStart;
   encodeULEB128(S.getHeadSamples(), *OutputStream);
   return writeBody(S);
@@ -635,7 +635,7 @@ std::error_code SampleProfileWriterBinary::writeSummary() {
 std::error_code SampleProfileWriterBinary::writeBody(const FunctionSamples &S) {
   auto &OS = *OutputStream;
 
-  if (std::error_code EC = writeNameIdx(S.getName()))
+  if (std::error_code EC = writeNameIdx(S.getNameWithContext(true)))
     return EC;
 
   encodeULEB128(S.getTotalSamples(), OS);
diff --git a/llvm/lib/Transforms/IPO/SampleContextTracker.cpp b/llvm/lib/Transforms/IPO/SampleContextTracker.cpp
index 37fc27e91100..660d79de667c 100644
--- a/llvm/lib/Transforms/IPO/SampleContextTracker.cpp
+++ b/llvm/lib/Transforms/IPO/SampleContextTracker.cpp
@@ -179,7 +179,7 @@ SampleContextTracker::SampleContextTracker(
     SampleContext Context(FuncSample.first(), RawContext);
     LLVM_DEBUG(dbgs() << "Tracking Context for function: " << Context << "\n");
     if (!Context.isBaseContext())
-      FuncToCtxtProfileSet[Context.getName()].insert(FSamples);
+      FuncToCtxtProfileSet[Context.getNameWithoutContext()].insert(FSamples);
     ContextTrieNode *NewNode = getOrCreateContextPath(Context, true);
     assert(!NewNode->getFunctionSamples() &&
            "New node can't have sample profile");
diff --git a/llvm/test/Transforms/SampleProfile/profile-context-tracker.ll b/llvm/test/Transforms/SampleProfile/profile-context-tracker.ll
index ed32c2a0027b..adda7022047d 100644
--- a/llvm/test/Transforms/SampleProfile/profile-context-tracker.ll
+++ b/llvm/test/Transforms/SampleProfile/profile-context-tracker.ll
@@ -1,18 +1,22 @@
 ; Test for CSSPGO's SampleContextTracker to make sure context profile tree is promoted and merged properly
 ; based on inline decision, so post inline counts are accurate.
 
+; RUN: llvm-profdata merge --sample --extbinary %S/Inputs/profile-context-tracker.prof -o %t
+
 ; Note that we need new pass manager to enable top-down processing for sample profile loader
 ; Testwe we inlined the following in top-down order and entry counts accurate reflects post-inline base profile
 ;   main:3 @ _Z5funcAi
 ;   main:3 @ _Z5funcAi:1 @ _Z8funcLeafi
 ;   _Z5funcBi:1 @ _Z8funcLeafi
 ; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/profile-context-tracker.prof -sample-profile-inline-size -profile-sample-accurate -S | FileCheck %s --check-prefix=INLINE-ALL
+; RUN: opt < %s -passes=sample-profile -sample-profile-file=%t -sample-profile-inline-size -profile-sample-accurate -S | FileCheck %s --check-prefix=INLINE-ALL
 
 ; Testwe we inlined the following in top-down order and entry counts accurate reflects post-inline base profile
 ;   main:3 @ _Z5funcAi
 ;   _Z5funcAi:1 @ _Z8funcLeafi
 ;   _Z5funcBi:1 @ _Z8funcLeafi
 ; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/profile-context-tracker.prof -profile-sample-accurate -S | FileCheck %s --check-prefix=INLINE-HOT
+; RUN: opt < %s -passes=sample-profile -sample-profile-file=%t -profile-sample-accurate -S | FileCheck %s --check-prefix=INLINE-HOT
 
 
 @factor = dso_local global i32 3, align 4, !dbg !0
diff --git a/llvm/test/tools/llvm-profdata/Inputs/cs-sample.proftext b/llvm/test/tools/llvm-profdata/Inputs/cs-sample.proftext
new file mode 100644
index 000000000000..eead4d4d62f0
--- /dev/null
+++ b/llvm/test/tools/llvm-profdata/Inputs/cs-sample.proftext
@@ -0,0 +1,36 @@
+[main:3 @ _Z5funcAi:1 @ _Z8funcLeafi]:1467299:11
+ 0: 6
+ 1: 6
+ 3: 287884
+ 4: 287864 _Z3fibi:315608
+ 15: 23
+[main:3.1 @ _Z5funcBi:1 @ _Z8funcLeafi]:500853:20
+ 0: 15
+ 1: 15
+ 3: 74946
+ 4: 74941 _Z3fibi:82359
+ 10: 23324
+ 11: 23327 _Z3fibi:25228
+ 15: 11
+[main]:154:0
+ 2: 12
+ 3: 18 _Z5funcAi:11
+ 3.1: 18 _Z5funcBi:19
+[external:12 @ main]:154:12
+ 2: 12
+ 3: 10 _Z5funcAi:7
+ 3.1: 10 _Z5funcBi:11
+[main:3.1 @ _Z5funcBi]:120:19
+ 0: 19
+ 1: 19 _Z8funcLeafi:20
+ 3: 12
+[externalA:17 @ _Z5funcBi]:120:3
+ 0: 3
+ 1: 3
+[external:10 @ _Z5funcBi]:120:10
+ 0: 10
+ 1: 10
+[main:3 @ _Z5funcAi]:99:11
+ 0: 10
+ 1: 10 _Z8funcLeafi:11
+ 3: 24
diff --git a/llvm/test/tools/llvm-profdata/cs-sample-profile.test b/llvm/test/tools/llvm-profdata/cs-sample-profile.test
new file mode 100644
index 000000000000..04c573ddece3
--- /dev/null
+++ b/llvm/test/tools/llvm-profdata/cs-sample-profile.test
@@ -0,0 +1,4 @@
+RUN: llvm-profdata merge --sample --text -output=%t.proftext %S/Inputs/cs-sample.proftext
+RUN: diff -b %t.proftext %S/Inputs/cs-sample.proftext
+RUN: llvm-profdata merge --sample --extbinary %p/Inputs/cs-sample.proftext -o %t.prof && llvm-profdata merge --sample --text %t.prof -o %t1.proftext
+RUN: diff -b %t1.proftext %S/Inputs/cs-sample.proftext
diff --git a/llvm/tools/llvm-profdata/llvm-profdata.cpp b/llvm/tools/llvm-profdata/llvm-profdata.cpp
index 8dc43924c067..7e53c30c7579 100644
--- a/llvm/tools/llvm-profdata/llvm-profdata.cpp
+++ b/llvm/tools/llvm-profdata/llvm-profdata.cpp
@@ -696,7 +696,7 @@ mergeSampleProfile(const WeightedFileVector &Inputs, SymbolRemapper *Remapper,
           Remapper ? remapSamples(I->second, *Remapper, Result)
                    : FunctionSamples();
       FunctionSamples &Samples = Remapper ? Remapped : I->second;
-      StringRef FName = Samples.getName();
+      StringRef FName = Samples.getNameWithContext(true);
       MergeResult(Result, ProfileMap[FName].merge(Samples, Input.Weight));
       if (Result != sampleprof_error::success) {
         std::error_code EC = make_error_code(Result);
diff --git a/llvm/tools/llvm-profgen/ProfileGenerator.cpp b/llvm/tools/llvm-profgen/ProfileGenerator.cpp
index 265beccb84a8..7624fd3f2808 100644
--- a/llvm/tools/llvm-profgen/ProfileGenerator.cpp
+++ b/llvm/tools/llvm-profgen/ProfileGenerator.cpp
@@ -164,7 +164,7 @@ CSProfileGenerator::getFunctionProfileForContext(StringRef ContextStr) {
   if (Ret.second) {
     SampleContext FContext(Ret.first->first(), RawContext);
     FunctionSamples &FProfile = Ret.first->second;
-    FProfile.setName(FContext.getName());
+    FProfile.setName(FContext.getNameWithoutContext());
     FProfile.setContext(FContext);
   }
   return Ret.first->second;

From f2cabaac9525ba4b86301136e21ec9aad6aaf326 Mon Sep 17 00:00:00 2001
From: Hongtao Yu <hoy@fb.com>
Date: Sun, 31 Jan 2021 22:31:51 -0800
Subject: [PATCH 053/318] [CSSPGO] Tweaking inlining with pseudo probes.

Fixing up a couple places where `getCallSiteIdentifier` is needed to support pseudo-probe-based callsites.

Also fixing an issue in the extbinary profile reader where the metadata section is not fully scanned based on the number of profiles loaded only for the current module.

Reviewed By: wmi, wenlei

Differential Revision: https://reviews.llvm.org/D95791

(cherry picked from commit 224fee8219bb3aed34f13ce40935e1b3ede90a0f)
---
 llvm/lib/ProfileData/SampleProfReader.cpp     |   9 +-
 .../Transforms/IPO/SampleContextTracker.cpp   |  11 +-
 .../Inputs/pseudo-probe-inline.prof           |  18 ++
 .../SampleProfile/pseudo-probe-inline.ll      | 175 ++++++++++++++++++
 4 files changed, 204 insertions(+), 9 deletions(-)
 create mode 100644 llvm/test/Transforms/SampleProfile/Inputs/pseudo-probe-inline.prof
 create mode 100644 llvm/test/Transforms/SampleProfile/pseudo-probe-inline.ll

diff --git a/llvm/lib/ProfileData/SampleProfReader.cpp b/llvm/lib/ProfileData/SampleProfReader.cpp
index c9f41687c356..370ffc8e2885 100644
--- a/llvm/lib/ProfileData/SampleProfReader.cpp
+++ b/llvm/lib/ProfileData/SampleProfReader.cpp
@@ -883,7 +883,7 @@ std::error_code SampleProfileReaderExtBinaryBase::readNameTableSec(bool IsMD5) {
 std::error_code SampleProfileReaderExtBinaryBase::readFuncMetadata() {
   if (!ProfileIsProbeBased)
     return sampleprof_error::success;
-  for (unsigned I = 0; I < Profiles.size(); ++I) {
+  while (Data < End) {
     auto FName(readStringFromTable());
     if (std::error_code EC = FName.getError())
       return EC;
@@ -893,8 +893,13 @@ std::error_code SampleProfileReaderExtBinaryBase::readFuncMetadata() {
       return EC;
 
     SampleContext FContext(*FName);
-    Profiles[FContext].setFunctionHash(*Checksum);
+    // No need to load metadata for profiles that are not loaded in the current
+    // module.
+    if (Profiles.count(FContext))
+      Profiles[FContext].setFunctionHash(*Checksum);
   }
+
+  assert(Data == End && "More data is read than expected");
   return sampleprof_error::success;
 }
 
diff --git a/llvm/lib/Transforms/IPO/SampleContextTracker.cpp b/llvm/lib/Transforms/IPO/SampleContextTracker.cpp
index 660d79de667c..fad72985dedd 100644
--- a/llvm/lib/Transforms/IPO/SampleContextTracker.cpp
+++ b/llvm/lib/Transforms/IPO/SampleContextTracker.cpp
@@ -308,8 +308,7 @@ void SampleContextTracker::promoteMergeContextSamplesTree(
     return;
 
   // Get the context that needs to be promoted
-  LineLocation CallSite(FunctionSamples::getOffset(DIL),
-                        DIL->getBaseDiscriminator());
+  LineLocation CallSite = FunctionSamples::getCallSiteIdentifier(DIL);
   ContextTrieNode *NodeToPromo =
       CallerNode->getChildContext(CallSite, CalleeName);
   if (!NodeToPromo)
@@ -370,9 +369,7 @@ SampleContextTracker::getCalleeContextFor(const DILocation *DIL,
     return nullptr;
 
   return CallContext->getChildContext(
-      LineLocation(FunctionSamples::getOffset(DIL),
-                   DIL->getBaseDiscriminator()),
-      CalleeName);
+      FunctionSamples::getCallSiteIdentifier(DIL), CalleeName);
 }
 
 ContextTrieNode *SampleContextTracker::getContextFor(const DILocation *DIL) {
@@ -386,8 +383,8 @@ ContextTrieNode *SampleContextTracker::getContextFor(const DILocation *DIL) {
     if (Name.empty())
       Name = PrevDIL->getScope()->getSubprogram()->getName();
     S.push_back(
-        std::make_pair(LineLocation(FunctionSamples::getOffset(DIL),
-                                    DIL->getBaseDiscriminator()), Name));
+        std::make_pair(FunctionSamples::getCallSiteIdentifier(DIL),
+                       PrevDIL->getScope()->getSubprogram()->getLinkageName()));
     PrevDIL = DIL;
   }
 
diff --git a/llvm/test/Transforms/SampleProfile/Inputs/pseudo-probe-inline.prof b/llvm/test/Transforms/SampleProfile/Inputs/pseudo-probe-inline.prof
new file mode 100644
index 000000000000..fd3ff773e85d
--- /dev/null
+++ b/llvm/test/Transforms/SampleProfile/Inputs/pseudo-probe-inline.prof
@@ -0,0 +1,18 @@
+[foo]:23:23
+ 1: 23
+ 2: 23 zen:23
+ !CFGChecksum: 281479271677951
+[foo:2 @ zen]:765858:23
+ 1: 23
+ 2: 382920
+ 3: 382915
+ !CFGChecksum: 138828622701
+[bar]:23:23
+ 1: 23
+ 2: 23 zen:23
+ !CFGChecksum: 281479271677951
+[bar:2 @ zen]:765858:23
+ 1: 23
+ 2: 382920
+ 3: 382915
+ !CFGChecksum: 138828622701
\ No newline at end of file
diff --git a/llvm/test/Transforms/SampleProfile/pseudo-probe-inline.ll b/llvm/test/Transforms/SampleProfile/pseudo-probe-inline.ll
new file mode 100644
index 000000000000..a5033a0dc190
--- /dev/null
+++ b/llvm/test/Transforms/SampleProfile/pseudo-probe-inline.ll
@@ -0,0 +1,175 @@
+; RUN: opt < %s -passes=pseudo-probe,sample-profile -sample-profile-file=%S/Inputs/pseudo-probe-inline.prof -S -pass-remarks=sample-profile -pass-remarks-output=%t.opt.yaml 2>&1 | FileCheck %s
+; RUN: FileCheck %s -check-prefix=YAML < %t.opt.yaml
+
+; RUN: llvm-profdata merge --sample --extbinary %S/Inputs/pseudo-probe-inline.prof -o %t2
+; RUN: opt < %s -passes=pseudo-probe,sample-profile -sample-profile-file=%t2 -S -pass-remarks=sample-profile -pass-remarks-output=%t2.opt.yaml 2>&1 | FileCheck %s
+; RUN: FileCheck %s -check-prefix=YAML < %t2.opt.yaml
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@factor = dso_local global i32 3, align 4
+
+define dso_local i32 @foo(i32 %x) #0 !dbg !12 {
+entry:
+; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID1:]], i64 1, i32 0)
+  %add = add nsw i32 %x, 100000, !dbg !19
+;; Check zen is fully inlined so there's no call to zen anymore.
+;; Check code from the inlining of zen is properly annotated here.
+; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID2:]], i64 1, i32 0)
+; CHECK: br i1 %cmp.i, label %while.cond.i, label %while.cond2.i, !dbg ![[#]], !prof ![[PD1:[0-9]+]]
+; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID2]], i64 2, i32 0)
+; CHECK: br i1 %cmp1.i, label %while.body.i, label %zen.exit, !dbg ![[#]], !prof ![[PD2:[0-9]+]]
+; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID2]], i64 3, i32 0)
+; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID2]], i64 4, i32 0)
+; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID2]], i64 5, i32 0)
+; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID2]], i64 6, i32 0)
+; CHECK-NOT: call i32 @zen
+  %call = call i32 @zen(i32 %add), !dbg !20
+  ret i32 %call, !dbg !21
+}
+
+; CHECK: define dso_local i32 @zen
+define dso_local i32 @zen(i32 %x) #0 !dbg !22 {
+entry:
+; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID2]], i64 1, i32 0)
+  %cmp = icmp sgt i32 %x, 0, !dbg !26
+  br i1 %cmp, label %while.cond, label %while.cond2, !dbg !28
+
+while.cond:
+; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID2]], i64 2, i32 0)
+  %x.addr.0 = phi i32 [ %x, %entry ], [ %sub, %while.body ]
+  %cmp1 = icmp sgt i32 %x.addr.0, 0, !dbg !29
+  br i1 %cmp1, label %while.body, label %if.end, !dbg !31
+
+while.body:
+; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID2]], i64 3, i32 0)
+  %0 = load volatile i32, i32* @factor, align 4, !dbg !32
+  %sub = sub nsw i32 %x.addr.0, %0, !dbg !39
+  br label %while.cond, !dbg !31
+
+while.cond2:
+; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID2]], i64 4, i32 0)
+  %x.addr.1 = phi i32 [ %x, %entry ], [ %add, %while.body4 ]
+  %cmp3 = icmp slt i32 %x.addr.1, 0, !dbg !42
+  br i1 %cmp3, label %while.body4, label %if.end, !dbg !44
+
+while.body4:
+; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID2]], i64 5, i32 0)
+  %1 = load volatile i32, i32* @factor, align 4, !dbg !45
+  %add = add nsw i32 %x.addr.1, %1, !dbg !48
+  br label %while.cond2, !dbg !44
+
+if.end:
+; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID2]], i64 6, i32 0)
+  %x.addr.2 = phi i32 [ %x.addr.0, %while.cond ], [ %x.addr.1, %while.cond2 ]
+  ret i32 %x.addr.2, !dbg !51
+}
+
+; CHECK: !llvm.pseudo_probe_desc = !{![[#DESC0:]], ![[#DESC1:]]}
+; CHECK: ![[#DESC0]] = !{i64 [[#GUID1]], i64 [[#HASH1:]], !"foo"}
+; CHECK: ![[#DESC1]] = !{i64 [[#GUID2]], i64 [[#HASH2:]], !"zen"}
+; CHECK: ![[PD1]] = !{!"branch_weights", i32 25, i32 1}
+; CHECK: ![[PD2]] = !{!"branch_weights", i32 382916, i32 25}
+
+; Checking to see if YAML file is generated and contains remarks
+;YAML: --- !Passed
+;YAML-NEXT:  Pass:            sample-profile-inline
+;YAML-NEXT:  Name:            Inlined
+;YAML-NEXT:  DebugLoc:        { File: test.cpp, Line: 10, Column: 11 }
+;YAML-NEXT:  Function:        foo
+;YAML-NEXT:  Args:
+;YAML-NEXT:    - Callee:          zen
+;YAML-NEXT:      DebugLoc:        { File: test.cpp, Line: 38, Column: 0 }
+;YAML-NEXT:    - String:          ' inlined into '
+;YAML-NEXT:    - Caller:          foo
+;YAML-NEXT:      DebugLoc:        { File: test.cpp, Line: 9, Column: 0 }
+;YAML-NEXT:    - String:          ' to match profiling context'
+;YAML-NEXT:    - String:          ' with '
+;YAML-NEXT:    - String:          '(cost='
+;YAML-NEXT:    - Cost:            '15'
+;YAML-NEXT:    - String:          ', threshold='
+;YAML-NEXT:    - Threshold:       '225'
+;YAML-NEXT:    - String:          ')'
+;YAML-NEXT:    - String:          ' at callsite '
+;YAML-NEXT:    - String:          foo
+;YAML-NEXT:    - String:          ':'
+;YAML-NEXT:    - Line:            '1'
+;YAML-NEXT:    - String:          ':'
+;YAML-NEXT:    - Column:          '11'
+;YAML-NEXT:    - String:          ';'
+;YAML-NEXT:  ...
+;YAML:  --- !Analysis
+;YAML-NEXT:  Pass:            sample-profile
+;YAML-NEXT:  Name:            AppliedSamples
+;YAML-NEXT:  DebugLoc:        { File: test.cpp, Line: 10, Column: 22 }
+;YAML-NEXT:  Function:        foo
+;YAML-NEXT:  Args:
+;YAML-NEXT:    - String:          'Applied '
+;YAML-NEXT:    - NumSamples:      '23'
+;YAML-NEXT:    - String:          ' samples from profile (ProbeId='
+;YAML-NEXT:    - ProbeId:         '1'
+;YAML-NEXT:    - String:          ')'
+;YAML-NEXT:  ...
+;YAML:  --- !Analysis
+;YAML-NEXT:  Pass:            sample-profile
+;YAML-NEXT:  Name:            AppliedSamples
+;YAML-NEXT:  DebugLoc:        { File: test.cpp, Line: 39, Column: 9 }
+;YAML-NEXT:  Function:        foo
+;YAML-NEXT:  Args:
+;YAML-NEXT:    - String:          'Applied '
+;YAML-NEXT:    - NumSamples:      '23'
+;YAML-NEXT:    - String:          ' samples from profile (ProbeId='
+;YAML-NEXT:    - ProbeId:         '1'
+;YAML-NEXT:    - String:          ')'
+;YAML-NEXT:  ...
+;YAML:  --- !Analysis
+;YAML-NEXT:  Pass:            sample-profile
+;YAML-NEXT:  Name:            AppliedSamples
+;YAML-NEXT:  DebugLoc:        { File: test.cpp, Line: 41, Column: 14 }
+;YAML-NEXT:  Function:        foo
+;YAML-NEXT:  Args:
+;YAML-NEXT:    - String:          'Applied '
+;YAML-NEXT:    - NumSamples:      '382920'
+;YAML-NEXT:    - String:          ' samples from profile (ProbeId='
+;YAML-NEXT:    - ProbeId:         '2'
+;YAML-NEXT:    - String:          ')'
+;YAML-NEXT:  ...
+
+attributes #0 = {"use-sample-profile"}
+
+!llvm.module.flags = !{!8, !9}
+
+!2 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !3)
+!3 = !DIFile(filename: "test.cpp", directory: "test")
+!4 = !{}
+!7 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!8 = !{i32 7, !"Dwarf Version", i32 4}
+!9 = !{i32 2, !"Debug Info Version", i32 3}
+!12 = distinct !DISubprogram(name: "foo", scope: !3, file: !3, line: 9, type: !13, scopeLine: 9, unit: !2)
+!13 = !DISubroutineType(types: !14)
+!14 = !{!7, !7}
+!18 = !DILocation(line: 0, scope: !12)
+!19 = !DILocation(line: 10, column: 22, scope: !12)
+!20 = !DILocation(line: 10, column: 11, scope: !12)
+!21 = !DILocation(line: 12, column: 3, scope: !12)
+!22 = distinct !DISubprogram(name: "zen", scope: !3, file: !3, line: 37, type: !13, scopeLine: 38, unit: !2)
+!25 = !DILocation(line: 0, scope: !22)
+!26 = !DILocation(line: 39, column: 9, scope: !27)
+!27 = distinct !DILexicalBlock(scope: !22, file: !3, line: 39, column: 7)
+!28 = !DILocation(line: 39, column: 7, scope: !22)
+!29 = !DILocation(line: 41, column: 14, scope: !30)
+!30 = distinct !DILexicalBlock(scope: !27, file: !3, line: 39, column: 14)
+!31 = !DILocation(line: 41, column: 5, scope: !30)
+!32 = !DILocation(line: 42, column: 16, scope: !33)
+!33 = distinct !DILexicalBlock(scope: !30, file: !3, line: 41, column: 19)
+!38 = !DILocation(line: 42, column: 12, scope: !33)
+!39 = !DILocation(line: 42, column: 9, scope: !33)
+!42 = !DILocation(line: 48, column: 14, scope: !43)
+!43 = distinct !DILexicalBlock(scope: !27, file: !3, line: 46, column: 8)
+!44 = !DILocation(line: 48, column: 5, scope: !43)
+!45 = !DILocation(line: 49, column: 16, scope: !46)
+!46 = distinct !DILexicalBlock(scope: !43, file: !3, line: 48, column: 19)
+!47 = !DILocation(line: 49, column: 12, scope: !46)
+!48 = !DILocation(line: 49, column: 9, scope: !46)
+!51 = !DILocation(line: 53, column: 3, scope: !22)

From b9fa16f2234edddf6e0f449a0e7b646ee9046cf3 Mon Sep 17 00:00:00 2001
From: Hongtao Yu <hoy@fb.com>
Date: Fri, 22 Jan 2021 15:52:46 -0800
Subject: [PATCH 054/318] [CSSPGO] Passing the clang driver switch
 -fpseudo-probe-for-profiling to the linker.

As titled.

Reviewed By: wmi, wenlei

Differential Revision: https://reviews.llvm.org/D95271

(cherry picked from commit d3e2e3740d0730cb6788c771bb01a8f3e935bf2e)
---
 clang/include/clang/Driver/Options.td      |  2 +-
 clang/lib/Driver/ToolChains/CommonArgs.cpp |  5 +++++
 clang/test/Driver/pseudo-probe-lto.c       | 10 ++++++++++
 3 files changed, 16 insertions(+), 1 deletion(-)
 create mode 100644 clang/test/Driver/pseudo-probe-lto.c

diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 42c5319041d0..1f6c13d5cc96 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -1147,7 +1147,7 @@ def fprofile_update_EQ : Joined<["-"], "fprofile-update=">,
 defm pseudo_probe_for_profiling : BoolFOption<"pseudo-probe-for-profiling",
   CodeGenOpts<"PseudoProbeForProfiling">, DefaultFalse,
   PosFlag<SetTrue, [], "Emit">, NegFlag<SetFalse, [], "Do not emit">,
-  BothFlags<[NoXarchOption, CC1Option], " pseudo probes for sample profiler">>;
+  BothFlags<[NoXarchOption, CC1Option], " pseudo probes for sample profiling">>;
 def forder_file_instrumentation : Flag<["-"], "forder-file-instrumentation">,
     Group<f_Group>, Flags<[CC1Option, CoreOption]>,
     HelpText<"Generate instrumented code to collect order file into default.profraw file (overridden by '=' form of option or LLVM_PROFILE_FILE env var)">;
diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp
index 6a95aa5ec628..bcaea71dca94 100644
--- a/clang/lib/Driver/ToolChains/CommonArgs.cpp
+++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp
@@ -605,6 +605,11 @@ void tools::addLTOOptions(const ToolChain &ToolChain, const ArgList &Args,
       CmdArgs.push_back("-plugin-opt=new-pass-manager");
   }
 
+  // Pass an option to enable pseudo probe emission.
+  if (Args.hasFlag(options::OPT_fpseudo_probe_for_profiling,
+                   options::OPT_fno_pseudo_probe_for_profiling, false))
+    CmdArgs.push_back("-plugin-opt=pseudo-probe-for-profiling");
+
   // Setup statistics file output.
   SmallString<128> StatsFile = getStatsFileName(Args, Output, Input, D);
   if (!StatsFile.empty())
diff --git a/clang/test/Driver/pseudo-probe-lto.c b/clang/test/Driver/pseudo-probe-lto.c
new file mode 100644
index 000000000000..e319b8c0098b
--- /dev/null
+++ b/clang/test/Driver/pseudo-probe-lto.c
@@ -0,0 +1,10 @@
+// RUN: touch %t.o
+// RUN: %clang -### %t.o -target x86_64-unknown-linux -flto -fpseudo-probe-for-profiling 2>&1 | FileCheck %s --check-prefix=PROBE
+// RUN: %clang -### %t.o -target x86_64-unknown-linux -flto=thin -fpseudo-probe-for-profiling 2>&1 | FileCheck %s --check-prefix=PROBE
+// RUN: %clang -### %t.o -target x86_64-unknown-linux -flto -fno-pseudo-probe-for-profiling -fpseudo-probe-for-profiling 2>&1 | FileCheck %s --check-prefix=PROBE
+// RUN: %clang -### %t.o -target x86_64-unknown-linux -flto 2>&1 | FileCheck %s --check-prefix=NOPROBE
+// RUN: %clang -### %t.o -target x86_64-unknown-linux -flto -fno-pseudo-probe-for-profiling 2>&1 | FileCheck %s --check-prefix=NOPROBE
+// RUN: %clang -### %t.o -target x86_64-unknown-linux -flto -fpseudo-probe-for-profiling -fno-pseudo-probe-for-profiling 2>&1 | FileCheck %s --check-prefix=NOPROBE
+
+// PROBE: -plugin-opt=pseudo-probe-for-profiling
+// NOPROBE-NOT: -plugin-opt=pseudo-probe-for-profiling

From 27ff658e97528540e4425c0cb6400f3e5355f53a Mon Sep 17 00:00:00 2001
From: Wenlei He <aktoon@gmail.com>
Date: Sun, 3 Jan 2021 16:43:06 -0800
Subject: [PATCH 055/318] [CSSPGO] Call site prioritized inlining for sample
 PGO

This change implemented call site prioritized BFS profile guided inlining for sample profile loader. The new inlining strategy maximize the benefit of context-sensitive profile as mentioned in the follow up discussion of CSSPGO RFC. The change will not affect today's AutoFDO as it's opt-in. CSSPGO now defaults to the new FDO inliner, but can fall back to today's replay inliner using a switch (`-sample-profile-prioritized-inline=0`).

Motivation

With baseline AutoFDO, the inliner in sample profile loader only replays previous inlining, and the use of profile is only for pruning previous inlining that turned out to be cold. Due to the nature of replay, the FDO inliner is simple with hotness being the only decision factor. It has the following limitations that we're improving now for CSSPGO.
 - It doesn't take inline candidate size into account. Since it's doing replay, the size growth is bounded by previous CGSCC inlining. With context-sensitive profile, FDO inliner is no longer limited by previous inlining, so we need to take size into account to avoid significant size bloat.
 - The way it looks at hotness is not accurate. It uses total samples in an inlinee as proxy for hotness, while what really matters for an inline decision is the call site count. This is an unfortunate fall back because call site count and callee entry count are not reliable due to dwarf based correlation, especially for inlinees. Now paired with pseudo-probe, we have accurate call site count and callee's entry count, so we can use that to gauge hotness more accurately.
 - It treats all call sites from a block as hot as long as there's one call site considered hot. This is normally true, but since total samples is used as hotness proxy, this transitiveness within block magnifies the inacurate hotness heuristic. With pseduo-probe and the change above, this is no longer an issue for CSSPGO.

New FDO Inliner

Putting all the requirement for CSSPGO together, we need a top-down call site prioritized BFS inliner. Here're reasons why each component is needed.
 - Top-down: We need a top-down inliner to better leverage context-sensitive profile, so inlining is driven by accurate context profile, and post-inline is also accurate. This is already implemented in https://reviews.llvm.org/D70655.
 - Size Cap: For top-down inliner, taking function size into account for inline decision alone isn't sufficient to control size growth. We also need to explicitly cap size growth because with top-down inlining, we can grow inliner size significantly with large number of smaller inlinees even if each individually passes the cost/size check.
 - Prioritize call sites: With size cap, inlining order also becomes important, because if we stop inlining due to size budget limit, we'd want to use budget towards the most beneficial call sites.
 - BFS inline: Same as call site prioritization, if we stop inlining due to size budget limit, we want a balanced inline tree, rather than going deep on one call path.

Note that the new inliner avoids repeatedly evaluating same set of call site, so it should help with compile time too. For this reason, we could transition today's FDO inliner to use a queue with equal priority to avoid wasted reevaluation of same call site (TODO).

Speculative indirect call promotion and inlining is also supported now with CSSPGO just like baseline AutoFDO.

Tunings and knobs

I created tuning knobs for size growth/cap control, and for hot threshold separate from CGSCC inliner. The default values are selected based on initial tuning with CSSPGO.

Results

Evaluated with an internal LLVM fork couple months ago, plus another change to adjust hot-threshold cutoff for context profile (will send up after this one), the new inliner show ~1% geomean perf win on spec2006 with CSSPGO, while reducing code size too. The measurement was done using train-train setup, MonoLTO w/ new pass manager and pseudo-probe. Note that this is just a starting point - we hope that the new inliner will open up more opportunity with CSSPGO, but it will certainly take more time and effort to make it fully calibrated and ready for bigger workloads (we're working on it).

Differential Revision: https://reviews.llvm.org/D94001

(cherry picked from commit 6bae5973c476e16dbbc82030d65c7859a6628e89)
---
 .../Transforms/IPO/SampleContextTracker.h     |   6 +-
 .../Transforms/IPO/SampleContextTracker.cpp   |  73 ++-
 llvm/lib/Transforms/IPO/SampleProfile.cpp     | 443 ++++++++++++++++--
 .../Inputs/indirect-call-csspgo.prof          |  10 +
 .../SampleProfile/csspgo-inline-debug.ll      | 166 +++++++
 .../SampleProfile/csspgo-inline-icall.ll      |  63 +++
 .../Transforms/SampleProfile/csspgo-inline.ll | 180 +++++++
 .../profile-context-tracker-debug.ll          |  25 +-
 .../SampleProfile/profile-context-tracker.ll  |  15 +-
 .../SampleProfile/pseudo-probe-inline.ll      |   4 +-
 10 files changed, 904 insertions(+), 81 deletions(-)
 create mode 100644 llvm/test/Transforms/SampleProfile/Inputs/indirect-call-csspgo.prof
 create mode 100644 llvm/test/Transforms/SampleProfile/csspgo-inline-debug.ll
 create mode 100644 llvm/test/Transforms/SampleProfile/csspgo-inline-icall.ll
 create mode 100644 llvm/test/Transforms/SampleProfile/csspgo-inline.ll

diff --git a/llvm/include/llvm/Transforms/IPO/SampleContextTracker.h b/llvm/include/llvm/Transforms/IPO/SampleContextTracker.h
index 5b2600144fa3..526e141838c4 100644
--- a/llvm/include/llvm/Transforms/IPO/SampleContextTracker.h
+++ b/llvm/include/llvm/Transforms/IPO/SampleContextTracker.h
@@ -23,6 +23,7 @@
 #include "llvm/ProfileData/SampleProf.h"
 #include <list>
 #include <map>
+#include <vector>
 
 using namespace llvm;
 using namespace sampleprof;
@@ -42,7 +43,7 @@ class ContextTrieNode {
         CallSiteLoc(CallLoc){};
   ContextTrieNode *getChildContext(const LineLocation &CallSite,
                                    StringRef CalleeName);
-  ContextTrieNode *getChildContext(const LineLocation &CallSite);
+  ContextTrieNode *getHottestChildContext(const LineLocation &CallSite);
   ContextTrieNode *getOrCreateChildContext(const LineLocation &CallSite,
                                            StringRef CalleeName,
                                            bool AllowCreate = true);
@@ -94,6 +95,9 @@ class SampleContextTracker {
   // call-site. The full context is identified by location of call instruction.
   FunctionSamples *getCalleeContextSamplesFor(const CallBase &Inst,
                                               StringRef CalleeName);
+  // Get samples for indirect call targets for call site at given location.
+  std::vector<const FunctionSamples *>
+  getIndirectCalleeContextSamplesFor(const DILocation *DIL);
   // Query context profile for a given location. The full context
   // is identified by input DILocation.
   FunctionSamples *getContextSamplesFor(const DILocation *DIL);
diff --git a/llvm/lib/Transforms/IPO/SampleContextTracker.cpp b/llvm/lib/Transforms/IPO/SampleContextTracker.cpp
index fad72985dedd..41d7f363e1a4 100644
--- a/llvm/lib/Transforms/IPO/SampleContextTracker.cpp
+++ b/llvm/lib/Transforms/IPO/SampleContextTracker.cpp
@@ -30,7 +30,7 @@ namespace llvm {
 ContextTrieNode *ContextTrieNode::getChildContext(const LineLocation &CallSite,
                                                   StringRef CalleeName) {
   if (CalleeName.empty())
-    return getChildContext(CallSite);
+    return getHottestChildContext(CallSite);
 
   uint32_t Hash = nodeHash(CalleeName, CallSite);
   auto It = AllChildContext.find(Hash);
@@ -40,18 +40,22 @@ ContextTrieNode *ContextTrieNode::getChildContext(const LineLocation &CallSite,
 }
 
 ContextTrieNode *
-ContextTrieNode::getChildContext(const LineLocation &CallSite) {
+ContextTrieNode::getHottestChildContext(const LineLocation &CallSite) {
   // CSFDO-TODO: This could be slow, change AllChildContext so we can
   // do point look up for child node by call site alone.
-  // CSFDO-TODO: Return the child with max count for indirect call
+  // Retrieve the child node with max count for indirect call
   ContextTrieNode *ChildNodeRet = nullptr;
+  uint64_t MaxCalleeSamples = 0;
   for (auto &It : AllChildContext) {
     ContextTrieNode &ChildNode = It.second;
-    if (ChildNode.CallSiteLoc == CallSite) {
-      if (ChildNodeRet)
-        return nullptr;
-      else
-        ChildNodeRet = &ChildNode;
+    if (ChildNode.CallSiteLoc != CallSite)
+      continue;
+    FunctionSamples *Samples = ChildNode.getFunctionSamples();
+    if (!Samples)
+      continue;
+    if (Samples->getTotalSamples() > MaxCalleeSamples) {
+      ChildNodeRet = &ChildNode;
+      MaxCalleeSamples = Samples->getTotalSamples();
     }
   }
 
@@ -191,12 +195,12 @@ FunctionSamples *
 SampleContextTracker::getCalleeContextSamplesFor(const CallBase &Inst,
                                                  StringRef CalleeName) {
   LLVM_DEBUG(dbgs() << "Getting callee context for instr: " << Inst << "\n");
-  // CSFDO-TODO: We use CalleeName to differentiate indirect call
-  // We need to get sample for indirect callee too.
   DILocation *DIL = Inst.getDebugLoc();
   if (!DIL)
     return nullptr;
 
+  // For indirect call, CalleeName will be empty, in which case the context
+  // profile for callee with largest total samples will be returned.
   ContextTrieNode *CalleeContext = getCalleeContextFor(DIL, CalleeName);
   if (CalleeContext) {
     FunctionSamples *FSamples = CalleeContext->getFunctionSamples();
@@ -209,6 +213,26 @@ SampleContextTracker::getCalleeContextSamplesFor(const CallBase &Inst,
   return nullptr;
 }
 
+std::vector<const FunctionSamples *>
+SampleContextTracker::getIndirectCalleeContextSamplesFor(
+    const DILocation *DIL) {
+  std::vector<const FunctionSamples *> R;
+  if (!DIL)
+    return R;
+
+  ContextTrieNode *CallerNode = getContextFor(DIL);
+  LineLocation CallSite = FunctionSamples::getCallSiteIdentifier(DIL);
+  for (auto &It : CallerNode->getAllChildContext()) {
+    ContextTrieNode &ChildNode = It.second;
+    if (ChildNode.getCallSiteLoc() != CallSite)
+      continue;
+    if (FunctionSamples *CalleeSamples = ChildNode.getFunctionSamples())
+      R.push_back(CalleeSamples);
+  }
+
+  return R;
+}
+
 FunctionSamples *
 SampleContextTracker::getContextSamplesFor(const DILocation *DIL) {
   assert(DIL && "Expect non-null location");
@@ -295,11 +319,6 @@ void SampleContextTracker::promoteMergeContextSamplesTree(
     const Instruction &Inst, StringRef CalleeName) {
   LLVM_DEBUG(dbgs() << "Promoting and merging context tree for instr: \n"
                     << Inst << "\n");
-  // CSFDO-TODO: We also need to promote context profile from indirect
-  // calls. We won't have callee names from those from call instr.
-  if (CalleeName.empty())
-    return;
-
   // Get the caller context for the call instruction, we don't use callee
   // name from call because there can be context from indirect calls too.
   DILocation *DIL = Inst.getDebugLoc();
@@ -309,6 +328,22 @@ void SampleContextTracker::promoteMergeContextSamplesTree(
 
   // Get the context that needs to be promoted
   LineLocation CallSite = FunctionSamples::getCallSiteIdentifier(DIL);
+  // For indirect call, CalleeName will be empty, in which case we need to
+  // promote all non-inlined child context profiles.
+  if (CalleeName.empty()) {
+    for (auto &It : CallerNode->getAllChildContext()) {
+      ContextTrieNode *NodeToPromo = &It.second;
+      if (CallSite != NodeToPromo->getCallSiteLoc())
+        continue;
+      FunctionSamples *FromSamples = NodeToPromo->getFunctionSamples();
+      if (FromSamples && FromSamples->getContext().hasState(InlinedContext))
+        continue;
+      promoteMergeContextSamplesTree(*NodeToPromo);
+    }
+    return;
+  }
+
+  // Get the context for the given callee that needs to be promoted
   ContextTrieNode *NodeToPromo =
       CallerNode->getChildContext(CallSite, CalleeName);
   if (!NodeToPromo)
@@ -328,6 +363,8 @@ ContextTrieNode &SampleContextTracker::promoteMergeContextSamplesTree(
   LLVM_DEBUG(dbgs() << "  Found context tree root to promote: "
                     << FromSamples->getContext() << "\n");
 
+  assert(!FromSamples->getContext().hasState(InlinedContext) &&
+         "Shouldn't promote inlined context profile");
   StringRef ContextStrToRemove = FromSamples->getContext().getCallingContext();
   return promoteMergeContextSamplesTree(NodeToPromo, RootContext,
                                         ContextStrToRemove);
@@ -360,14 +397,12 @@ SampleContextTracker::getCalleeContextFor(const DILocation *DIL,
                                           StringRef CalleeName) {
   assert(DIL && "Expect non-null location");
 
-  // CSSPGO-TODO: need to support indirect callee
-  if (CalleeName.empty())
-    return nullptr;
-
   ContextTrieNode *CallContext = getContextFor(DIL);
   if (!CallContext)
     return nullptr;
 
+  // When CalleeName is empty, the child context profile with max
+  // total samples will be returned.
   return CallContext->getChildContext(
       FunctionSamples::getCallSiteIdentifier(DIL), CalleeName);
 }
diff --git a/llvm/lib/Transforms/IPO/SampleProfile.cpp b/llvm/lib/Transforms/IPO/SampleProfile.cpp
index 264ac4065e8c..665c4078f3ee 100644
--- a/llvm/lib/Transforms/IPO/SampleProfile.cpp
+++ b/llvm/lib/Transforms/IPO/SampleProfile.cpp
@@ -26,6 +26,7 @@
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/DenseSet.h"
 #include "llvm/ADT/None.h"
+#include "llvm/ADT/PriorityQueue.h"
 #include "llvm/ADT/SCCIterator.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallSet.h"
@@ -108,6 +109,14 @@ STATISTIC(NumMismatchedProfile,
           "Number of functions with CFG mismatched profile");
 STATISTIC(NumMatchedProfile, "Number of functions with CFG matched profile");
 
+STATISTIC(NumCSInlinedHitMinLimit,
+          "Number of functions with FDO inline stopped due to min size limit");
+STATISTIC(NumCSInlinedHitMaxLimit,
+          "Number of functions with FDO inline stopped due to max size limit");
+STATISTIC(
+    NumCSInlinedHitGrowthLimit,
+    "Number of functions with FDO inline stopped due to growth size limit");
+
 // Command line option to specify the file to read samples from. This is
 // mainly used for debugging.
 static cl::opt<std::string> SampleProfileFile(
@@ -171,6 +180,38 @@ static cl::opt<bool> ProfileSizeInline(
     cl::desc("Inline cold call sites in profile loader if it's beneficial "
              "for code size."));
 
+static cl::opt<int> ProfileInlineGrowthLimit(
+    "sample-profile-inline-growth-limit", cl::Hidden, cl::init(12),
+    cl::desc("The size growth ratio limit for proirity-based sample profile "
+             "loader inlining."));
+
+static cl::opt<int> ProfileInlineLimitMin(
+    "sample-profile-inline-limit-min", cl::Hidden, cl::init(100),
+    cl::desc("The lower bound of size growth limit for "
+             "proirity-based sample profile loader inlining."));
+
+static cl::opt<int> ProfileInlineLimitMax(
+    "sample-profile-inline-limit-max", cl::Hidden, cl::init(10000),
+    cl::desc("The upper bound of size growth limit for "
+             "proirity-based sample profile loader inlining."));
+
+static cl::opt<int> ProfileICPThreshold(
+    "sample-profile-icp-threshold", cl::Hidden, cl::init(5),
+    cl::desc(
+        "Relative hotness threshold for indirect "
+        "call promotion in proirity-based sample profile loader inlining."));
+
+static cl::opt<int> SampleHotCallSiteThreshold(
+    "sample-profile-hot-inline-threshold", cl::Hidden, cl::init(3000),
+    cl::desc("Hot callsite threshold for proirity-based sample profile loader "
+             "inlining."));
+
+static cl::opt<bool> CallsitePrioritizedInline(
+    "sample-profile-prioritized-inline", cl::Hidden, cl::ZeroOrMore,
+    cl::init(false),
+    cl::desc("Use call site prioritized inlining for sample profile loader."
+             "Currently only CSSPGO is supported."));
+
 static cl::opt<int> SampleColdCallSiteThreshold(
     "sample-profile-cold-inline-threshold", cl::Hidden, cl::init(45),
     cl::desc("Threshold for inlining cold callsites"));
@@ -313,6 +354,31 @@ class GUIDToFuncNameMapper {
   DenseMap<uint64_t, StringRef> &CurrentGUIDToFuncNameMap;
 };
 
+// Inline candidate used by iterative callsite prioritized inliner
+struct InlineCandidate {
+  CallBase *CallInstr;
+  const FunctionSamples *CalleeSamples;
+  uint64_t CallsiteCount;
+};
+
+// Inline candidate comparer using call site weight
+struct CandidateComparer {
+  bool operator()(const InlineCandidate &LHS, const InlineCandidate &RHS) {
+    if (LHS.CallsiteCount != RHS.CallsiteCount)
+      return LHS.CallsiteCount < RHS.CallsiteCount;
+
+    // Tie breaker using GUID so we have stable/deterministic inlining order
+    assert(LHS.CalleeSamples && RHS.CalleeSamples &&
+           "Expect non-null FunctionSamples");
+    return LHS.CalleeSamples->getGUID(LHS.CalleeSamples->getName()) <
+           RHS.CalleeSamples->getGUID(RHS.CalleeSamples->getName());
+  }
+};
+
+using CandidateQueue =
+    PriorityQueue<InlineCandidate, std::vector<InlineCandidate>,
+                  CandidateComparer>;
+
 /// Sample profile pass.
 ///
 /// This pass reads profile data from the file specified by
@@ -350,9 +416,23 @@ class SampleProfileLoader {
   findIndirectCallFunctionSamples(const Instruction &I, uint64_t &Sum) const;
   mutable DenseMap<const DILocation *, const FunctionSamples *> DILocation2SampleMap;
   const FunctionSamples *findFunctionSamples(const Instruction &I) const;
-  bool inlineCallInstruction(CallBase &CB);
+  CallBase *tryPromoteIndirectCall(Function &F, StringRef CalleeName,
+                                   uint64_t &Sum, uint64_t Count, CallBase *I,
+                                   const char *&Reason);
+  bool inlineCallInstruction(CallBase &CB,
+                             const FunctionSamples *CalleeSamples);
   bool inlineHotFunctions(Function &F,
                           DenseSet<GlobalValue::GUID> &InlinedGUIDs);
+  // Helper functions call-site prioritized BFS inliner
+  // Will change the main FDO inliner to be work list based directly in
+  // upstream, then merge this change with that and remove the duplication.
+  InlineCost shouldInlineCandidate(InlineCandidate &Candidate);
+  bool getInlineCandidate(InlineCandidate *NewCandidate, CallBase *CB);
+  bool tryInlineCandidate(InlineCandidate &Candidate,
+                          SmallVector<CallBase *, 8> &InlinedCallSites);
+  bool
+  inlineHotFunctionsWithPriority(Function &F,
+                                 DenseSet<GlobalValue::GUID> &InlinedGUIDs);
   // Inline cold/small functions in addition to hot ones
   bool shouldInlineColdCallee(CallBase &CallInst);
   void emitOptimizationRemarksForInlineCandidates(
@@ -918,6 +998,31 @@ SampleProfileLoader::findIndirectCallFunctionSamples(
     return R;
   }
 
+  auto FSCompare = [](const FunctionSamples *L, const FunctionSamples *R) {
+    assert(L && R && "Expect non-null FunctionSamples");
+    if (L->getEntrySamples() != R->getEntrySamples())
+      return L->getEntrySamples() > R->getEntrySamples();
+    return FunctionSamples::getGUID(L->getName()) <
+           FunctionSamples::getGUID(R->getName());
+  };
+
+  if (ProfileIsCS) {
+    auto CalleeSamples =
+        ContextTracker->getIndirectCalleeContextSamplesFor(DIL);
+    if (CalleeSamples.empty())
+      return R;
+
+    // For CSSPGO, we only use target context profile's entry count
+    // as that already includes both inlined callee and non-inlined ones..
+    Sum = 0;
+    for (const auto *const FS : CalleeSamples) {
+      Sum += FS->getEntrySamples();
+      R.push_back(FS);
+    }
+    llvm::sort(R, FSCompare);
+    return R;
+  }
+
   const FunctionSamples *FS = findFunctionSamples(Inst);
   if (FS == nullptr)
     return R;
@@ -935,12 +1040,7 @@ SampleProfileLoader::findIndirectCallFunctionSamples(
       Sum += NameFS.second.getEntrySamples();
       R.push_back(&NameFS.second);
     }
-    llvm::sort(R, [](const FunctionSamples *L, const FunctionSamples *R) {
-      if (L->getEntrySamples() != R->getEntrySamples())
-        return L->getEntrySamples() > R->getEntrySamples();
-      return FunctionSamples::getGUID(L->getName()) <
-             FunctionSamples::getGUID(R->getName());
-    });
+    llvm::sort(R, FSCompare);
   }
   return R;
 }
@@ -977,7 +1077,32 @@ SampleProfileLoader::findFunctionSamples(const Instruction &Inst) const {
   return it.first->second;
 }
 
-bool SampleProfileLoader::inlineCallInstruction(CallBase &CB) {
+CallBase *
+SampleProfileLoader::tryPromoteIndirectCall(Function &F, StringRef CalleeName,
+                                            uint64_t &Sum, uint64_t Count,
+                                            CallBase *I, const char *&Reason) {
+  Reason = "Callee function not available";
+  // R->getValue() != &F is to prevent promoting a recursive call.
+  // If it is a recursive call, we do not inline it as it could bloat
+  // the code exponentially. There is way to better handle this, e.g.
+  // clone the caller first, and inline the cloned caller if it is
+  // recursive. As llvm does not inline recursive calls, we will
+  // simply ignore it instead of handling it explicitly.
+  auto R = SymbolMap.find(CalleeName);
+  if (R != SymbolMap.end() && R->getValue() &&
+      !R->getValue()->isDeclaration() && R->getValue()->getSubprogram() &&
+      R->getValue()->hasFnAttribute("use-sample-profile") &&
+      R->getValue() != &F && isLegalToPromote(*I, R->getValue(), &Reason)) {
+    auto *DI =
+        &pgo::promoteIndirectCall(*I, R->getValue(), Count, Sum, false, ORE);
+    Sum -= Count;
+    return DI;
+  }
+  return nullptr;
+}
+
+bool SampleProfileLoader::inlineCallInstruction(
+    CallBase &CB, const FunctionSamples *CalleeSamples) {
   if (ExternalInlineAdvisor) {
     auto Advice = ExternalInlineAdvisor->getAdvice(CB);
     if (!Advice->isInliningRecommended()) {
@@ -1012,6 +1137,9 @@ bool SampleProfileLoader::inlineCallInstruction(CallBase &CB) {
     // The call to InlineFunction erases I, so we can't pass it here.
     emitInlinedInto(*ORE, DLoc, BB, *CalledFunction, *BB->getParent(), Cost,
                     true, CSINLINE_DEBUG);
+    if (ProfileIsCS)
+      ContextTracker->markContextSamplesInlined(CalleeSamples);
+    ++NumCSInlined;
     return true;
   }
   return false;
@@ -1129,34 +1257,17 @@ bool SampleProfileLoader::inlineHotFunctions(
           if (!callsiteIsHot(FS, PSI))
             continue;
 
-          const char *Reason = "Callee function not available";
-          // R->getValue() != &F is to prevent promoting a recursive call.
-          // If it is a recursive call, we do not inline it as it could bloat
-          // the code exponentially. There is way to better handle this, e.g.
-          // clone the caller first, and inline the cloned caller if it is
-          // recursive. As llvm does not inline recursive calls, we will
-          // simply ignore it instead of handling it explicitly.
+          const char *Reason = nullptr;
           auto CalleeFunctionName = FS->getFuncName();
-          auto R = SymbolMap.find(CalleeFunctionName);
-          if (R != SymbolMap.end() && R->getValue() &&
-              !R->getValue()->isDeclaration() &&
-              R->getValue()->getSubprogram() &&
-              R->getValue()->hasFnAttribute("use-sample-profile") &&
-              R->getValue() != &F &&
-              isLegalToPromote(*I, R->getValue(), &Reason)) {
-            uint64_t C = FS->getEntrySamples();
-            auto &DI =
-                pgo::promoteIndirectCall(*I, R->getValue(), C, Sum, false, ORE);
-            Sum -= C;
+          if (CallBase *DI =
+                  tryPromoteIndirectCall(F, CalleeFunctionName, Sum,
+                                         FS->getEntrySamples(), I, Reason)) {
             PromotedInsns.insert(I);
             // If profile mismatches, we should not attempt to inline DI.
             if ((isa<CallInst>(DI) || isa<InvokeInst>(DI)) &&
-                inlineCallInstruction(cast<CallBase>(DI))) {
-              if (ProfileIsCS)
-                ContextTracker->markContextSamplesInlined(FS);
+                inlineCallInstruction(cast<CallBase>(*DI), FS)) {
               localNotInlinedCallSites.erase(I);
               LocalChanged = true;
-              ++NumCSInlined;
             }
           } else {
             LLVM_DEBUG(dbgs()
@@ -1166,13 +1277,11 @@ bool SampleProfileLoader::inlineHotFunctions(
         }
       } else if (CalledFunction && CalledFunction->getSubprogram() &&
                  !CalledFunction->isDeclaration()) {
-        if (inlineCallInstruction(*I)) {
-          if (ProfileIsCS)
-            ContextTracker->markContextSamplesInlined(
-                localNotInlinedCallSites[I]);
+        if (inlineCallInstruction(*I, localNotInlinedCallSites.count(I)
+                                          ? localNotInlinedCallSites[I]
+                                          : nullptr)) {
           localNotInlinedCallSites.erase(I);
           LocalChanged = true;
-          ++NumCSInlined;
         }
       } else if (LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink) {
         findCalleeFunctionSamples(*I)->findInlinedFunctions(
@@ -1186,6 +1295,11 @@ bool SampleProfileLoader::inlineHotFunctions(
     }
   }
 
+  // For CS profile, profile for not inlined context will be merged when
+  // base profile is being trieved
+  if (ProfileIsCS)
+    return Changed;
+
   // Accumulate not inlined callsite information into notInlinedSamples
   for (const auto &Pair : localNotInlinedCallSites) {
     CallBase *I = Pair.getFirst();
@@ -1232,6 +1346,254 @@ bool SampleProfileLoader::inlineHotFunctions(
   return Changed;
 }
 
+bool SampleProfileLoader::tryInlineCandidate(
+    InlineCandidate &Candidate, SmallVector<CallBase *, 8> &InlinedCallSites) {
+
+  CallBase &CB = *Candidate.CallInstr;
+  Function *CalledFunction = CB.getCalledFunction();
+  assert(CalledFunction && "Expect a callee with definition");
+  DebugLoc DLoc = CB.getDebugLoc();
+  BasicBlock *BB = CB.getParent();
+
+  InlineCost Cost = shouldInlineCandidate(Candidate);
+  if (Cost.isNever()) {
+    ORE->emit(OptimizationRemarkAnalysis(CSINLINE_DEBUG, "InlineFail", DLoc, BB)
+              << "incompatible inlining");
+    return false;
+  }
+
+  if (!Cost)
+    return false;
+
+  InlineFunctionInfo IFI(nullptr, GetAC);
+  if (InlineFunction(CB, IFI).isSuccess()) {
+    // The call to InlineFunction erases I, so we can't pass it here.
+    emitInlinedInto(*ORE, DLoc, BB, *CalledFunction, *BB->getParent(), Cost,
+                    true, CSINLINE_DEBUG);
+
+    // Now populate the list of newly exposed call sites.
+    InlinedCallSites.clear();
+    for (auto &I : IFI.InlinedCallSites)
+      InlinedCallSites.push_back(I);
+
+    if (ProfileIsCS)
+      ContextTracker->markContextSamplesInlined(Candidate.CalleeSamples);
+    ++NumCSInlined;
+    return true;
+  }
+  return false;
+}
+
+bool SampleProfileLoader::getInlineCandidate(InlineCandidate *NewCandidate,
+                                             CallBase *CB) {
+  assert(CB && "Expect non-null call instruction");
+
+  if (isa<IntrinsicInst>(CB))
+    return false;
+
+  // Find the callee's profile. For indirect call, find hottest target profile.
+  const FunctionSamples *CalleeSamples = findCalleeFunctionSamples(*CB);
+  if (!CalleeSamples)
+    return false;
+
+  uint64_t CallsiteCount = 0;
+  ErrorOr<uint64_t> Weight = getBlockWeight(CB->getParent());
+  if (Weight)
+    CallsiteCount = Weight.get();
+  if (CalleeSamples)
+    CallsiteCount = std::max(CallsiteCount, CalleeSamples->getEntrySamples());
+
+  *NewCandidate = {CB, CalleeSamples, CallsiteCount};
+  return true;
+}
+
+InlineCost
+SampleProfileLoader::shouldInlineCandidate(InlineCandidate &Candidate) {
+  assert(ProfileIsCS && "Prioritiy based inliner only works with CSSPGO now");
+
+  std::unique_ptr<InlineAdvice> Advice = nullptr;
+  if (ExternalInlineAdvisor) {
+    Advice = ExternalInlineAdvisor->getAdvice(*Candidate.CallInstr);
+    if (!Advice->isInliningRecommended()) {
+      Advice->recordUnattemptedInlining();
+      return InlineCost::getNever("not previously inlined");
+    }
+    Advice->recordInlining();
+    return InlineCost::getAlways("previously inlined");
+  }
+
+  // Adjust threshold based on call site hotness, only do this for callsite
+  // prioritized inliner because otherwise cost-benefit check is done earlier.
+  int SampleThreshold = SampleColdCallSiteThreshold;
+  if (CallsitePrioritizedInline) {
+    if (Candidate.CallsiteCount > PSI->getHotCountThreshold())
+      SampleThreshold = SampleHotCallSiteThreshold;
+    else if (!ProfileSizeInline)
+      return InlineCost::getNever("cold callsite");
+  }
+
+  Function *Callee = Candidate.CallInstr->getCalledFunction();
+  assert(Callee && "Expect a definition for inline candidate of direct call");
+
+  InlineParams Params = getInlineParams();
+  Params.ComputeFullInlineCost = true;
+  // Checks if there is anything in the reachable portion of the callee at
+  // this callsite that makes this inlining potentially illegal. Need to
+  // set ComputeFullInlineCost, otherwise getInlineCost may return early
+  // when cost exceeds threshold without checking all IRs in the callee.
+  // The acutal cost does not matter because we only checks isNever() to
+  // see if it is legal to inline the callsite.
+  InlineCost Cost = getInlineCost(*Candidate.CallInstr, Callee, Params,
+                                  GetTTI(*Callee), GetAC, GetTLI);
+
+  // For old FDO inliner, we inline the call site as long as cost is not
+  // "Never". The cost-benefit check is done earlier.
+  if (!CallsitePrioritizedInline) {
+    if (Cost.isNever())
+      return Cost;
+    return InlineCost::getAlways("hot callsite previously inlined");
+  }
+
+  // Honor always inline and never inline from call analyzer
+  if (Cost.isNever() || Cost.isAlways())
+    return Cost;
+
+  // Otherwise only use the cost from call analyzer, but overwite threshold with
+  // Sample PGO threshold.
+  return InlineCost::get(Cost.getCost(), SampleThreshold);
+}
+
+bool SampleProfileLoader::inlineHotFunctionsWithPriority(
+    Function &F, DenseSet<GlobalValue::GUID> &InlinedGUIDs) {
+  DenseSet<Instruction *> PromotedInsns;
+  assert(ProfileIsCS && "Prioritiy based inliner only works with CSSPGO now");
+
+  // ProfAccForSymsInList is used in callsiteIsHot. The assertion makes sure
+  // Profile symbol list is ignored when profile-sample-accurate is on.
+  assert((!ProfAccForSymsInList ||
+          (!ProfileSampleAccurate &&
+           !F.hasFnAttribute("profile-sample-accurate"))) &&
+         "ProfAccForSymsInList should be false when profile-sample-accurate "
+         "is enabled");
+
+  // Populating worklist with initial call sites from root inliner, along
+  // with call site weights.
+  CandidateQueue CQueue;
+  InlineCandidate NewCandidate;
+  for (auto &BB : F) {
+    for (auto &I : BB.getInstList()) {
+      auto *CB = dyn_cast<CallBase>(&I);
+      if (!CB)
+        continue;
+      if (getInlineCandidate(&NewCandidate, CB))
+        CQueue.push(NewCandidate);
+    }
+  }
+
+  // Cap the size growth from profile guided inlining. This is needed even
+  // though cost of each inline candidate already accounts for callee size,
+  // because with top-down inlining, we can grow inliner size significantly
+  // with large number of smaller inlinees each pass the cost check.
+  assert(ProfileInlineLimitMax >= ProfileInlineLimitMin &&
+         "Max inline size limit should not be smaller than min inline size "
+         "limit.");
+  unsigned SizeLimit = F.getInstructionCount() * ProfileInlineGrowthLimit;
+  SizeLimit = std::min(SizeLimit, (unsigned)ProfileInlineLimitMax);
+  SizeLimit = std::max(SizeLimit, (unsigned)ProfileInlineLimitMin);
+  if (ExternalInlineAdvisor)
+    SizeLimit = std::numeric_limits<unsigned>::max();
+
+  // Perform iterative BFS call site prioritized inlining
+  bool Changed = false;
+  while (!CQueue.empty() && F.getInstructionCount() < SizeLimit) {
+    InlineCandidate Candidate = CQueue.top();
+    CQueue.pop();
+    CallBase *I = Candidate.CallInstr;
+    Function *CalledFunction = I->getCalledFunction();
+
+    if (CalledFunction == &F)
+      continue;
+    if (I->isIndirectCall()) {
+      if (PromotedInsns.count(I))
+        continue;
+      uint64_t Sum;
+      auto CalleeSamples = findIndirectCallFunctionSamples(*I, Sum);
+      uint64_t SumOrigin = Sum;
+      for (const auto *FS : CalleeSamples) {
+        // TODO: Consider disable pre-lTO ICP for MonoLTO as well
+        if (LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink) {
+          FS->findInlinedFunctions(InlinedGUIDs, F.getParent(),
+                                   PSI->getOrCompHotCountThreshold());
+          continue;
+        }
+        uint64_t EntryCountDistributed = FS->getEntrySamples();
+        // In addition to regular inline cost check, we also need to make sure
+        // ICP isn't introducing excessive speculative checks even if individual
+        // target looks beneficial to promote and inline. That means we should
+        // only do ICP when there's a small number dominant targets.
+        if (EntryCountDistributed < SumOrigin / ProfileICPThreshold)
+          break;
+        // TODO: Fix CallAnalyzer to handle all indirect calls.
+        // For indirect call, we don't run CallAnalyzer to get InlineCost
+        // before actual inlining. This is because we could see two different
+        // types from the same definition, which makes CallAnalyzer choke as
+        // it's expecting matching parameter type on both caller and callee
+        // side. See example from PR18962 for the triggering cases (the bug was
+        // fixed, but we generate different types).
+        if (!PSI->isHotCount(EntryCountDistributed))
+          break;
+        const char *Reason = nullptr;
+        auto CalleeFunctionName = FS->getFuncName();
+        if (CallBase *DI = tryPromoteIndirectCall(
+                F, CalleeFunctionName, Sum, EntryCountDistributed, I, Reason)) {
+          // Attach function profile for promoted indirect callee, and update
+          // call site count for the promoted inline candidate too.
+          Candidate = {DI, FS, EntryCountDistributed};
+          PromotedInsns.insert(I);
+          SmallVector<CallBase *, 8> InlinedCallSites;
+          // If profile mismatches, we should not attempt to inline DI.
+          if ((isa<CallInst>(DI) || isa<InvokeInst>(DI)) &&
+              tryInlineCandidate(Candidate, InlinedCallSites)) {
+            for (auto *CB : InlinedCallSites) {
+              if (getInlineCandidate(&NewCandidate, CB))
+                CQueue.emplace(NewCandidate);
+            }
+            Changed = true;
+          }
+        } else {
+          LLVM_DEBUG(dbgs()
+                     << "\nFailed to promote indirect call to "
+                     << CalleeFunctionName << " because " << Reason << "\n");
+        }
+      }
+    } else if (CalledFunction && CalledFunction->getSubprogram() &&
+               !CalledFunction->isDeclaration()) {
+      SmallVector<CallBase *, 8> InlinedCallSites;
+      if (tryInlineCandidate(Candidate, InlinedCallSites)) {
+        for (auto *CB : InlinedCallSites) {
+          if (getInlineCandidate(&NewCandidate, CB))
+            CQueue.emplace(NewCandidate);
+        }
+        Changed = true;
+      }
+    } else if (LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink) {
+      findCalleeFunctionSamples(*I)->findInlinedFunctions(
+          InlinedGUIDs, F.getParent(), PSI->getOrCompHotCountThreshold());
+    }
+  }
+
+  if (!CQueue.empty()) {
+    if (SizeLimit == (unsigned)ProfileInlineLimitMax)
+      ++NumCSInlinedHitMaxLimit;
+    else if (SizeLimit == (unsigned)ProfileInlineLimitMin)
+      ++NumCSInlinedHitMinLimit;
+    else
+      ++NumCSInlinedHitGrowthLimit;
+  }
+
+  return Changed;
+}
+
 /// Find equivalence classes for the given block.
 ///
 /// This finds all the blocks that are guaranteed to execute the same
@@ -1833,7 +2195,10 @@ bool SampleProfileLoader::emitAnnotations(Function &F) {
   }
 
   DenseSet<GlobalValue::GUID> InlinedGUIDs;
-  Changed |= inlineHotFunctions(F, InlinedGUIDs);
+  if (ProfileIsCS && CallsitePrioritizedInline)
+    Changed |= inlineHotFunctionsWithPriority(F, InlinedGUIDs);
+  else
+    Changed |= inlineHotFunctions(F, InlinedGUIDs);
 
   // Compute basic block weights.
   Changed |= computeBlockWeights(F);
@@ -1978,6 +2343,12 @@ bool SampleProfileLoader::doInitialization(Module &M,
     ProfileIsCS = true;
     FunctionSamples::ProfileIsCS = true;
 
+    // Enable priority-base inliner and size inline by default for CSSPGO.
+    if (!ProfileSizeInline.getNumOccurrences())
+      ProfileSizeInline = true;
+    if (!CallsitePrioritizedInline.getNumOccurrences())
+      CallsitePrioritizedInline = true;
+
     // Tracker for profiles under different context
     ContextTracker =
         std::make_unique<SampleContextTracker>(Reader->getProfiles());
diff --git a/llvm/test/Transforms/SampleProfile/Inputs/indirect-call-csspgo.prof b/llvm/test/Transforms/SampleProfile/Inputs/indirect-call-csspgo.prof
new file mode 100644
index 000000000000..095c7a1fc480
--- /dev/null
+++ b/llvm/test/Transforms/SampleProfile/Inputs/indirect-call-csspgo.prof
@@ -0,0 +1,10 @@
+[test]:63067:0
+ 1: 3345 _Z3barv:1398 _Z3foov:2059
+ 2: 100 _Z3bazv:102
+ 3: 100 _Z3zoov:102
+[test:1 @ _Z3barv]:200:100
+ 1: 100
+[test:1 @ _Z3foov]:4220:1200
+ 14: 4220
+[test:2 @ _Z3bazv]:200:100
+ 5: 100
\ No newline at end of file
diff --git a/llvm/test/Transforms/SampleProfile/csspgo-inline-debug.ll b/llvm/test/Transforms/SampleProfile/csspgo-inline-debug.ll
new file mode 100644
index 000000000000..e5f2f7571eaf
--- /dev/null
+++ b/llvm/test/Transforms/SampleProfile/csspgo-inline-debug.ll
@@ -0,0 +1,166 @@
+; REQUIRES: asserts
+; Test that the new FDO inliner using prioty queue will not visit same call site again and again.
+; Use debug prints as repeated call site evaluation is not visible from final inline decision.
+
+; Note that we need new pass manager to enable top-down processing for sample profile loader
+; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/profile-context-tracker.prof -sample-profile-inline-size -sample-profile-prioritized-inline=0 -debug-only=sample-context-tracker -o /dev/null 2>&1 | FileCheck %s --check-prefix=OLD-INLINE
+; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/profile-context-tracker.prof -sample-profile-inline-size -sample-profile-prioritized-inline=1 -debug-only=sample-context-tracker -o /dev/null 2>&1 | FileCheck %s --check-prefix=NEW-INLINE
+
+; Old inliner will evaluate the same call site three times
+; OLD-INLINE:      Getting callee context for instr:   %call = tail call i32 @_Z5funcBi
+; OLD-INLINE-NEXT:   Callee context found: main:3.1 @ _Z5funcBi
+; OLD-INLINE:      Getting callee context for instr:   %call = tail call i32 @_Z5funcBi
+; OLD-INLINE-NEXT:   Callee context found: main:3.1 @ _Z5funcBi
+; OLD-INLINE:      Getting callee context for instr:   %call = tail call i32 @_Z5funcBi
+; OLD-INLINE-NEXT:   Callee context found: main:3.1 @ _Z5funcBi
+
+; New inliner only evaluate the same call site once
+; NEW-INLINE:      Getting callee context for instr:   %call = tail call i32 @_Z5funcBi
+; NEW-INLINE-NEXT:   Callee context found: main:3.1 @ _Z5funcBi
+; NEW-INLINE-NOT:  Getting callee context for instr:   %call = tail call i32 @_Z5funcBi
+; NEW-INLINE-NOT:    Callee context found: main:3.1 @ _Z5funcBi
+
+@factor = dso_local global i32 3, align 4, !dbg !0
+
+define dso_local i32 @main() local_unnamed_addr #0 !dbg !18 {
+entry:
+  br label %for.body, !dbg !25
+
+for.cond.cleanup:                                 ; preds = %for.body
+  ret i32 %add3, !dbg !27
+
+for.body:                                         ; preds = %for.body, %entry
+  %x.011 = phi i32 [ 300000, %entry ], [ %dec, %for.body ]
+  %r.010 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
+  %call = tail call i32 @_Z5funcBi(i32 %x.011), !dbg !32
+  %add = add nuw nsw i32 %x.011, 1, !dbg !31
+  %call1 = tail call i32 @_Z5funcAi(i32 %add), !dbg !28
+  %add2 = add i32 %call, %r.010, !dbg !34
+  %add3 = add i32 %add2, %call1, !dbg !35
+  %dec = add nsw i32 %x.011, -1, !dbg !36
+  %cmp = icmp eq i32 %x.011, 0, !dbg !38
+  br i1 %cmp, label %for.cond.cleanup, label %for.body, !dbg !25
+}
+
+define dso_local i32 @_Z5funcAi(i32 %x) local_unnamed_addr #1 !dbg !40 {
+entry:
+  %add = add nsw i32 %x, 100000, !dbg !44
+  %call = tail call i32 @_Z8funcLeafi(i32 %add), !dbg !45
+  ret i32 %call, !dbg !46
+}
+
+define dso_local i32 @_Z8funcLeafi(i32 %x) local_unnamed_addr #1 !dbg !54 {
+entry:
+  %cmp = icmp sgt i32 %x, 0, !dbg !57
+  br i1 %cmp, label %while.body, label %while.cond2.preheader, !dbg !59
+
+while.cond2.preheader:                            ; preds = %entry
+  %cmp313 = icmp slt i32 %x, 0, !dbg !60
+  br i1 %cmp313, label %while.body4, label %if.end, !dbg !63
+
+while.body:                                       ; preds = %while.body, %entry
+  %x.addr.016 = phi i32 [ %sub, %while.body ], [ %x, %entry ]
+  %tmp = load volatile i32, i32* @factor, align 4, !dbg !64
+  %call = tail call i32 @_Z3fibi(i32 %tmp), !dbg !67
+  %sub = sub nsw i32 %x.addr.016, %call, !dbg !68
+  %cmp1 = icmp sgt i32 %sub, 0, !dbg !69
+  br i1 %cmp1, label %while.body, label %if.end, !dbg !71
+
+while.body4:                                      ; preds = %while.body4, %while.cond2.preheader
+  %x.addr.114 = phi i32 [ %add, %while.body4 ], [ %x, %while.cond2.preheader ]
+  %tmp1 = load volatile i32, i32* @factor, align 4, !dbg !72
+  %call5 = tail call i32 @_Z3fibi(i32 %tmp1), !dbg !74
+  %add = add nsw i32 %call5, %x.addr.114, !dbg !75
+  %cmp3 = icmp slt i32 %add, 0, !dbg !60
+  br i1 %cmp3, label %while.body4, label %if.end, !dbg !63
+
+if.end:                                           ; preds = %while.body4, %while.body, %while.cond2.preheader
+  %x.addr.2 = phi i32 [ 0, %while.cond2.preheader ], [ %sub, %while.body ], [ %add, %while.body4 ]
+  ret i32 %x.addr.2, !dbg !76
+}
+
+define dso_local i32 @_Z5funcBi(i32 %x) local_unnamed_addr #0 !dbg !47 {
+entry:
+  %sub = add nsw i32 %x, -100000, !dbg !51
+  %call = tail call i32 @_Z8funcLeafi(i32 %sub), !dbg !52
+  ret i32 %call, !dbg !53
+}
+
+declare i32 @_Z3fibi(i32)
+
+attributes #0 = { nofree noinline norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" "use-sample-profile" }
+attributes #1 = { nofree norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" "use-sample-profile" }
+
+!llvm.dbg.cu = !{!2}
+!llvm.module.flags = !{!14, !15, !16}
+!llvm.ident = !{!17}
+
+!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression())
+!1 = distinct !DIGlobalVariable(name: "factor", scope: !2, file: !3, line: 21, type: !13, isLocal: false, isDefinition: true)
+!2 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !3, producer: "clang version 11.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !4, retainedTypes: !5, globals: !12, splitDebugInlining: false, debugInfoForProfiling: true, nameTableKind: None)
+!3 = !DIFile(filename: "merged.cpp", directory: "/local/autofdo")
+!4 = !{}
+!5 = !{!6, !10, !11}
+!6 = !DISubprogram(name: "funcA", linkageName: "_Z5funcAi", scope: !3, file: !3, line: 6, type: !7, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized, retainedNodes: !4)
+!7 = !DISubroutineType(types: !8)
+!8 = !{!9, !9}
+!9 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!10 = !DISubprogram(name: "funcB", linkageName: "_Z5funcBi", scope: !3, file: !3, line: 7, type: !7, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized, retainedNodes: !4)
+!11 = !DISubprogram(name: "funcLeaf", linkageName: "_Z8funcLeafi", scope: !3, file: !3, line: 22, type: !7, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized, retainedNodes: !4)
+!12 = !{!0}
+!13 = !DIDerivedType(tag: DW_TAG_volatile_type, baseType: !9)
+!14 = !{i32 7, !"Dwarf Version", i32 4}
+!15 = !{i32 2, !"Debug Info Version", i32 3}
+!16 = !{i32 1, !"wchar_size", i32 4}
+!17 = !{!"clang version 11.0.0"}
+!18 = distinct !DISubprogram(name: "main", scope: !3, file: !3, line: 11, type: !19, scopeLine: 11, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2, retainedNodes: !21)
+!19 = !DISubroutineType(types: !20)
+!20 = !{!9}
+!21 = !{!22, !23}
+!22 = !DILocalVariable(name: "r", scope: !18, file: !3, line: 12, type: !9)
+!23 = !DILocalVariable(name: "x", scope: !24, file: !3, line: 13, type: !9)
+!24 = distinct !DILexicalBlock(scope: !18, file: !3, line: 13, column: 3)
+!25 = !DILocation(line: 13, column: 3, scope: !26)
+!26 = !DILexicalBlockFile(scope: !24, file: !3, discriminator: 2)
+!27 = !DILocation(line: 17, column: 3, scope: !18)
+!28 = !DILocation(line: 14, column: 10, scope: !29)
+!29 = distinct !DILexicalBlock(scope: !30, file: !3, line: 13, column: 37)
+!30 = distinct !DILexicalBlock(scope: !24, file: !3, line: 13, column: 3)
+!31 = !DILocation(line: 14, column: 29, scope: !29)
+!32 = !DILocation(line: 14, column: 21, scope: !33)
+!33 = !DILexicalBlockFile(scope: !29, file: !3, discriminator: 2)
+!34 = !DILocation(line: 14, column: 19, scope: !29)
+!35 = !DILocation(line: 14, column: 7, scope: !29)
+!36 = !DILocation(line: 13, column: 33, scope: !37)
+!37 = !DILexicalBlockFile(scope: !30, file: !3, discriminator: 6)
+!38 = !DILocation(line: 13, column: 26, scope: !39)
+!39 = !DILexicalBlockFile(scope: !30, file: !3, discriminator: 2)
+!40 = distinct !DISubprogram(name: "funcA", linkageName: "_Z5funcAi", scope: !3, file: !3, line: 26, type: !7, scopeLine: 26, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
+!44 = !DILocation(line: 27, column: 22, scope: !40)
+!45 = !DILocation(line: 27, column: 11, scope: !40)
+!46 = !DILocation(line: 29, column: 3, scope: !40)
+!47 = distinct !DISubprogram(name: "funcB", linkageName: "_Z5funcBi", scope: !3, file: !3, line: 32, type: !7, scopeLine: 32, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
+!51 = !DILocation(line: 33, column: 22, scope: !47)
+!52 = !DILocation(line: 33, column: 11, scope: !47)
+!53 = !DILocation(line: 35, column: 3, scope: !47)
+!54 = distinct !DISubprogram(name: "funcLeaf", linkageName: "_Z8funcLeafi", scope: !3, file: !3, line: 48, type: !7, scopeLine: 48, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
+!57 = !DILocation(line: 49, column: 9, scope: !58)
+!58 = distinct !DILexicalBlock(scope: !54, file: !3, line: 49, column: 7)
+!59 = !DILocation(line: 49, column: 7, scope: !54)
+!60 = !DILocation(line: 58, column: 14, scope: !61)
+!61 = !DILexicalBlockFile(scope: !62, file: !3, discriminator: 2)
+!62 = distinct !DILexicalBlock(scope: !58, file: !3, line: 56, column: 8)
+!63 = !DILocation(line: 58, column: 5, scope: !61)
+!64 = !DILocation(line: 52, column: 16, scope: !65)
+!65 = distinct !DILexicalBlock(scope: !66, file: !3, line: 51, column: 19)
+!66 = distinct !DILexicalBlock(scope: !58, file: !3, line: 49, column: 14)
+!67 = !DILocation(line: 52, column: 12, scope: !65)
+!68 = !DILocation(line: 52, column: 9, scope: !65)
+!69 = !DILocation(line: 51, column: 14, scope: !70)
+!70 = !DILexicalBlockFile(scope: !66, file: !3, discriminator: 2)
+!71 = !DILocation(line: 51, column: 5, scope: !70)
+!72 = !DILocation(line: 59, column: 16, scope: !73)
+!73 = distinct !DILexicalBlock(scope: !62, file: !3, line: 58, column: 19)
+!74 = !DILocation(line: 59, column: 12, scope: !73)
+!75 = !DILocation(line: 59, column: 9, scope: !73)
+!76 = !DILocation(line: 63, column: 3, scope: !54)
diff --git a/llvm/test/Transforms/SampleProfile/csspgo-inline-icall.ll b/llvm/test/Transforms/SampleProfile/csspgo-inline-icall.ll
new file mode 100644
index 000000000000..3ec64326da2d
--- /dev/null
+++ b/llvm/test/Transforms/SampleProfile/csspgo-inline-icall.ll
@@ -0,0 +1,63 @@
+; RUN: opt < %s -sample-profile -sample-profile-file=%S/Inputs/indirect-call-csspgo.prof -sample-profile-icp-threshold=100 -pass-remarks=sample-profile -S -o /dev/null 2>&1 | FileCheck -check-prefix=ICP-ALL %s
+; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/indirect-call-csspgo.prof -sample-profile-icp-threshold=100 -pass-remarks=sample-profile -S -o /dev/null 2>&1 | FileCheck -check-prefix=ICP-ALL %s
+; RUN: opt < %s -sample-profile -sample-profile-file=%S/Inputs/indirect-call-csspgo.prof -sample-profile-icp-threshold=100 -pass-remarks=sample-profile -sample-profile-inline-size=0 -S -o /dev/null 2>&1 | FileCheck -check-prefix=ICP-HOT %s
+; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/indirect-call-csspgo.prof -sample-profile-icp-threshold=100 -pass-remarks=sample-profile -sample-profile-inline-size=0 -S -o /dev/null 2>&1 | FileCheck -check-prefix=ICP-HOT %s
+
+define void @test(void ()*) #0 !dbg !3 {
+;; Add two direct call to force top-down order for sample profile loader
+  call void @_Z3foov(), !dbg !7
+  call void @_Z3barv(), !dbg !7
+  call void @_Z3bazv(), !dbg !7
+  %2 = alloca void ()*
+  store void ()* %0, void ()** %2
+  %3 = load void ()*, void ()** %2
+  call void %3(), !dbg !4
+  %4 = alloca void ()*
+  store void ()* %0, void ()** %4
+  %5 = load void ()*, void ()** %4
+  call void %5(), !dbg !5
+  ret void
+}
+
+define void @_Z3foov() #0 !dbg !8 {
+  ret void
+}
+
+define void @_Z3barv() #0 !dbg !9 {
+  ret void
+}
+
+define void @_Z3bazv() #0 !dbg !10 {
+  ret void
+}
+
+define void @_Z3zoov() #0 !dbg !11 {
+  ret void
+}
+
+attributes #0 = {"use-sample-profile"}
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1)
+!1 = !DIFile(filename: "test.cc", directory: "/")
+!2 = !{i32 2, !"Debug Info Version", i32 3}
+!3 = distinct !DISubprogram(name: "test", scope: !1, file: !1, line: 3, unit: !0)
+!4 = !DILocation(line: 4, scope: !3)
+!5 = !DILocation(line: 5, scope: !3)
+!6 = !DILocation(line: 6, scope: !3)
+!7 = !DILocation(line: 7, scope: !3)
+!8 = distinct !DISubprogram(name: "foo", linkageName: "_Z3foov", scope: !1, file: !1, line: 29, unit: !0)
+!9 = distinct !DISubprogram(name: "bar", linkageName: "_Z3barv", scope: !1, file: !1, line: 32, unit: !0)
+!10 = distinct !DISubprogram(name: "baz", linkageName: "_Z3bazv", scope: !1, file: !1, line: 24, unit: !0)
+!11 = distinct !DISubprogram(name: "zoo", linkageName: "_Z3zoov", scope: !1, file: !1, line: 24, unit: !0)
+
+
+; ICP-ALL: remark: test.cc:5:0: _Z3bazv inlined into test
+; ICP-ALL-NEXT: remark: test.cc:4:0: _Z3foov inlined into test
+; ICP-ALL-NEXT: remark: test.cc:4:0: _Z3barv inlined into test
+; ICP-ALL-NOT: remark
+
+; ICP-HOT: remark: test.cc:4:0: _Z3foov inlined into test
+; ICP-HOT-NOT: remark
diff --git a/llvm/test/Transforms/SampleProfile/csspgo-inline.ll b/llvm/test/Transforms/SampleProfile/csspgo-inline.ll
new file mode 100644
index 000000000000..14e916d8c2e8
--- /dev/null
+++ b/llvm/test/Transforms/SampleProfile/csspgo-inline.ll
@@ -0,0 +1,180 @@
+; Test for CSSPGO's new early inliner using priority queue
+
+; Note that we need new pass manager to enable top-down processing for sample profile loader
+; Test we inlined the following in top-down order with old inliner
+;   main:3 @ _Z5funcAi
+;   main:3 @ _Z5funcAi:1 @ _Z8funcLeafi
+;   _Z5funcBi:1 @ _Z8funcLeafi
+; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/profile-context-tracker.prof -sample-profile-inline-size -sample-profile-prioritized-inline=0 -profile-sample-accurate -S -pass-remarks=inline -o /dev/null 2>&1 | FileCheck %s --check-prefix=INLINE-BASE
+;
+; With new FDO early inliner, callee entry count is used to drive inlining instead of callee total samples, so we get less inlining for given profile
+; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/profile-context-tracker.prof -sample-profile-inline-size -profile-sample-accurate -S -pass-remarks=inline -o /dev/null 2>&1 | FileCheck %s --check-prefix=INLINE-NEW
+;
+; With new FDO early inliner, callee entry count is used to drive inlining instead of callee total samples, tuning hot cutoff can get us the same inlining
+; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/profile-context-tracker.prof -sample-profile-inline-size -profile-summary-cutoff-hot=999900 -profile-sample-accurate -S -pass-remarks=inline -o /dev/null 2>&1 | FileCheck %s --check-prefix=INLINE-BASE
+;
+; With new FDO early inliner, callee entry count is used to drive inlining instead of callee total samples, tuning cold sample profile inline threshold can get us the same inlining
+; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/profile-context-tracker.prof -sample-profile-inline-size -sample-profile-cold-inline-threshold=200 -profile-sample-accurate -S -pass-remarks=inline -o /dev/null 2>&1 | FileCheck %s --check-prefix=INLINE-BASE
+;
+; With new FDO early inliner and tuned cutoff, we can control inlining through size growth tuning knob.
+; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/profile-context-tracker.prof -sample-profile-inline-size -profile-summary-cutoff-hot=999900 -sample-profile-inline-limit-min=0 -sample-profile-inline-growth-limit=1 -profile-sample-accurate -S -pass-remarks=inline -o /dev/null 2>&1 | FileCheck %s --allow-empty --check-prefix=INLINE-NEW-LIMIT1
+; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/profile-context-tracker.prof -sample-profile-inline-size -profile-summary-cutoff-hot=999900 -sample-profile-inline-limit-min=10 -sample-profile-inline-growth-limit=1 -profile-sample-accurate -S -pass-remarks=inline -o /dev/null 2>&1 | FileCheck %s --check-prefix=INLINE-NEW-LIMIT2
+
+
+; INLINE-BASE: remark: merged.cpp:14:10: _Z5funcAi inlined into main to match profiling context with (cost={{[0-9]+}}, threshold={{[0-9]+}}) at callsite main:3:10
+; INLINE-BASE: remark: merged.cpp:27:11: _Z8funcLeafi inlined into main to match profiling context with (cost={{[0-9]+}}, threshold={{[0-9]+}}) at callsite _Z5funcAi:1:11 @ main:3:10
+; INLINE-BASE: remark: merged.cpp:33:11: _Z8funcLeafi inlined into _Z5funcBi to match profiling context with (cost={{[0-9]+}}, threshold={{[0-9]+}}) at callsite _Z5funcBi:1:11
+
+; INLINE-NEW: remark: merged.cpp:14:10: _Z5funcAi inlined into main to match profiling context with (cost={{[0-9]+}}, threshold={{[0-9]+}}) at callsite main:3:10
+; INLINE-NEW-NOT: remark
+
+; INLINE-NEW-LIMIT1-NOT: remark
+
+; INLINE-NEW-LIMIT2: remark: merged.cpp:27:11: _Z8funcLeafi inlined into _Z5funcAi to match profiling context with (cost={{[0-9]+}}, threshold={{[0-9]+}}) at callsite _Z5funcAi:1:11
+; INLINE-NEW-LIMIT2: remark: merged.cpp:33:11: _Z8funcLeafi inlined into _Z5funcBi to match profiling context with (cost={{[0-9]+}}, threshold={{[0-9]+}}) at callsite _Z5funcBi:1:11
+; INLINE-NEW-LIMIT2-NOT: remark
+
+@factor = dso_local global i32 3, align 4, !dbg !0
+
+define dso_local i32 @main() local_unnamed_addr #0 !dbg !18 {
+entry:
+  br label %for.body, !dbg !25
+
+for.cond.cleanup:                                 ; preds = %for.body
+  ret i32 %add3, !dbg !27
+
+for.body:                                         ; preds = %for.body, %entry
+  %x.011 = phi i32 [ 300000, %entry ], [ %dec, %for.body ]
+  %r.010 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
+  %call = tail call i32 @_Z5funcBi(i32 %x.011), !dbg !32
+  %add = add nuw nsw i32 %x.011, 1, !dbg !31
+  %call1 = tail call i32 @_Z5funcAi(i32 %add), !dbg !28
+  %add2 = add i32 %call, %r.010, !dbg !34
+  %add3 = add i32 %add2, %call1, !dbg !35
+  %dec = add nsw i32 %x.011, -1, !dbg !36
+  %cmp = icmp eq i32 %x.011, 0, !dbg !38
+  br i1 %cmp, label %for.cond.cleanup, label %for.body, !dbg !25
+}
+
+define dso_local i32 @_Z5funcAi(i32 %x) local_unnamed_addr #1 !dbg !40 {
+entry:
+  %add = add nsw i32 %x, 100000, !dbg !44
+  %call = tail call i32 @_Z8funcLeafi(i32 %add), !dbg !45
+  ret i32 %call, !dbg !46
+}
+
+define dso_local i32 @_Z8funcLeafi(i32 %x) local_unnamed_addr #1 !dbg !54 {
+entry:
+  %cmp = icmp sgt i32 %x, 0, !dbg !57
+  br i1 %cmp, label %while.body, label %while.cond2.preheader, !dbg !59
+
+while.cond2.preheader:                            ; preds = %entry
+  %cmp313 = icmp slt i32 %x, 0, !dbg !60
+  br i1 %cmp313, label %while.body4, label %if.end, !dbg !63
+
+while.body:                                       ; preds = %while.body, %entry
+  %x.addr.016 = phi i32 [ %sub, %while.body ], [ %x, %entry ]
+  %tmp = load volatile i32, i32* @factor, align 4, !dbg !64
+  %call = tail call i32 @_Z3fibi(i32 %tmp), !dbg !67
+  %sub = sub nsw i32 %x.addr.016, %call, !dbg !68
+  %cmp1 = icmp sgt i32 %sub, 0, !dbg !69
+  br i1 %cmp1, label %while.body, label %if.end, !dbg !71
+
+while.body4:                                      ; preds = %while.body4, %while.cond2.preheader
+  %x.addr.114 = phi i32 [ %add, %while.body4 ], [ %x, %while.cond2.preheader ]
+  %tmp1 = load volatile i32, i32* @factor, align 4, !dbg !72
+  %call5 = tail call i32 @_Z3fibi(i32 %tmp1), !dbg !74
+  %add = add nsw i32 %call5, %x.addr.114, !dbg !75
+  %cmp3 = icmp slt i32 %add, 0, !dbg !60
+  br i1 %cmp3, label %while.body4, label %if.end, !dbg !63
+
+if.end:                                           ; preds = %while.body4, %while.body, %while.cond2.preheader
+  %x.addr.2 = phi i32 [ 0, %while.cond2.preheader ], [ %sub, %while.body ], [ %add, %while.body4 ]
+  ret i32 %x.addr.2, !dbg !76
+}
+
+define dso_local i32 @_Z5funcBi(i32 %x) local_unnamed_addr #0 !dbg !47 {
+entry:
+  %sub = add nsw i32 %x, -100000, !dbg !51
+  %call = tail call i32 @_Z8funcLeafi(i32 %sub), !dbg !52
+  ret i32 %call, !dbg !53
+}
+
+declare i32 @_Z3fibi(i32)
+
+attributes #0 = { nofree noinline norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" "use-sample-profile" }
+attributes #1 = { nofree norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" "use-sample-profile" }
+
+!llvm.dbg.cu = !{!2}
+!llvm.module.flags = !{!14, !15, !16}
+!llvm.ident = !{!17}
+
+!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression())
+!1 = distinct !DIGlobalVariable(name: "factor", scope: !2, file: !3, line: 21, type: !13, isLocal: false, isDefinition: true)
+!2 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !3, producer: "clang version 11.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !4, retainedTypes: !5, globals: !12, splitDebugInlining: false, debugInfoForProfiling: true, nameTableKind: None)
+!3 = !DIFile(filename: "merged.cpp", directory: "/local/autofdo")
+!4 = !{}
+!5 = !{!6, !10, !11}
+!6 = !DISubprogram(name: "funcA", linkageName: "_Z5funcAi", scope: !3, file: !3, line: 6, type: !7, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized, retainedNodes: !4)
+!7 = !DISubroutineType(types: !8)
+!8 = !{!9, !9}
+!9 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!10 = !DISubprogram(name: "funcB", linkageName: "_Z5funcBi", scope: !3, file: !3, line: 7, type: !7, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized, retainedNodes: !4)
+!11 = !DISubprogram(name: "funcLeaf", linkageName: "_Z8funcLeafi", scope: !3, file: !3, line: 22, type: !7, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized, retainedNodes: !4)
+!12 = !{!0}
+!13 = !DIDerivedType(tag: DW_TAG_volatile_type, baseType: !9)
+!14 = !{i32 7, !"Dwarf Version", i32 4}
+!15 = !{i32 2, !"Debug Info Version", i32 3}
+!16 = !{i32 1, !"wchar_size", i32 4}
+!17 = !{!"clang version 11.0.0"}
+!18 = distinct !DISubprogram(name: "main", scope: !3, file: !3, line: 11, type: !19, scopeLine: 11, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2, retainedNodes: !21)
+!19 = !DISubroutineType(types: !20)
+!20 = !{!9}
+!21 = !{!22, !23}
+!22 = !DILocalVariable(name: "r", scope: !18, file: !3, line: 12, type: !9)
+!23 = !DILocalVariable(name: "x", scope: !24, file: !3, line: 13, type: !9)
+!24 = distinct !DILexicalBlock(scope: !18, file: !3, line: 13, column: 3)
+!25 = !DILocation(line: 13, column: 3, scope: !26)
+!26 = !DILexicalBlockFile(scope: !24, file: !3, discriminator: 2)
+!27 = !DILocation(line: 17, column: 3, scope: !18)
+!28 = !DILocation(line: 14, column: 10, scope: !29)
+!29 = distinct !DILexicalBlock(scope: !30, file: !3, line: 13, column: 37)
+!30 = distinct !DILexicalBlock(scope: !24, file: !3, line: 13, column: 3)
+!31 = !DILocation(line: 14, column: 29, scope: !29)
+!32 = !DILocation(line: 14, column: 21, scope: !33)
+!33 = !DILexicalBlockFile(scope: !29, file: !3, discriminator: 2)
+!34 = !DILocation(line: 14, column: 19, scope: !29)
+!35 = !DILocation(line: 14, column: 7, scope: !29)
+!36 = !DILocation(line: 13, column: 33, scope: !37)
+!37 = !DILexicalBlockFile(scope: !30, file: !3, discriminator: 6)
+!38 = !DILocation(line: 13, column: 26, scope: !39)
+!39 = !DILexicalBlockFile(scope: !30, file: !3, discriminator: 2)
+!40 = distinct !DISubprogram(name: "funcA", linkageName: "_Z5funcAi", scope: !3, file: !3, line: 26, type: !7, scopeLine: 26, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
+!44 = !DILocation(line: 27, column: 22, scope: !40)
+!45 = !DILocation(line: 27, column: 11, scope: !40)
+!46 = !DILocation(line: 29, column: 3, scope: !40)
+!47 = distinct !DISubprogram(name: "funcB", linkageName: "_Z5funcBi", scope: !3, file: !3, line: 32, type: !7, scopeLine: 32, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
+!51 = !DILocation(line: 33, column: 22, scope: !47)
+!52 = !DILocation(line: 33, column: 11, scope: !47)
+!53 = !DILocation(line: 35, column: 3, scope: !47)
+!54 = distinct !DISubprogram(name: "funcLeaf", linkageName: "_Z8funcLeafi", scope: !3, file: !3, line: 48, type: !7, scopeLine: 48, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
+!57 = !DILocation(line: 49, column: 9, scope: !58)
+!58 = distinct !DILexicalBlock(scope: !54, file: !3, line: 49, column: 7)
+!59 = !DILocation(line: 49, column: 7, scope: !54)
+!60 = !DILocation(line: 58, column: 14, scope: !61)
+!61 = !DILexicalBlockFile(scope: !62, file: !3, discriminator: 2)
+!62 = distinct !DILexicalBlock(scope: !58, file: !3, line: 56, column: 8)
+!63 = !DILocation(line: 58, column: 5, scope: !61)
+!64 = !DILocation(line: 52, column: 16, scope: !65)
+!65 = distinct !DILexicalBlock(scope: !66, file: !3, line: 51, column: 19)
+!66 = distinct !DILexicalBlock(scope: !58, file: !3, line: 49, column: 14)
+!67 = !DILocation(line: 52, column: 12, scope: !65)
+!68 = !DILocation(line: 52, column: 9, scope: !65)
+!69 = !DILocation(line: 51, column: 14, scope: !70)
+!70 = !DILexicalBlockFile(scope: !66, file: !3, discriminator: 2)
+!71 = !DILocation(line: 51, column: 5, scope: !70)
+!72 = !DILocation(line: 59, column: 16, scope: !73)
+!73 = distinct !DILexicalBlock(scope: !62, file: !3, line: 58, column: 19)
+!74 = !DILocation(line: 59, column: 12, scope: !73)
+!75 = !DILocation(line: 59, column: 9, scope: !73)
+!76 = !DILocation(line: 63, column: 3, scope: !54)
diff --git a/llvm/test/Transforms/SampleProfile/profile-context-tracker-debug.ll b/llvm/test/Transforms/SampleProfile/profile-context-tracker-debug.ll
index 1a7a53457a5d..7789e18b394a 100644
--- a/llvm/test/Transforms/SampleProfile/profile-context-tracker-debug.ll
+++ b/llvm/test/Transforms/SampleProfile/profile-context-tracker-debug.ll
@@ -3,11 +3,11 @@
 ; based on inline decision, so post inline counts are accurate.
 
 ; Note that we need new pass manager to enable top-down processing for sample profile loader
-; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/profile-context-tracker.prof -sample-profile-inline-size -debug-only=sample-context-tracker -o /dev/null 2>&1 | FileCheck %s --check-prefix=INLINE-ALL
-; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/profile-context-tracker.prof -debug-only=sample-context-tracker -o /dev/null 2>&1 | FileCheck %s --check-prefix=INLINE-HOT
+; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/profile-context-tracker.prof -sample-profile-inline-size -sample-profile-cold-inline-threshold=200 -debug-only=sample-context-tracker -o /dev/null 2>&1 | FileCheck %s --check-prefix=INLINE-ALL
+; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/profile-context-tracker.prof -sample-profile-prioritized-inline=0 -sample-profile-inline-size=0 -debug-only=sample-context-tracker -o /dev/null 2>&1 | FileCheck %s --check-prefix=INLINE-HOT
 
 
-; Testwe we inlined the following in top-down order and promot rest not inlined context profile into base profile
+; Test we inlined the following in top-down order and promot rest not inlined context profile into base profile
 ;   main:3 @ _Z5funcAi
 ;   main:3 @ _Z5funcAi:1 @ _Z8funcLeafi
 ;   _Z5funcBi:1 @ _Z8funcLeafi
@@ -20,13 +20,9 @@
 ; INLINE-ALL-NEXT: Getting callee context for instr:   %call1 = tail call i32 @_Z5funcAi
 ; INLINE-ALL-NEXT:   Callee context found: main:3 @ _Z5funcAi
 ; INLINE-ALL-NEXT: Marking context profile as inlined: main:3 @ _Z5funcAi
-; INLINE-ALL-NEXT: Getting callee context for instr:   %call = tail call i32 @_Z5funcBi(
-; INLINE-ALL-NEXT:   Callee context found: main:3.1 @ _Z5funcBi
 ; INLINE-ALL-NEXT: Getting callee context for instr:   %call.i = tail call i32 @_Z8funcLeafi
 ; INLINE-ALL-NEXT:   Callee context found: main:3 @ _Z5funcAi:1 @ _Z8funcLeafi
 ; INLINE-ALL-NEXT: Marking context profile as inlined: main:3 @ _Z5funcAi:1 @ _Z8funcLeafi
-; INLINE-ALL-NEXT: Getting callee context for instr:   %call = tail call i32 @_Z5funcBi
-; INLINE-ALL-NEXT:   Callee context found: main:3.1 @ _Z5funcBi
 ; INLINE-ALL-NEXT: Getting callee context for instr:   %call.i1 = tail call i32 @_Z3fibi
 ; INLINE-ALL-NEXT: Getting callee context for instr:   %call5.i = tail call i32 @_Z3fibi
 ; INLINE-ALL-NEXT: Getting base profile for function: _Z5funcAi
@@ -48,24 +44,23 @@
 ; INLINE-ALL-NEXT: Getting base profile for function: _Z8funcLeafi
 ; INLINE-ALL-NEXT:   Merging context profile into base profile: _Z8funcLeafi
 
-; Testwe we inlined the following in top-down order and promot rest not inlined context profile into base profile
-;   main:3 @ _Z5funcAi
+; Test we inlined the following in top-down order and promot rest not inlined context profile into base profile
 ;   _Z5funcAi:1 @ _Z8funcLeafi
 ;   _Z5funcBi:1 @ _Z8funcLeafi
 ; INLINE-HOT:      Getting base profile for function: main
 ; INLINE-HOT-NEXT:   Merging context profile into base profile: main
 ; INLINE-HOT-NEXT:   Found context tree root to promote: external:12 @ main
 ; INLINE-HOT-NEXT:   Context promoted and merged to: main
-; INLINE-HOT-NEXT: Getting callee context for instr:   %call = tail call i32 @_Z5funcBi(i32 %x.011), !dbg !58
+; INLINE-HOT-NEXT: Getting callee context for instr:   %call = tail call i32 @_Z5funcBi
 ; INLINE-HOT-NEXT:   Callee context found: main:3.1 @ _Z5funcBi
-; INLINE-HOT-NEXT: Getting callee context for instr:   %call1 = tail call i32 @_Z5funcAi(i32 %add), !dbg !63
+; INLINE-HOT-NEXT: Getting callee context for instr:   %call1 = tail call i32 @_Z5funcAi
 ; INLINE-HOT-NEXT:   Callee context found: main:3 @ _Z5funcAi
 ; INLINE-HOT-NEXT: Getting base profile for function: _Z5funcAi
 ; INLINE-HOT-NEXT:   Merging context profile into base profile: _Z5funcAi
 ; INLINE-HOT-NEXT:   Found context tree root to promote: main:3 @ _Z5funcAi
 ; INLINE-HOT-NEXT:   Context promoted to: _Z5funcAi
 ; INLINE-HOT-NEXT:   Context promoted to: _Z5funcAi:1 @ _Z8funcLeafi
-; INLINE-HOT-NEXT: Getting callee context for instr:   %call = tail call i32 @_Z8funcLeafi(i32 %add), !dbg !50
+; INLINE-HOT-NEXT:   Getting callee context for instr:   %call = tail call i32 @_Z8funcLeafi(i32 %add), !dbg !50
 ; INLINE-HOT-NEXT:   Callee context found: _Z5funcAi:1 @ _Z8funcLeafi
 ; INLINE-HOT-NEXT: Marking context profile as inlined: _Z5funcAi:1 @ _Z8funcLeafi
 ; INLINE-HOT-NEXT: Getting callee context for instr:   %call.i = tail call i32 @_Z3fibi(i32 %tmp.i) #2, !dbg !62
@@ -79,11 +74,11 @@
 ; INLINE-HOT-NEXT:   Context promoted to: _Z5funcBi:1 @ _Z8funcLeafi
 ; INLINE-HOT-NEXT:   Found context tree root to promote: externalA:17 @ _Z5funcBi
 ; INLINE-HOT-NEXT:   Context promoted and merged to: _Z5funcBi
-; INLINE-HOT-NEXT: Getting callee context for instr:   %call = tail call i32 @_Z8funcLeafi(i32 %sub), !dbg !50
+; INLINE-HOT-NEXT: Getting callee context for instr:   %call = tail call i32 @_Z8funcLeafi
 ; INLINE-HOT-NEXT:   Callee context found: _Z5funcBi:1 @ _Z8funcLeafi
 ; INLINE-HOT-NEXT: Marking context profile as inlined: _Z5funcBi:1 @ _Z8funcLeafi
-; INLINE-HOT-NEXT: Getting callee context for instr:   %call.i = tail call i32 @_Z3fibi(i32 %tmp.i) #2, !dbg !62
-; INLINE-HOT-NEXT: Getting callee context for instr:   %call5.i = tail call i32 @_Z3fibi(i32 %tmp1.i) #2, !dbg !69
+; INLINE-HOT-NEXT: Getting callee context for instr:   %call.i = tail call i32 @_Z3fibi
+; INLINE-HOT-NEXT: Getting callee context for instr:   %call5.i = tail call i32 @_Z3fibi
 ; INLINE-HOT-NEXT: Getting base profile for function: _Z8funcLeafi
 ; INLINE-HOT-NEXT:   Merging context profile into base profile: _Z8funcLeafi
 
diff --git a/llvm/test/Transforms/SampleProfile/profile-context-tracker.ll b/llvm/test/Transforms/SampleProfile/profile-context-tracker.ll
index adda7022047d..8d4e23829941 100644
--- a/llvm/test/Transforms/SampleProfile/profile-context-tracker.ll
+++ b/llvm/test/Transforms/SampleProfile/profile-context-tracker.ll
@@ -4,19 +4,18 @@
 ; RUN: llvm-profdata merge --sample --extbinary %S/Inputs/profile-context-tracker.prof -o %t
 
 ; Note that we need new pass manager to enable top-down processing for sample profile loader
-; Testwe we inlined the following in top-down order and entry counts accurate reflects post-inline base profile
+; Test we inlined the following in top-down order and entry counts accurate reflects post-inline base profile
 ;   main:3 @ _Z5funcAi
 ;   main:3 @ _Z5funcAi:1 @ _Z8funcLeafi
 ;   _Z5funcBi:1 @ _Z8funcLeafi
-; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/profile-context-tracker.prof -sample-profile-inline-size -profile-sample-accurate -S | FileCheck %s --check-prefix=INLINE-ALL
-; RUN: opt < %s -passes=sample-profile -sample-profile-file=%t -sample-profile-inline-size -profile-sample-accurate -S | FileCheck %s --check-prefix=INLINE-ALL
-
-; Testwe we inlined the following in top-down order and entry counts accurate reflects post-inline base profile
-;   main:3 @ _Z5funcAi
+; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/profile-context-tracker.prof -sample-profile-inline-size -sample-profile-prioritized-inline=0 -profile-sample-accurate -S | FileCheck %s --check-prefix=INLINE-ALL
+; RUN: opt < %s -passes=sample-profile -sample-profile-file=%t -sample-profile-inline-size -sample-profile-prioritized-inline=0 -profile-sample-accurate -S | FileCheck %s --check-prefix=INLINE-ALL
+; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/profile-context-tracker.prof -sample-profile-inline-size -sample-profile-cold-inline-threshold=200 -profile-sample-accurate -S | FileCheck %s --check-prefix=INLINE-ALL
+; RUN: opt < %s -passes=sample-profile -sample-profile-file=%t -sample-profile-inline-size -sample-profile-cold-inline-threshold=200 -profile-sample-accurate -S | FileCheck %s --check-prefix=INLINE-ALL
+;
+; Test we inlined the following in top-down order and entry counts accurate reflects post-inline base profile
 ;   _Z5funcAi:1 @ _Z8funcLeafi
 ;   _Z5funcBi:1 @ _Z8funcLeafi
-; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/profile-context-tracker.prof -profile-sample-accurate -S | FileCheck %s --check-prefix=INLINE-HOT
-; RUN: opt < %s -passes=sample-profile -sample-profile-file=%t -profile-sample-accurate -S | FileCheck %s --check-prefix=INLINE-HOT
 
 
 @factor = dso_local global i32 3, align 4, !dbg !0
diff --git a/llvm/test/Transforms/SampleProfile/pseudo-probe-inline.ll b/llvm/test/Transforms/SampleProfile/pseudo-probe-inline.ll
index a5033a0dc190..d47359fa0b5f 100644
--- a/llvm/test/Transforms/SampleProfile/pseudo-probe-inline.ll
+++ b/llvm/test/Transforms/SampleProfile/pseudo-probe-inline.ll
@@ -1,8 +1,8 @@
-; RUN: opt < %s -passes=pseudo-probe,sample-profile -sample-profile-file=%S/Inputs/pseudo-probe-inline.prof -S -pass-remarks=sample-profile -pass-remarks-output=%t.opt.yaml 2>&1 | FileCheck %s
+; RUN: opt < %s -passes=pseudo-probe,sample-profile -sample-profile-file=%S/Inputs/pseudo-probe-inline.prof -S -pass-remarks=sample-profile -sample-profile-prioritized-inline=0 -pass-remarks-output=%t.opt.yaml 2>&1 | FileCheck %s
 ; RUN: FileCheck %s -check-prefix=YAML < %t.opt.yaml
 
 ; RUN: llvm-profdata merge --sample --extbinary %S/Inputs/pseudo-probe-inline.prof -o %t2
-; RUN: opt < %s -passes=pseudo-probe,sample-profile -sample-profile-file=%t2 -S -pass-remarks=sample-profile -pass-remarks-output=%t2.opt.yaml 2>&1 | FileCheck %s
+; RUN: opt < %s -passes=pseudo-probe,sample-profile -sample-profile-file=%t2 -S -pass-remarks=sample-profile -sample-profile-prioritized-inline=0 -pass-remarks-output=%t2.opt.yaml 2>&1 | FileCheck %s
 ; RUN: FileCheck %s -check-prefix=YAML < %t2.opt.yaml
 
 target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"

From c2f3f45b5c5bd6f9b86a766fc40130b34acb8293 Mon Sep 17 00:00:00 2001
From: Wenlei He <aktoon@gmail.com>
Date: Tue, 19 Jan 2021 23:29:14 -0800
Subject: [PATCH 056/318] [CSSPGO] Factor out common part for CSSPGO inline and
 AFDO inline

Refactoring SampleProfileLoader::inlineHotFunctions to use helpers from CSSPGO inlining and reduce similar code in the inlining loop, plus minor cleanup for AFDO path.

This is resubmit of D95024, with build break and overtighten assertion fixed.

Test Plan:

(cherry picked from commit 1645f465be85223e9f5b6303a3e5e0e491fd819f)
---
 llvm/lib/Transforms/IPO/SampleProfile.cpp     | 205 +++++++-----------
 .../SampleProfile/pseudo-probe-inline.ll      |   2 +-
 llvm/test/Transforms/SampleProfile/remarks.ll |   4 +-
 3 files changed, 80 insertions(+), 131 deletions(-)

diff --git a/llvm/lib/Transforms/IPO/SampleProfile.cpp b/llvm/lib/Transforms/IPO/SampleProfile.cpp
index 665c4078f3ee..2cfefd3a18ea 100644
--- a/llvm/lib/Transforms/IPO/SampleProfile.cpp
+++ b/llvm/lib/Transforms/IPO/SampleProfile.cpp
@@ -416,20 +416,18 @@ class SampleProfileLoader {
   findIndirectCallFunctionSamples(const Instruction &I, uint64_t &Sum) const;
   mutable DenseMap<const DILocation *, const FunctionSamples *> DILocation2SampleMap;
   const FunctionSamples *findFunctionSamples(const Instruction &I) const;
-  CallBase *tryPromoteIndirectCall(Function &F, StringRef CalleeName,
-                                   uint64_t &Sum, uint64_t Count, CallBase *I,
-                                   const char *&Reason);
-  bool inlineCallInstruction(CallBase &CB,
-                             const FunctionSamples *CalleeSamples);
+  // Attempt to promote indirect call and also inline the promoted call
+  bool tryPromoteAndInlineCandidate(
+      Function &F, InlineCandidate &Candidate, uint64_t &Sum,
+      DenseSet<Instruction *> &PromotedInsns,
+      SmallVector<CallBase *, 8> *InlinedCallSites = nullptr);
   bool inlineHotFunctions(Function &F,
                           DenseSet<GlobalValue::GUID> &InlinedGUIDs);
-  // Helper functions call-site prioritized BFS inliner
-  // Will change the main FDO inliner to be work list based directly in
-  // upstream, then merge this change with that and remove the duplication.
   InlineCost shouldInlineCandidate(InlineCandidate &Candidate);
   bool getInlineCandidate(InlineCandidate *NewCandidate, CallBase *CB);
-  bool tryInlineCandidate(InlineCandidate &Candidate,
-                          SmallVector<CallBase *, 8> &InlinedCallSites);
+  bool
+  tryInlineCandidate(InlineCandidate &Candidate,
+                     SmallVector<CallBase *, 8> *InlinedCallSites = nullptr);
   bool
   inlineHotFunctionsWithPriority(Function &F,
                                  DenseSet<GlobalValue::GUID> &InlinedGUIDs);
@@ -1077,70 +1075,46 @@ SampleProfileLoader::findFunctionSamples(const Instruction &Inst) const {
   return it.first->second;
 }
 
-CallBase *
-SampleProfileLoader::tryPromoteIndirectCall(Function &F, StringRef CalleeName,
-                                            uint64_t &Sum, uint64_t Count,
-                                            CallBase *I, const char *&Reason) {
-  Reason = "Callee function not available";
+/// Attempt to promote indirect call and also inline the promoted call.
+///
+/// \param F  Caller function.
+/// \param Candidate  ICP and inline candidate.
+/// \param Sum  Sum of target counts for indirect call.
+/// \param PromotedInsns  Map to keep track of indirect call already processed.
+/// \param Candidate  ICP and inline candidate.
+/// \param InlinedCallSite  Output vector for new call sites exposed after
+/// inlining.
+bool SampleProfileLoader::tryPromoteAndInlineCandidate(
+    Function &F, InlineCandidate &Candidate, uint64_t &Sum,
+    DenseSet<Instruction *> &PromotedInsns,
+    SmallVector<CallBase *, 8> *InlinedCallSite) {
+  const char *Reason = "Callee function not available";
   // R->getValue() != &F is to prevent promoting a recursive call.
   // If it is a recursive call, we do not inline it as it could bloat
   // the code exponentially. There is way to better handle this, e.g.
   // clone the caller first, and inline the cloned caller if it is
   // recursive. As llvm does not inline recursive calls, we will
   // simply ignore it instead of handling it explicitly.
-  auto R = SymbolMap.find(CalleeName);
+  auto R = SymbolMap.find(Candidate.CalleeSamples->getFuncName());
   if (R != SymbolMap.end() && R->getValue() &&
       !R->getValue()->isDeclaration() && R->getValue()->getSubprogram() &&
       R->getValue()->hasFnAttribute("use-sample-profile") &&
-      R->getValue() != &F && isLegalToPromote(*I, R->getValue(), &Reason)) {
+      R->getValue() != &F &&
+      isLegalToPromote(*Candidate.CallInstr, R->getValue(), &Reason)) {
     auto *DI =
-        &pgo::promoteIndirectCall(*I, R->getValue(), Count, Sum, false, ORE);
-    Sum -= Count;
-    return DI;
-  }
-  return nullptr;
-}
-
-bool SampleProfileLoader::inlineCallInstruction(
-    CallBase &CB, const FunctionSamples *CalleeSamples) {
-  if (ExternalInlineAdvisor) {
-    auto Advice = ExternalInlineAdvisor->getAdvice(CB);
-    if (!Advice->isInliningRecommended()) {
-      Advice->recordUnattemptedInlining();
-      return false;
+        &pgo::promoteIndirectCall(*Candidate.CallInstr, R->getValue(),
+                                  Candidate.CallsiteCount, Sum, false, ORE);
+    if (DI) {
+      Sum -= Candidate.CallsiteCount;
+      PromotedInsns.insert(Candidate.CallInstr);
+      Candidate.CallInstr = DI;
+      if (isa<CallInst>(DI) || isa<InvokeInst>(DI))
+        return tryInlineCandidate(Candidate, InlinedCallSite);
     }
-    // Dummy record, we don't use it for replay.
-    Advice->recordInlining();
-  }
-
-  Function *CalledFunction = CB.getCalledFunction();
-  assert(CalledFunction);
-  DebugLoc DLoc = CB.getDebugLoc();
-  BasicBlock *BB = CB.getParent();
-  InlineParams Params = getInlineParams();
-  Params.ComputeFullInlineCost = true;
-  // Checks if there is anything in the reachable portion of the callee at
-  // this callsite that makes this inlining potentially illegal. Need to
-  // set ComputeFullInlineCost, otherwise getInlineCost may return early
-  // when cost exceeds threshold without checking all IRs in the callee.
-  // The acutal cost does not matter because we only checks isNever() to
-  // see if it is legal to inline the callsite.
-  InlineCost Cost =
-      getInlineCost(CB, Params, GetTTI(*CalledFunction), GetAC, GetTLI);
-  if (Cost.isNever()) {
-    ORE->emit(OptimizationRemarkAnalysis(CSINLINE_DEBUG, "InlineFail", DLoc, BB)
-              << "incompatible inlining");
-    return false;
-  }
-  InlineFunctionInfo IFI(nullptr, GetAC);
-  if (InlineFunction(CB, IFI).isSuccess()) {
-    // The call to InlineFunction erases I, so we can't pass it here.
-    emitInlinedInto(*ORE, DLoc, BB, *CalledFunction, *BB->getParent(), Cost,
-                    true, CSINLINE_DEBUG);
-    if (ProfileIsCS)
-      ContextTracker->markContextSamplesInlined(CalleeSamples);
-    ++NumCSInlined;
-    return true;
+  } else {
+    LLVM_DEBUG(dbgs() << "\nFailed to promote indirect call to "
+                      << Candidate.CalleeSamples->getFuncName() << " because "
+                      << Reason << "\n");
   }
   return false;
 }
@@ -1206,10 +1180,11 @@ bool SampleProfileLoader::inlineHotFunctions(
          "ProfAccForSymsInList should be false when profile-sample-accurate "
          "is enabled");
 
-  DenseMap<CallBase *, const FunctionSamples *> localNotInlinedCallSites;
+  DenseMap<CallBase *, const FunctionSamples *> LocalNotInlinedCallSites;
   bool Changed = false;
-  while (true) {
-    bool LocalChanged = false;
+  bool LocalChanged = true;
+  while (LocalChanged) {
+    LocalChanged = false;
     SmallVector<CallBase *, 10> CIS;
     for (auto &BB : F) {
       bool Hot = false;
@@ -1223,7 +1198,7 @@ bool SampleProfileLoader::inlineHotFunctions(
                    "GUIDToFuncNameMap has to be populated");
             AllCandidates.push_back(CB);
             if (FS->getEntrySamples() > 0 || ProfileIsCS)
-              localNotInlinedCallSites.try_emplace(CB, FS);
+              LocalNotInlinedCallSites.try_emplace(CB, FS);
             if (callsiteIsHot(FS, PSI))
               Hot = true;
             else if (shouldInlineColdCallee(*CB))
@@ -1241,6 +1216,11 @@ bool SampleProfileLoader::inlineHotFunctions(
     }
     for (CallBase *I : CIS) {
       Function *CalledFunction = I->getCalledFunction();
+      InlineCandidate Candidate = {I,
+                                   LocalNotInlinedCallSites.count(I)
+                                       ? LocalNotInlinedCallSites[I]
+                                       : nullptr,
+                                   0 /* dummy count */};
       // Do not inline recursive calls.
       if (CalledFunction == &F)
         continue;
@@ -1257,30 +1237,16 @@ bool SampleProfileLoader::inlineHotFunctions(
           if (!callsiteIsHot(FS, PSI))
             continue;
 
-          const char *Reason = nullptr;
-          auto CalleeFunctionName = FS->getFuncName();
-          if (CallBase *DI =
-                  tryPromoteIndirectCall(F, CalleeFunctionName, Sum,
-                                         FS->getEntrySamples(), I, Reason)) {
-            PromotedInsns.insert(I);
-            // If profile mismatches, we should not attempt to inline DI.
-            if ((isa<CallInst>(DI) || isa<InvokeInst>(DI)) &&
-                inlineCallInstruction(cast<CallBase>(*DI), FS)) {
-              localNotInlinedCallSites.erase(I);
-              LocalChanged = true;
-            }
-          } else {
-            LLVM_DEBUG(dbgs()
-                       << "\nFailed to promote indirect call to "
-                       << CalleeFunctionName << " because " << Reason << "\n");
+          Candidate = {I, FS, FS->getEntrySamples()};
+          if (tryPromoteAndInlineCandidate(F, Candidate, Sum, PromotedInsns)) {
+            LocalNotInlinedCallSites.erase(I);
+            LocalChanged = true;
           }
         }
       } else if (CalledFunction && CalledFunction->getSubprogram() &&
                  !CalledFunction->isDeclaration()) {
-        if (inlineCallInstruction(*I, localNotInlinedCallSites.count(I)
-                                          ? localNotInlinedCallSites[I]
-                                          : nullptr)) {
-          localNotInlinedCallSites.erase(I);
+        if (tryInlineCandidate(Candidate)) {
+          LocalNotInlinedCallSites.erase(I);
           LocalChanged = true;
         }
       } else if (LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink) {
@@ -1288,11 +1254,7 @@ bool SampleProfileLoader::inlineHotFunctions(
             InlinedGUIDs, F.getParent(), PSI->getOrCompHotCountThreshold());
       }
     }
-    if (LocalChanged) {
-      Changed = true;
-    } else {
-      break;
-    }
+    Changed |= LocalChanged;
   }
 
   // For CS profile, profile for not inlined context will be merged when
@@ -1301,7 +1263,7 @@ bool SampleProfileLoader::inlineHotFunctions(
     return Changed;
 
   // Accumulate not inlined callsite information into notInlinedSamples
-  for (const auto &Pair : localNotInlinedCallSites) {
+  for (const auto &Pair : LocalNotInlinedCallSites) {
     CallBase *I = Pair.getFirst();
     Function *Callee = I->getCalledFunction();
     if (!Callee || Callee->isDeclaration())
@@ -1347,7 +1309,7 @@ bool SampleProfileLoader::inlineHotFunctions(
 }
 
 bool SampleProfileLoader::tryInlineCandidate(
-    InlineCandidate &Candidate, SmallVector<CallBase *, 8> &InlinedCallSites) {
+    InlineCandidate &Candidate, SmallVector<CallBase *, 8> *InlinedCallSites) {
 
   CallBase &CB = *Candidate.CallInstr;
   Function *CalledFunction = CB.getCalledFunction();
@@ -1372,9 +1334,11 @@ bool SampleProfileLoader::tryInlineCandidate(
                     true, CSINLINE_DEBUG);
 
     // Now populate the list of newly exposed call sites.
-    InlinedCallSites.clear();
-    for (auto &I : IFI.InlinedCallSites)
-      InlinedCallSites.push_back(I);
+    if (InlinedCallSites) {
+      InlinedCallSites->clear();
+      for (auto &I : IFI.InlinedCallSites)
+        InlinedCallSites->push_back(I);
+    }
 
     if (ProfileIsCS)
       ContextTracker->markContextSamplesInlined(Candidate.CalleeSamples);
@@ -1409,8 +1373,6 @@ bool SampleProfileLoader::getInlineCandidate(InlineCandidate *NewCandidate,
 
 InlineCost
 SampleProfileLoader::shouldInlineCandidate(InlineCandidate &Candidate) {
-  assert(ProfileIsCS && "Prioritiy based inliner only works with CSSPGO now");
-
   std::unique_ptr<InlineAdvice> Advice = nullptr;
   if (ExternalInlineAdvisor) {
     Advice = ExternalInlineAdvisor->getAdvice(*Candidate.CallInstr);
@@ -1446,18 +1408,16 @@ SampleProfileLoader::shouldInlineCandidate(InlineCandidate &Candidate) {
   InlineCost Cost = getInlineCost(*Candidate.CallInstr, Callee, Params,
                                   GetTTI(*Callee), GetAC, GetTLI);
 
+  // Honor always inline and never inline from call analyzer
+  if (Cost.isNever() || Cost.isAlways())
+    return Cost;
+
   // For old FDO inliner, we inline the call site as long as cost is not
   // "Never". The cost-benefit check is done earlier.
   if (!CallsitePrioritizedInline) {
-    if (Cost.isNever())
-      return Cost;
-    return InlineCost::getAlways("hot callsite previously inlined");
+    return InlineCost::get(Cost.getCost(), INT_MAX);
   }
 
-  // Honor always inline and never inline from call analyzer
-  if (Cost.isNever() || Cost.isAlways())
-    return Cost;
-
   // Otherwise only use the cost from call analyzer, but overwite threshold with
   // Sample PGO threshold.
   return InlineCost::get(Cost.getCost(), SampleThreshold);
@@ -1542,34 +1502,23 @@ bool SampleProfileLoader::inlineHotFunctionsWithPriority(
         // fixed, but we generate different types).
         if (!PSI->isHotCount(EntryCountDistributed))
           break;
-        const char *Reason = nullptr;
-        auto CalleeFunctionName = FS->getFuncName();
-        if (CallBase *DI = tryPromoteIndirectCall(
-                F, CalleeFunctionName, Sum, EntryCountDistributed, I, Reason)) {
-          // Attach function profile for promoted indirect callee, and update
-          // call site count for the promoted inline candidate too.
-          Candidate = {DI, FS, EntryCountDistributed};
-          PromotedInsns.insert(I);
-          SmallVector<CallBase *, 8> InlinedCallSites;
-          // If profile mismatches, we should not attempt to inline DI.
-          if ((isa<CallInst>(DI) || isa<InvokeInst>(DI)) &&
-              tryInlineCandidate(Candidate, InlinedCallSites)) {
-            for (auto *CB : InlinedCallSites) {
-              if (getInlineCandidate(&NewCandidate, CB))
-                CQueue.emplace(NewCandidate);
-            }
-            Changed = true;
+        SmallVector<CallBase *, 8> InlinedCallSites;
+        // Attach function profile for promoted indirect callee, and update
+        // call site count for the promoted inline candidate too.
+        Candidate = {I, FS, EntryCountDistributed};
+        if (tryPromoteAndInlineCandidate(F, Candidate, Sum, PromotedInsns,
+                                         &InlinedCallSites)) {
+          for (auto *CB : InlinedCallSites) {
+            if (getInlineCandidate(&NewCandidate, CB))
+              CQueue.emplace(NewCandidate);
           }
-        } else {
-          LLVM_DEBUG(dbgs()
-                     << "\nFailed to promote indirect call to "
-                     << CalleeFunctionName << " because " << Reason << "\n");
+          Changed = true;
         }
       }
     } else if (CalledFunction && CalledFunction->getSubprogram() &&
                !CalledFunction->isDeclaration()) {
       SmallVector<CallBase *, 8> InlinedCallSites;
-      if (tryInlineCandidate(Candidate, InlinedCallSites)) {
+      if (tryInlineCandidate(Candidate, &InlinedCallSites)) {
         for (auto *CB : InlinedCallSites) {
           if (getInlineCandidate(&NewCandidate, CB))
             CQueue.emplace(NewCandidate);
diff --git a/llvm/test/Transforms/SampleProfile/pseudo-probe-inline.ll b/llvm/test/Transforms/SampleProfile/pseudo-probe-inline.ll
index d47359fa0b5f..5359fd4da067 100644
--- a/llvm/test/Transforms/SampleProfile/pseudo-probe-inline.ll
+++ b/llvm/test/Transforms/SampleProfile/pseudo-probe-inline.ll
@@ -89,7 +89,7 @@ if.end:
 ;YAML-NEXT:    - String:          '(cost='
 ;YAML-NEXT:    - Cost:            '15'
 ;YAML-NEXT:    - String:          ', threshold='
-;YAML-NEXT:    - Threshold:       '225'
+;YAML-NEXT:    - Threshold:       '2147483647'
 ;YAML-NEXT:    - String:          ')'
 ;YAML-NEXT:    - String:          ' at callsite '
 ;YAML-NEXT:    - String:          foo
diff --git a/llvm/test/Transforms/SampleProfile/remarks.ll b/llvm/test/Transforms/SampleProfile/remarks.ll
index 3add1e74abaa..46f016433b20 100644
--- a/llvm/test/Transforms/SampleProfile/remarks.ll
+++ b/llvm/test/Transforms/SampleProfile/remarks.ll
@@ -21,7 +21,7 @@
 
 ; We are expecting foo() to be inlined in main() (almost all the cycles are
 ; spent inside foo).
-; CHECK: remark: remarks.cc:13:21: _Z3foov inlined into main to match profiling context with (cost=130, threshold=225) at callsite main:0:21;
+; CHECK: remark: remarks.cc:13:21: _Z3foov inlined into main to match profiling context with (cost=130, threshold=2147483647) at callsite main:0:21;
 ; CHECK: remark: remarks.cc:9:19: rand inlined into main to match profiling context with (cost=always): always inline attribute at callsite _Z3foov:6:19 @ main:0:21;
 
 ; The back edge for the loop is the hottest edge in the loop subgraph.
@@ -47,7 +47,7 @@
 ;YAML-NEXT:    - String:          '(cost='
 ;YAML-NEXT:    - Cost:            '130'
 ;YAML-NEXT:    - String:          ', threshold='
-;YAML-NEXT:    - Threshold:       '225'
+;YAML-NEXT:    - Threshold:       '2147483647'
 ;YAML-NEXT:    - String:          ')'
 ;YAML-NEXT:    - String:          ' at callsite '
 ;YAML-NEXT:    - String:          main

From a9157c5628dc89b13936bbc8eef261cb02d63d40 Mon Sep 17 00:00:00 2001
From: Hongtao Yu <hoy@fb.com>
Date: Fri, 11 Dec 2020 12:18:31 -0800
Subject: [PATCH 057/318] [CSSPGO] Introducing distribution factor for pseudo
 probe.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Sample re-annotation is required in LTO time to achieve a reasonable post-inline profile quality. However, we have seen that such LTO-time re-annotation degrades profile quality. This is mainly caused by preLTO code duplication that is done by passes such as loop unrolling, jump threading, indirect call promotion etc, where samples corresponding to a source location are aggregated multiple times due to the duplicates. In this change we are introducing a concept of distribution factor for pseudo probes so that samples can be distributed for duplicated probes scaled by a factor. We hope that optimizations duplicating code well-maintain the branch frequency information (BFI) based on which probe distribution factors are calculated. Distribution factors are updated at the end of preLTO pipeline to reflect an estimated portion of the real execution count.

This change also introduces a pseudo probe verifier that can be run after each IR passes to detect duplicated pseudo probes.

A saturated distribution factor stands for 1.0. A pesudo probe will carry a factor with the value ranged from 0.0 to 1.0. A 64-bit integral distribution factor field that represents [0.0, 1.0] is associated to each block probe. Unfortunately this cannot be done for callsite probes due to the size limitation of a 32-bit Dwarf discriminator. A 7-bit distribution factor is used instead.

Changes are also needed to the sample profile inliner to deal with prorated callsite counts. Call sites duplicated by PreLTO passes, when later on inlined in LTO time, should have the callees’s probe prorated based on the Prelink-computed distribution factors. The distribution factors should also be taken into account when computing hotness for inline candidates. Also, Indirect call promotion results in multiple callisites. The original samples should be distributed across them. This is fixed by adjusting the callisites' distribution factors.

Reviewed By: wmi

Differential Revision: https://reviews.llvm.org/D93264

(cherry picked from commit 3d89b3cbec230633e8228787819b15116c1a1730)
---
 clang/test/CodeGen/pseudo-probe-emit.c        |   8 +-
 llvm/include/llvm/IR/IntrinsicInst.h          |   8 +-
 llvm/include/llvm/IR/Intrinsics.td            |   2 +-
 llvm/include/llvm/IR/PseudoProbe.h            |  27 ++-
 .../llvm/Passes/StandardInstrumentations.h    |   2 +
 llvm/include/llvm/ProfileData/SampleProf.h    |  10 ++
 .../llvm/Transforms/IPO/SampleProfileProbe.h  |  41 +++++
 llvm/lib/IR/PseudoProbe.cpp                   |  41 +++++
 llvm/lib/Passes/PassBuilder.cpp               |   6 +
 llvm/lib/Passes/PassRegistry.def              |   1 +
 llvm/lib/Passes/StandardInstrumentations.cpp  |   1 +
 llvm/lib/Transforms/IPO/SampleProfile.cpp     | 108 +++++++++---
 .../lib/Transforms/IPO/SampleProfileProbe.cpp | 162 +++++++++++++++++-
 .../Inputs/pseudo-probe-update.prof           |   8 +
 .../SampleProfile/pseudo-probe-emit-inline.ll |  20 +--
 .../SampleProfile/pseudo-probe-emit.ll        |  22 ++-
 .../SampleProfile/pseudo-probe-inline.ll      |  38 ++--
 .../SampleProfile/pseudo-probe-profile.ll     |  42 ++++-
 .../SampleProfile/pseudo-probe-update.ll      |  45 +++++
 .../SampleProfile/pseudo-probe-verify.ll      |  77 +++++++++
 20 files changed, 595 insertions(+), 74 deletions(-)
 create mode 100644 llvm/test/Transforms/SampleProfile/Inputs/pseudo-probe-update.prof
 create mode 100644 llvm/test/Transforms/SampleProfile/pseudo-probe-update.ll
 create mode 100644 llvm/test/Transforms/SampleProfile/pseudo-probe-verify.ll

diff --git a/clang/test/CodeGen/pseudo-probe-emit.c b/clang/test/CodeGen/pseudo-probe-emit.c
index 059673b6992e..fccc8f04844d 100644
--- a/clang/test/CodeGen/pseudo-probe-emit.c
+++ b/clang/test/CodeGen/pseudo-probe-emit.c
@@ -6,12 +6,12 @@ void bar();
 void go();
 
 void foo(int x) {
-  // CHECK: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 1, i32 0)
+  // CHECK: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 1, i32 0, i64 -1)
   if (x == 0)
-    // CHECK: call void @llvm.pseudoprobe(i64 [[#GUID]], i64 2, i32 0)
+    // CHECK: call void @llvm.pseudoprobe(i64 [[#GUID]], i64 2, i32 0, i64 -1)
     bar();
   else
-    // CHECK: call void @llvm.pseudoprobe(i64 [[#GUID]], i64 3, i32 0)
+    // CHECK: call void @llvm.pseudoprobe(i64 [[#GUID]], i64 3, i32 0, i64 -1)
     go();
-  // CHECK: call void @llvm.pseudoprobe(i64 [[#GUID]], i64 4, i32 0)
+  // CHECK: call void @llvm.pseudoprobe(i64 [[#GUID]], i64 4, i32 0, i64 -1)
 }
diff --git a/llvm/include/llvm/IR/IntrinsicInst.h b/llvm/include/llvm/IR/IntrinsicInst.h
index 9d68f3fdde6c..df3a1d568756 100644
--- a/llvm/include/llvm/IR/IntrinsicInst.h
+++ b/llvm/include/llvm/IR/IntrinsicInst.h
@@ -981,12 +981,16 @@ class PseudoProbeInst : public IntrinsicInst {
     return cast<ConstantInt>(const_cast<Value *>(getArgOperand(0)));
   }
 
+  ConstantInt *getIndex() const {
+    return cast<ConstantInt>(const_cast<Value *>(getArgOperand(1)));
+  }
+
   ConstantInt *getAttributes() const {
     return cast<ConstantInt>(const_cast<Value *>(getArgOperand(2)));
   }
 
-  ConstantInt *getIndex() const {
-    return cast<ConstantInt>(const_cast<Value *>(getArgOperand(1)));
+  ConstantInt *getFactor() const {
+    return cast<ConstantInt>(const_cast<Value *>(getArgOperand(3)));
   }
 };
 
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index b2bfc6e6f9e6..21307ed1bd91 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -1298,7 +1298,7 @@ def int_sideeffect : DefaultAttrsIntrinsic<[], [], [IntrInaccessibleMemOnly, Int
 // Like the sideeffect intrinsic defined above, this intrinsic is treated by the 
 // optimizer as having opaque side effects so that it won't be get rid of or moved 
 // out of the block it probes.
-def int_pseudoprobe : Intrinsic<[], [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty],
+def int_pseudoprobe : Intrinsic<[], [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, llvm_i64_ty],
                                     [IntrInaccessibleMemOnly, IntrWillReturn]>;
 
 // Intrinsics to support half precision floating point format
diff --git a/llvm/include/llvm/IR/PseudoProbe.h b/llvm/include/llvm/IR/PseudoProbe.h
index e0370c264102..5165e80caa2d 100644
--- a/llvm/include/llvm/IR/PseudoProbe.h
+++ b/llvm/include/llvm/IR/PseudoProbe.h
@@ -16,28 +16,39 @@
 #include "llvm/ADT/Optional.h"
 #include <cassert>
 #include <cstdint>
+#include <limits>
 
 namespace llvm {
 
 class Instruction;
+class BasicBlock;
 
 constexpr const char *PseudoProbeDescMetadataName = "llvm.pseudo_probe_desc";
 
 enum class PseudoProbeType { Block = 0, IndirectCall, DirectCall };
 
+// The saturated distrution factor representing 100% for block probes.
+constexpr static uint64_t PseudoProbeFullDistributionFactor =
+    std::numeric_limits<uint64_t>::max();
+
 struct PseudoProbeDwarfDiscriminator {
+public:
   // The following APIs encodes/decodes per-probe information to/from a
   // 32-bit integer which is organized as:
   //  [2:0] - 0x7, this is reserved for regular discriminator,
   //          see DWARF discriminator encoding rule
   //  [18:3] - probe id
-  //  [25:19] - reserved
+  //  [25:19] - probe distribution factor
   //  [28:26] - probe type, see PseudoProbeType
   //  [31:29] - reserved for probe attributes
-  static uint32_t packProbeData(uint32_t Index, uint32_t Type) {
+  static uint32_t packProbeData(uint32_t Index, uint32_t Type, uint32_t Flags,
+                                uint32_t Factor) {
     assert(Index <= 0xFFFF && "Probe index too big to encode, exceeding 2^16");
     assert(Type <= 0x7 && "Probe type too big to encode, exceeding 7");
-    return (Index << 3) | (Type << 26) | 0x7;
+    assert(Flags <= 0x7);
+    assert(Factor <= 100 &&
+           "Probe distribution factor too big to encode, exceeding 100");
+    return (Index << 3) | (Factor << 19) | (Type << 26) | 0x7;
   }
 
   static uint32_t extractProbeIndex(uint32_t Value) {
@@ -51,16 +62,26 @@ struct PseudoProbeDwarfDiscriminator {
   static uint32_t extractProbeAttributes(uint32_t Value) {
     return (Value >> 29) & 0x7;
   }
+
+  static uint32_t extractProbeFactor(uint32_t Value) {
+    return (Value >> 19) & 0x7F;
+  }
+
+  // The saturated distrution factor representing 100% for callsites.
+  constexpr static uint8_t FullDistributionFactor = 100;
 };
 
 struct PseudoProbe {
   uint32_t Id;
   uint32_t Type;
   uint32_t Attr;
+  float Factor;
 };
 
 Optional<PseudoProbe> extractProbe(const Instruction &Inst);
 
+void setProbeDistributionFactor(Instruction &Inst, float Factor);
+
 } // end namespace llvm
 
 #endif // LLVM_IR_PSEUDOPROBE_H
diff --git a/llvm/include/llvm/Passes/StandardInstrumentations.h b/llvm/include/llvm/Passes/StandardInstrumentations.h
index 795a980878e2..61c86b0468f2 100644
--- a/llvm/include/llvm/Passes/StandardInstrumentations.h
+++ b/llvm/include/llvm/Passes/StandardInstrumentations.h
@@ -22,6 +22,7 @@
 #include "llvm/IR/PassTimingInfo.h"
 #include "llvm/IR/ValueHandle.h"
 #include "llvm/Support/CommandLine.h"
+#include "llvm/Transforms/IPO/SampleProfileProbe.h"
 
 #include <string>
 #include <utility>
@@ -273,6 +274,7 @@ class StandardInstrumentations {
   OptBisectInstrumentation OptBisect;
   PreservedCFGCheckerInstrumentation PreservedCFGChecker;
   IRChangedPrinter PrintChangedIR;
+  PseudoProbeVerifier PseudoProbeVerification;
   VerifyInstrumentation Verify;
 
   bool VerifyEach;
diff --git a/llvm/include/llvm/ProfileData/SampleProf.h b/llvm/include/llvm/ProfileData/SampleProf.h
index 346bc4c81d86..25d5b2376c11 100644
--- a/llvm/include/llvm/ProfileData/SampleProf.h
+++ b/llvm/include/llvm/ProfileData/SampleProf.h
@@ -347,6 +347,16 @@ class SampleRecord {
     return SortedTargets;
   }
 
+  /// Prorate call targets by a distribution factor.
+  static const CallTargetMap adjustCallTargets(const CallTargetMap &Targets,
+                                               float DistributionFactor) {
+    CallTargetMap AdjustedTargets;
+    for (const auto &I : Targets) {
+      AdjustedTargets[I.first()] = I.second * DistributionFactor;
+    }
+    return AdjustedTargets;
+  }
+
   /// Merge the samples in \p Other into this record.
   /// Optionally scale sample counts by \p Weight.
   sampleprof_error merge(const SampleRecord &Other, uint64_t Weight = 1) {
diff --git a/llvm/include/llvm/Transforms/IPO/SampleProfileProbe.h b/llvm/include/llvm/Transforms/IPO/SampleProfileProbe.h
index 78117fd4a9c2..cab893b50d19 100644
--- a/llvm/include/llvm/Transforms/IPO/SampleProfileProbe.h
+++ b/llvm/include/llvm/Transforms/IPO/SampleProfileProbe.h
@@ -16,6 +16,10 @@
 #define LLVM_TRANSFORMS_IPO_SAMPLEPROFILEPROBE_H
 
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/Analysis/CallGraphSCCPass.h"
+#include "llvm/Analysis/LazyCallGraph.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/IR/PassInstrumentation.h"
 #include "llvm/IR/PassManager.h"
 #include "llvm/IR/PseudoProbe.h"
 #include "llvm/ProfileData/SampleProf.h"
@@ -29,6 +33,8 @@ class Module;
 using namespace sampleprof;
 using BlockIdMap = std::unordered_map<BasicBlock *, uint32_t>;
 using InstructionIdMap = std::unordered_map<Instruction *, uint32_t>;
+using ProbeFactorMap = std::unordered_map<uint64_t, float>;
+using FuncProbeFactorMap = StringMap<ProbeFactorMap>;
 
 enum class PseudoProbeReservedId { Invalid = 0, Last = Invalid };
 
@@ -43,6 +49,33 @@ class PseudoProbeDescriptor {
   uint64_t getFunctionHash() const { return FunctionHash; }
 };
 
+// A pseudo probe verifier that can be run after each IR passes to detect the
+// violation of updating probe factors. In principle, the sum of distribution
+// factor for a probe should be identical before and after a pass. For a
+// function pass, the factor sum for a probe would be typically 100%.
+class PseudoProbeVerifier {
+public:
+  void registerCallbacks(PassInstrumentationCallbacks &PIC);
+
+  // Implementation of pass instrumentation callbacks for new pass manager.
+  void runAfterPass(StringRef PassID, Any IR);
+
+private:
+  // Allow a little bias due the rounding to integral factors.
+  constexpr static float DistributionFactorVariance = 0.02;
+  // Distribution factors from last pass.
+  FuncProbeFactorMap FunctionProbeFactors;
+
+  void collectProbeFactors(const BasicBlock *BB, ProbeFactorMap &ProbeFactors);
+  void runAfterPass(const Module *M);
+  void runAfterPass(const LazyCallGraph::SCC *C);
+  void runAfterPass(const Function *F);
+  void runAfterPass(const Loop *L);
+  bool shouldVerifyFunction(const Function *F);
+  void verifyProbeFactors(const Function *F,
+                          const ProbeFactorMap &ProbeFactors);
+};
+
 // This class serves sample counts correlation for SampleProfileLoader by
 // analyzing pseudo probes and their function descriptors injected by
 // SampleProfileProber.
@@ -102,5 +135,13 @@ class SampleProfileProbePass : public PassInfoMixin<SampleProfileProbePass> {
   PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
 };
 
+class PseudoProbeUpdatePass : public PassInfoMixin<PseudoProbeUpdatePass> {
+  void runOnFunction(Function &F, FunctionAnalysisManager &FAM);
+
+public:
+  PseudoProbeUpdatePass() {}
+  PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
+};
+
 } // end namespace llvm
 #endif // LLVM_TRANSFORMS_IPO_SAMPLEPROFILEPROBE_H
diff --git a/llvm/lib/IR/PseudoProbe.cpp b/llvm/lib/IR/PseudoProbe.cpp
index 804214f06e7a..80d2963938d4 100644
--- a/llvm/lib/IR/PseudoProbe.cpp
+++ b/llvm/lib/IR/PseudoProbe.cpp
@@ -35,6 +35,9 @@ Optional<PseudoProbe> extractProbeFromDiscriminator(const Instruction &Inst) {
           PseudoProbeDwarfDiscriminator::extractProbeType(Discriminator);
       Probe.Attr =
           PseudoProbeDwarfDiscriminator::extractProbeAttributes(Discriminator);
+      Probe.Factor =
+          PseudoProbeDwarfDiscriminator::extractProbeFactor(Discriminator) /
+          (float)PseudoProbeDwarfDiscriminator::FullDistributionFactor;
       return Probe;
     }
   }
@@ -47,6 +50,8 @@ Optional<PseudoProbe> extractProbe(const Instruction &Inst) {
     Probe.Id = II->getIndex()->getZExtValue();
     Probe.Type = (uint32_t)PseudoProbeType::Block;
     Probe.Attr = II->getAttributes()->getZExtValue();
+    Probe.Factor = II->getFactor()->getZExtValue() /
+                   (float)PseudoProbeFullDistributionFactor;
     return Probe;
   }
 
@@ -55,4 +60,40 @@ Optional<PseudoProbe> extractProbe(const Instruction &Inst) {
 
   return None;
 }
+
+void setProbeDistributionFactor(Instruction &Inst, float Factor) {
+  assert(Factor >= 0 && Factor <= 1 &&
+         "Distribution factor must be in [0, 1.0]");
+  if (auto *II = dyn_cast<PseudoProbeInst>(&Inst)) {
+    IRBuilder<> Builder(&Inst);
+    uint64_t IntFactor = PseudoProbeFullDistributionFactor;
+    if (Factor < 1)
+      IntFactor *= Factor;
+    auto OrigFactor = II->getFactor()->getZExtValue();
+    if (IntFactor != OrigFactor)
+      II->replaceUsesOfWith(II->getFactor(), Builder.getInt64(IntFactor));
+  } else if (isa<CallBase>(&Inst) && !isa<IntrinsicInst>(&Inst)) {
+    if (const DebugLoc &DLoc = Inst.getDebugLoc()) {
+      const DILocation *DIL = DLoc;
+      auto Discriminator = DIL->getDiscriminator();
+      if (DILocation::isPseudoProbeDiscriminator(Discriminator)) {
+        auto Index =
+            PseudoProbeDwarfDiscriminator::extractProbeIndex(Discriminator);
+        auto Type =
+            PseudoProbeDwarfDiscriminator::extractProbeType(Discriminator);
+        auto Attr = PseudoProbeDwarfDiscriminator::extractProbeAttributes(
+            Discriminator);
+        // Round small factors to 0 to avoid over-counting.
+        uint32_t IntFactor =
+            PseudoProbeDwarfDiscriminator::FullDistributionFactor;
+        if (Factor < 1)
+          IntFactor *= Factor;
+        uint32_t V = PseudoProbeDwarfDiscriminator::packProbeData(
+            Index, Type, Attr, IntFactor);
+        DIL = DIL->cloneWithDiscriminator(V);
+        Inst.setDebugLoc(DIL);
+      }
+    }
+  }
+}
 } // namespace llvm
diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
index d4c4c6e01ef5..6c1a7c75d30a 100644
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -1423,6 +1423,9 @@ PassBuilder::buildPerModuleDefaultPipeline(OptimizationLevel Level,
   // Now add the optimization pipeline.
   MPM.addPass(buildModuleOptimizationPipeline(Level, LTOPreLink));
 
+  if (PGOOpt && PGOOpt->PseudoProbeForProfiling)
+    MPM.addPass(PseudoProbeUpdatePass());
+
   // Emit annotation remarks.
   addAnnotationRemarksPass(MPM);
 
@@ -1477,6 +1480,9 @@ PassBuilder::buildThinLTOPreLinkDefaultPipeline(OptimizationLevel Level) {
   if (PTO.Coroutines)
     MPM.addPass(createModuleToFunctionPassAdaptor(CoroCleanupPass()));
 
+  if (PGOOpt && PGOOpt->PseudoProbeForProfiling)
+    MPM.addPass(PseudoProbeUpdatePass());
+
   // Emit annotation remarks.
   addAnnotationRemarksPass(MPM);
 
diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def
index 860bfade733d..877cb9ed13b3 100644
--- a/llvm/lib/Passes/PassRegistry.def
+++ b/llvm/lib/Passes/PassRegistry.def
@@ -119,6 +119,7 @@ MODULE_PASS("kasan-module", ModuleAddressSanitizerPass(/*CompileKernel=*/true, f
 MODULE_PASS("sancov-module", ModuleSanitizerCoveragePass())
 MODULE_PASS("memprof-module", ModuleMemProfilerPass())
 MODULE_PASS("poison-checking", PoisonCheckingPass())
+MODULE_PASS("pseudo-probe-update", PseudoProbeUpdatePass())
 #undef MODULE_PASS
 
 #ifndef CGSCC_ANALYSIS
diff --git a/llvm/lib/Passes/StandardInstrumentations.cpp b/llvm/lib/Passes/StandardInstrumentations.cpp
index a8bfe02d4432..6795aed7b04e 100644
--- a/llvm/lib/Passes/StandardInstrumentations.cpp
+++ b/llvm/lib/Passes/StandardInstrumentations.cpp
@@ -882,6 +882,7 @@ void StandardInstrumentations::registerCallbacks(
   OptBisect.registerCallbacks(PIC);
   PreservedCFGChecker.registerCallbacks(PIC);
   PrintChangedIR.registerCallbacks(PIC);
+  PseudoProbeVerification.registerCallbacks(PIC);
   if (VerifyEach)
     Verify.registerCallbacks(PIC);
 }
diff --git a/llvm/lib/Transforms/IPO/SampleProfile.cpp b/llvm/lib/Transforms/IPO/SampleProfile.cpp
index 2cfefd3a18ea..b2a9127773c3 100644
--- a/llvm/lib/Transforms/IPO/SampleProfile.cpp
+++ b/llvm/lib/Transforms/IPO/SampleProfile.cpp
@@ -108,6 +108,8 @@ STATISTIC(NumCSNotInlined,
 STATISTIC(NumMismatchedProfile,
           "Number of functions with CFG mismatched profile");
 STATISTIC(NumMatchedProfile, "Number of functions with CFG matched profile");
+STATISTIC(NumDuplicatedInlinesite,
+          "Number of inlined callsites with a partial distribution factor");
 
 STATISTIC(NumCSInlinedHitMinLimit,
           "Number of functions with FDO inline stopped due to min size limit");
@@ -358,7 +360,14 @@ class GUIDToFuncNameMapper {
 struct InlineCandidate {
   CallBase *CallInstr;
   const FunctionSamples *CalleeSamples;
+  // Prorated callsite count, which will be used to guide inlining. For example,
+  // if a callsite is duplicated in LTO prelink, then in LTO postlink the two
+  // copies will get their own distribution factors and their prorated counts
+  // will be used to decide if they should be inlined independently.
   uint64_t CallsiteCount;
+  // Call site distribution factor to prorate the profile samples for a
+  // duplicated callsite. Default value is 1.0.
+  float CallsiteDistribution;
 };
 
 // Inline candidate comparer using call site weight
@@ -418,8 +427,8 @@ class SampleProfileLoader {
   const FunctionSamples *findFunctionSamples(const Instruction &I) const;
   // Attempt to promote indirect call and also inline the promoted call
   bool tryPromoteAndInlineCandidate(
-      Function &F, InlineCandidate &Candidate, uint64_t &Sum,
-      DenseSet<Instruction *> &PromotedInsns,
+      Function &F, InlineCandidate &Candidate, uint64_t SumOrigin,
+      uint64_t &Sum, DenseSet<Instruction *> &PromotedInsns,
       SmallVector<CallBase *, 8> *InlinedCallSites = nullptr);
   bool inlineHotFunctions(Function &F,
                           DenseSet<GlobalValue::GUID> &InlinedGUIDs);
@@ -886,7 +895,7 @@ ErrorOr<uint64_t> SampleProfileLoader::getProbeWeight(const Instruction &Inst) {
 
   const ErrorOr<uint64_t> &R = FS->findSamplesAt(Probe->Id, 0);
   if (R) {
-    uint64_t Samples = R.get();
+    uint64_t Samples = R.get() * Probe->Factor;
     bool FirstMark = CoverageTracker.markSamplesUsed(FS, Probe->Id, 0, Samples);
     if (FirstMark) {
       ORE->emit([&]() {
@@ -894,13 +903,17 @@ ErrorOr<uint64_t> SampleProfileLoader::getProbeWeight(const Instruction &Inst) {
         Remark << "Applied " << ore::NV("NumSamples", Samples);
         Remark << " samples from profile (ProbeId=";
         Remark << ore::NV("ProbeId", Probe->Id);
+        Remark << ", Factor=";
+        Remark << ore::NV("Factor", Probe->Factor);
+        Remark << ", OriginalSamples=";
+        Remark << ore::NV("OriginalSamples", R.get());
         Remark << ")";
         return Remark;
       });
     }
-
     LLVM_DEBUG(dbgs() << "    " << Probe->Id << ":" << Inst
-                      << " - weight: " << R.get() << ")\n");
+                      << " - weight: " << R.get() << " - factor: "
+                      << format("%0.2f", Probe->Factor) << ")\n");
     return Samples;
   }
   return R;
@@ -1085,7 +1098,7 @@ SampleProfileLoader::findFunctionSamples(const Instruction &Inst) const {
 /// \param InlinedCallSite  Output vector for new call sites exposed after
 /// inlining.
 bool SampleProfileLoader::tryPromoteAndInlineCandidate(
-    Function &F, InlineCandidate &Candidate, uint64_t &Sum,
+    Function &F, InlineCandidate &Candidate, uint64_t SumOrigin, uint64_t &Sum,
     DenseSet<Instruction *> &PromotedInsns,
     SmallVector<CallBase *, 8> *InlinedCallSite) {
   const char *Reason = "Callee function not available";
@@ -1106,10 +1119,28 @@ bool SampleProfileLoader::tryPromoteAndInlineCandidate(
                                   Candidate.CallsiteCount, Sum, false, ORE);
     if (DI) {
       Sum -= Candidate.CallsiteCount;
+      // Prorate the indirect callsite distribution.
+      // Do not update the promoted direct callsite distribution at this
+      // point since the original distribution combined with the callee
+      // profile will be used to prorate callsites from the callee if
+      // inlined. Once not inlined, the direct callsite distribution should
+      // be prorated so that the it will reflect the real callsite counts.
+      setProbeDistributionFactor(*Candidate.CallInstr,
+                                 Candidate.CallsiteDistribution * Sum /
+                                     SumOrigin);
       PromotedInsns.insert(Candidate.CallInstr);
       Candidate.CallInstr = DI;
-      if (isa<CallInst>(DI) || isa<InvokeInst>(DI))
-        return tryInlineCandidate(Candidate, InlinedCallSite);
+      if (isa<CallInst>(DI) || isa<InvokeInst>(DI)) {
+        bool Inlined = tryInlineCandidate(Candidate, InlinedCallSite);
+        if (!Inlined) {
+          // Prorate the direct callsite distribution so that it reflects real
+          // callsite counts.
+          setProbeDistributionFactor(*DI, Candidate.CallsiteDistribution *
+                                              Candidate.CallsiteCount /
+                                              SumOrigin);
+        }
+        return Inlined;
+      }
     }
   } else {
     LLVM_DEBUG(dbgs() << "\nFailed to promote indirect call to "
@@ -1216,11 +1247,11 @@ bool SampleProfileLoader::inlineHotFunctions(
     }
     for (CallBase *I : CIS) {
       Function *CalledFunction = I->getCalledFunction();
-      InlineCandidate Candidate = {I,
-                                   LocalNotInlinedCallSites.count(I)
-                                       ? LocalNotInlinedCallSites[I]
-                                       : nullptr,
-                                   0 /* dummy count */};
+      InlineCandidate Candidate = {
+          I,
+          LocalNotInlinedCallSites.count(I) ? LocalNotInlinedCallSites[I]
+                                            : nullptr,
+          0 /* dummy count */, 1.0 /* dummy distribution factor */};
       // Do not inline recursive calls.
       if (CalledFunction == &F)
         continue;
@@ -1229,6 +1260,7 @@ bool SampleProfileLoader::inlineHotFunctions(
           continue;
         uint64_t Sum;
         for (const auto *FS : findIndirectCallFunctionSamples(*I, Sum)) {
+          uint64_t SumOrigin = Sum;
           if (LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink) {
             FS->findInlinedFunctions(InlinedGUIDs, F.getParent(),
                                      PSI->getOrCompHotCountThreshold());
@@ -1237,8 +1269,9 @@ bool SampleProfileLoader::inlineHotFunctions(
           if (!callsiteIsHot(FS, PSI))
             continue;
 
-          Candidate = {I, FS, FS->getEntrySamples()};
-          if (tryPromoteAndInlineCandidate(F, Candidate, Sum, PromotedInsns)) {
+          Candidate = {I, FS, FS->getEntrySamples(), 1.0};
+          if (tryPromoteAndInlineCandidate(F, Candidate, SumOrigin, Sum,
+                                           PromotedInsns)) {
             LocalNotInlinedCallSites.erase(I);
             LocalChanged = true;
           }
@@ -1343,6 +1376,23 @@ bool SampleProfileLoader::tryInlineCandidate(
     if (ProfileIsCS)
       ContextTracker->markContextSamplesInlined(Candidate.CalleeSamples);
     ++NumCSInlined;
+
+    // Prorate inlined probes for a duplicated inlining callsite which probably
+    // has a distribution less than 100%. Samples for an inlinee should be
+    // distributed among the copies of the original callsite based on each
+    // callsite's distribution factor for counts accuracy. Note that an inlined
+    // probe may come with its own distribution factor if it has been duplicated
+    // in the inlinee body. The two factor are multiplied to reflect the
+    // aggregation of duplication.
+    if (Candidate.CallsiteDistribution < 1) {
+      for (auto &I : IFI.InlinedCallSites) {
+        if (Optional<PseudoProbe> Probe = extractProbe(*I))
+          setProbeDistributionFactor(*I, Probe->Factor *
+                                             Candidate.CallsiteDistribution);
+      }
+      NumDuplicatedInlinesite++;
+    }
+
     return true;
   }
   return false;
@@ -1360,14 +1410,19 @@ bool SampleProfileLoader::getInlineCandidate(InlineCandidate *NewCandidate,
   if (!CalleeSamples)
     return false;
 
+  float Factor = 1.0;
+  if (Optional<PseudoProbe> Probe = extractProbe(*CB))
+    Factor = Probe->Factor;
+
   uint64_t CallsiteCount = 0;
   ErrorOr<uint64_t> Weight = getBlockWeight(CB->getParent());
   if (Weight)
     CallsiteCount = Weight.get();
   if (CalleeSamples)
-    CallsiteCount = std::max(CallsiteCount, CalleeSamples->getEntrySamples());
+    CallsiteCount = std::max(
+        CallsiteCount, uint64_t(CalleeSamples->getEntrySamples() * Factor));
 
-  *NewCandidate = {CB, CalleeSamples, CallsiteCount};
+  *NewCandidate = {CB, CalleeSamples, CallsiteCount, Factor};
   return true;
 }
 
@@ -1479,6 +1534,7 @@ bool SampleProfileLoader::inlineHotFunctionsWithPriority(
       uint64_t Sum;
       auto CalleeSamples = findIndirectCallFunctionSamples(*I, Sum);
       uint64_t SumOrigin = Sum;
+      Sum *= Candidate.CallsiteDistribution;
       for (const auto *FS : CalleeSamples) {
         // TODO: Consider disable pre-lTO ICP for MonoLTO as well
         if (LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink) {
@@ -1486,7 +1542,8 @@ bool SampleProfileLoader::inlineHotFunctionsWithPriority(
                                    PSI->getOrCompHotCountThreshold());
           continue;
         }
-        uint64_t EntryCountDistributed = FS->getEntrySamples();
+        uint64_t EntryCountDistributed =
+            FS->getEntrySamples() * Candidate.CallsiteDistribution;
         // In addition to regular inline cost check, we also need to make sure
         // ICP isn't introducing excessive speculative checks even if individual
         // target looks beneficial to promote and inline. That means we should
@@ -1505,9 +1562,10 @@ bool SampleProfileLoader::inlineHotFunctionsWithPriority(
         SmallVector<CallBase *, 8> InlinedCallSites;
         // Attach function profile for promoted indirect callee, and update
         // call site count for the promoted inline candidate too.
-        Candidate = {I, FS, EntryCountDistributed};
-        if (tryPromoteAndInlineCandidate(F, Candidate, Sum, PromotedInsns,
-                                         &InlinedCallSites)) {
+        Candidate = {I, FS, EntryCountDistributed,
+                     Candidate.CallsiteDistribution};
+        if (tryPromoteAndInlineCandidate(F, Candidate, SumOrigin, Sum,
+                                         PromotedInsns, &InlinedCallSites)) {
           for (auto *CB : InlinedCallSites) {
             if (getInlineCandidate(&NewCandidate, CB))
               CQueue.emplace(NewCandidate);
@@ -1965,6 +2023,14 @@ void SampleProfileLoader::propagateWeights(Function &F) {
           auto T = FS->findCallTargetMapAt(CallSite);
           if (!T || T.get().empty())
             continue;
+          // Prorate the callsite counts to reflect what is already done to the
+          // callsite, such as ICP or calliste cloning.
+          if (FunctionSamples::ProfileIsProbeBased) {
+            if (Optional<PseudoProbe> Probe = extractProbe(I)) {
+              if (Probe->Factor < 1)
+                T = SampleRecord::adjustCallTargets(T.get(), Probe->Factor);
+            }
+          }
           SmallVector<InstrProfValueData, 2> SortedCallTargets =
               GetSortedValueDataFromCallTargets(T.get());
           uint64_t Sum;
diff --git a/llvm/lib/Transforms/IPO/SampleProfileProbe.cpp b/llvm/lib/Transforms/IPO/SampleProfileProbe.cpp
index 7cecd20b78d8..a885c3ee4ded 100644
--- a/llvm/lib/Transforms/IPO/SampleProfileProbe.cpp
+++ b/llvm/lib/Transforms/IPO/SampleProfileProbe.cpp
@@ -12,6 +12,7 @@
 
 #include "llvm/Transforms/IPO/SampleProfileProbe.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/CFG.h"
@@ -25,8 +26,10 @@
 #include "llvm/IR/MDBuilder.h"
 #include "llvm/ProfileData/SampleProf.h"
 #include "llvm/Support/CRC.h"
+#include "llvm/Support/CommandLine.h"
 #include "llvm/Transforms/Instrumentation.h"
 #include "llvm/Transforms/Utils/ModuleUtils.h"
+#include <unordered_set>
 #include <vector>
 
 using namespace llvm;
@@ -35,6 +38,115 @@ using namespace llvm;
 STATISTIC(ArtificialDbgLine,
           "Number of probes that have an artificial debug line");
 
+static cl::opt<bool>
+    VerifyPseudoProbe("verify-pseudo-probe", cl::init(false), cl::Hidden,
+                      cl::desc("Do pseudo probe verification"));
+
+static cl::list<std::string> VerifyPseudoProbeFuncList(
+    "verify-pseudo-probe-funcs", cl::Hidden,
+    cl::desc("The option to specify the name of the functions to verify."));
+
+static cl::opt<bool>
+    UpdatePseudoProbe("update-pseudo-probe", cl::init(true), cl::Hidden,
+                      cl::desc("Update pseudo probe distribution factor"));
+
+bool PseudoProbeVerifier::shouldVerifyFunction(const Function *F) {
+  // Skip function declaration.
+  if (F->isDeclaration())
+    return false;
+  // Skip function that will not be emitted into object file. The prevailing
+  // defintion will be verified instead.
+  if (F->hasAvailableExternallyLinkage())
+    return false;
+  // Do a name matching.
+  static std::unordered_set<std::string> VerifyFuncNames(
+      VerifyPseudoProbeFuncList.begin(), VerifyPseudoProbeFuncList.end());
+  return VerifyFuncNames.empty() || VerifyFuncNames.count(F->getName().str());
+}
+
+void PseudoProbeVerifier::registerCallbacks(PassInstrumentationCallbacks &PIC) {
+  if (VerifyPseudoProbe) {
+    PIC.registerAfterPassCallback(
+        [this](StringRef P, Any IR, const PreservedAnalyses &) {
+          this->runAfterPass(P, IR);
+        });
+  }
+}
+
+// Callback to run after each transformation for the new pass manager.
+void PseudoProbeVerifier::runAfterPass(StringRef PassID, Any IR) {
+  std::string Banner =
+      "\n*** Pseudo Probe Verification After " + PassID.str() + " ***\n";
+  dbgs() << Banner;
+  if (any_isa<const Module *>(IR))
+    runAfterPass(any_cast<const Module *>(IR));
+  else if (any_isa<const Function *>(IR))
+    runAfterPass(any_cast<const Function *>(IR));
+  else if (any_isa<const LazyCallGraph::SCC *>(IR))
+    runAfterPass(any_cast<const LazyCallGraph::SCC *>(IR));
+  else if (any_isa<const Loop *>(IR))
+    runAfterPass(any_cast<const Loop *>(IR));
+  else
+    llvm_unreachable("Unknown IR unit");
+}
+
+void PseudoProbeVerifier::runAfterPass(const Module *M) {
+  for (const Function &F : *M)
+    runAfterPass(&F);
+}
+
+void PseudoProbeVerifier::runAfterPass(const LazyCallGraph::SCC *C) {
+  for (const LazyCallGraph::Node &N : *C)
+    runAfterPass(&N.getFunction());
+}
+
+void PseudoProbeVerifier::runAfterPass(const Function *F) {
+  if (!shouldVerifyFunction(F))
+    return;
+  ProbeFactorMap ProbeFactors;
+  for (const auto &BB : *F)
+    collectProbeFactors(&BB, ProbeFactors);
+  verifyProbeFactors(F, ProbeFactors);
+}
+
+void PseudoProbeVerifier::runAfterPass(const Loop *L) {
+  const Function *F = L->getHeader()->getParent();
+  runAfterPass(F);
+}
+
+void PseudoProbeVerifier::collectProbeFactors(const BasicBlock *Block,
+                                              ProbeFactorMap &ProbeFactors) {
+  for (const auto &I : *Block) {
+    if (Optional<PseudoProbe> Probe = extractProbe(I))
+      ProbeFactors[Probe->Id] += Probe->Factor;
+  }
+}
+
+void PseudoProbeVerifier::verifyProbeFactors(
+    const Function *F, const ProbeFactorMap &ProbeFactors) {
+  bool BannerPrinted = false;
+  auto &PrevProbeFactors = FunctionProbeFactors[F->getName()];
+  for (const auto &I : ProbeFactors) {
+    float CurProbeFactor = I.second;
+    if (PrevProbeFactors.count(I.first)) {
+      float PrevProbeFactor = PrevProbeFactors[I.first];
+      if (std::abs(CurProbeFactor - PrevProbeFactor) >
+          DistributionFactorVariance) {
+        if (!BannerPrinted) {
+          dbgs() << "Function " << F->getName() << ":\n";
+          BannerPrinted = true;
+        }
+        dbgs() << "Probe " << I.first << "\tprevious factor "
+               << format("%0.2f", PrevProbeFactor) << "\tcurrent factor "
+               << format("%0.2f", CurProbeFactor) << "\n";
+      }
+    }
+
+    // Update
+    PrevProbeFactors[I.first] = I.second;
+  }
+}
+
 PseudoProbeManager::PseudoProbeManager(const Module &M) {
   if (NamedMDNode *FuncInfo = M.getNamedMetadata(PseudoProbeDescMetadataName)) {
     for (const auto *Operand : FuncInfo->operands()) {
@@ -201,7 +313,8 @@ void SampleProfileProber::instrumentOneFunc(Function &F, TargetMachine *TM) {
     Function *ProbeFn =
         llvm::Intrinsic::getDeclaration(M, Intrinsic::pseudoprobe);
     Value *Args[] = {Builder.getInt64(Guid), Builder.getInt64(Index),
-                     Builder.getInt32(0)};
+                     Builder.getInt32(0),
+                     Builder.getInt64(PseudoProbeFullDistributionFactor)};
     auto *Probe = Builder.CreateCall(ProbeFn, Args);
     AssignDebugLoc(Probe);
   }
@@ -219,7 +332,8 @@ void SampleProfileProber::instrumentOneFunc(Function &F, TargetMachine *TM) {
     // Levarge the 32-bit discriminator field of debug data to store the ID and
     // type of a callsite probe. This gets rid of the dependency on plumbing a
     // customized metadata through the codegen pipeline.
-    uint32_t V = PseudoProbeDwarfDiscriminator::packProbeData(Index, Type);
+    uint32_t V = PseudoProbeDwarfDiscriminator::packProbeData(
+        Index, Type, 0, PseudoProbeDwarfDiscriminator::FullDistributionFactor);
     if (auto DIL = Call->getDebugLoc()) {
       DIL = DIL->cloneWithDiscriminator(V);
       Call->setDebugLoc(DIL);
@@ -274,3 +388,47 @@ PreservedAnalyses SampleProfileProbePass::run(Module &M,
 
   return PreservedAnalyses::none();
 }
+
+void PseudoProbeUpdatePass::runOnFunction(Function &F,
+                                          FunctionAnalysisManager &FAM) {
+  BlockFrequencyInfo &BFI = FAM.getResult<BlockFrequencyAnalysis>(F);
+  auto BBProfileCount = [&BFI](BasicBlock *BB) {
+    return BFI.getBlockProfileCount(BB)
+               ? BFI.getBlockProfileCount(BB).getValue()
+               : 0;
+  };
+
+  // Collect the sum of execution weight for each probe.
+  ProbeFactorMap ProbeFactors;
+  for (auto &Block : F) {
+    for (auto &I : Block) {
+      if (Optional<PseudoProbe> Probe = extractProbe(I))
+        ProbeFactors[Probe->Id] += BBProfileCount(&Block);
+    }
+  }
+
+  // Fix up over-counted probes.
+  for (auto &Block : F) {
+    for (auto &I : Block) {
+      if (Optional<PseudoProbe> Probe = extractProbe(I)) {
+        float Sum = ProbeFactors[Probe->Id];
+        if (Sum != 0)
+          setProbeDistributionFactor(I, BBProfileCount(&Block) / Sum);
+      }
+    }
+  }
+}
+
+PreservedAnalyses PseudoProbeUpdatePass::run(Module &M,
+                                             ModuleAnalysisManager &AM) {
+  if (UpdatePseudoProbe) {
+    for (auto &F : M) {
+      if (F.isDeclaration())
+        continue;
+      FunctionAnalysisManager &FAM =
+          AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+      runOnFunction(F, FAM);
+    }
+  }
+  return PreservedAnalyses::none();
+}
diff --git a/llvm/test/Transforms/SampleProfile/Inputs/pseudo-probe-update.prof b/llvm/test/Transforms/SampleProfile/Inputs/pseudo-probe-update.prof
new file mode 100644
index 000000000000..62f9bd5992e7
--- /dev/null
+++ b/llvm/test/Transforms/SampleProfile/Inputs/pseudo-probe-update.prof
@@ -0,0 +1,8 @@
+foo:3200:13
+ 1: 13
+ 2: 7
+ 3: 6
+ 4: 13
+ 5: 7
+ 6: 6
+ !CFGChecksum: 844530426352218
diff --git a/llvm/test/Transforms/SampleProfile/pseudo-probe-emit-inline.ll b/llvm/test/Transforms/SampleProfile/pseudo-probe-emit-inline.ll
index 7e3c7e8deda2..4f730ba09a3a 100644
--- a/llvm/test/Transforms/SampleProfile/pseudo-probe-emit-inline.ll
+++ b/llvm/test/Transforms/SampleProfile/pseudo-probe-emit-inline.ll
@@ -11,14 +11,14 @@
 ; RUN: llvm-objdump --section-headers  %t4 | FileCheck %s --check-prefix=CHECK-OBJ
 
 define dso_local void @foo2() !dbg !7 {
-; CHECK-IL:  call void @llvm.pseudoprobe(i64 [[#GUID1:]], i64 1, i32 0), !dbg ![[#]]
+; CHECK-IL:  call void @llvm.pseudoprobe(i64 [[#GUID1:]], i64 1, i32 0, i64 -1), !dbg ![[#]]
 ; CHECK-ASM: .pseudoprobe	[[#GUID1:]] 1 0 0
   ret void, !dbg !10
 }
 
 define dso_local void @foo() #0 !dbg !11 {
-; CHECK-IL:  call void @llvm.pseudoprobe(i64 [[#GUID2:]], i64 1, i32 0), !dbg ![[#]]
-; CHECK-IL:  call void @llvm.pseudoprobe(i64 [[#GUID1]], i64 1, i32 0), !dbg ![[#DL1:]]
+; CHECK-IL:  call void @llvm.pseudoprobe(i64 [[#GUID2:]], i64 1, i32 0, i64 -1), !dbg ![[#]]
+; CHECK-IL:  call void @llvm.pseudoprobe(i64 [[#GUID1]], i64 1, i32 0, i64 -1), !dbg ![[#DL1:]]
 ; CHECK-ASM: .pseudoprobe	[[#GUID2:]] 1 0 0
 ; CHECK-ASM: .pseudoprobe	[[#GUID1]] 1 0 0 @ [[#GUID2]]:2
   call void @foo2(), !dbg !12
@@ -26,9 +26,9 @@ define dso_local void @foo() #0 !dbg !11 {
 }
 
 define dso_local i32 @entry() !dbg !14 {
-; CHECK-IL:  call void @llvm.pseudoprobe(i64 [[#GUID3:]], i64 1, i32 0), !dbg ![[#]]
-; CHECK-IL:  call void @llvm.pseudoprobe(i64 [[#GUID2]], i64 1, i32 0), !dbg ![[#DL2:]]
-; CHECK-IL:  call void @llvm.pseudoprobe(i64 [[#GUID1]], i64 1, i32 0), !dbg ![[#DL3:]]
+; CHECK-IL:  call void @llvm.pseudoprobe(i64 [[#GUID3:]], i64 1, i32 0, i64 -1), !dbg ![[#]]
+; CHECK-IL:  call void @llvm.pseudoprobe(i64 [[#GUID2]], i64 1, i32 0, i64 -1), !dbg ![[#DL2:]]
+; CHECK-IL:  call void @llvm.pseudoprobe(i64 [[#GUID1]], i64 1, i32 0, i64 -1), !dbg ![[#DL3:]]
 ; CHECK-ASM: .pseudoprobe	[[#GUID3:]] 1 0 0
 ; CHECK-ASM: .pseudoprobe	[[#GUID2]] 1 0 0 @ [[#GUID3]]:2
 ; CHECK-ASM: .pseudoprobe	[[#GUID1]] 1 0 0 @ [[#GUID3]]:2 @ [[#GUID2]]:2
@@ -41,13 +41,13 @@ define dso_local i32 @entry() !dbg !14 {
 ; CHECK-IL: ![[#SCOPE2:]] = distinct !DISubprogram(name: "foo"
 ; CHECK-IL: ![[#DL1]] = !DILocation(line: 3, column: 1,  scope: ![[#SCOPE1]], inlinedAt: ![[#INL1:]])
 ; CHECK-IL: ![[#INL1]] = distinct !DILocation(line: 7, column: 3, scope: ![[#BL1:]])
-;; A discriminator of 134217751 which is 0x8000017 in hexdecimal, stands for a direct call probe
-;; with an index of 2.
-; CHECK-IL: ![[#BL1]] = !DILexicalBlockFile(scope: ![[#SCOPE2]], file: !1, discriminator: 134217751)
+;; A discriminator of 186646551 which is 0xb200017 in hexdecimal, stands for a direct call probe
+;; with an index of 2 and a scale of 100%.
+; CHECK-IL: ![[#BL1]] = !DILexicalBlockFile(scope: ![[#SCOPE2]], file: !1, discriminator: 186646551)
 ; CHECK-IL: ![[#SCOPE3:]] = distinct !DISubprogram(name: "entry"
 ; CHECK-IL: ![[#DL2]] = !DILocation(line: 7, column: 3,  scope: ![[#SCOPE2]], inlinedAt: ![[#INL2:]])
 ; CHECK-IL: ![[#INL2]] = distinct !DILocation(line: 11, column: 3, scope: ![[#BL2:]])
-; CHECK-IL: ![[#BL2]] = !DILexicalBlockFile(scope: ![[#SCOPE3]], file: !1, discriminator: 134217751)
+; CHECK-IL: ![[#BL2]] = !DILexicalBlockFile(scope: ![[#SCOPE3]], file: !1, discriminator: 186646551)
 ; CHECK-IL: ![[#DL3]] = !DILocation(line: 3, column: 1,  scope: ![[#SCOPE1]], inlinedAt: ![[#INL3:]])
 ; CHECK-IL: ![[#INL3]] = distinct !DILocation(line: 7, column: 3,  scope: ![[#BL1]], inlinedAt: ![[#INL2]])
 
diff --git a/llvm/test/Transforms/SampleProfile/pseudo-probe-emit.ll b/llvm/test/Transforms/SampleProfile/pseudo-probe-emit.ll
index 2074b708380f..da5d46a32287 100644
--- a/llvm/test/Transforms/SampleProfile/pseudo-probe-emit.ll
+++ b/llvm/test/Transforms/SampleProfile/pseudo-probe-emit.ll
@@ -11,32 +11,36 @@
 
 ;; Check the generation of pseudoprobe intrinsic call.
 
+@a = dso_local global i32 0, align 4
+
 define void @foo(i32 %x) !dbg !3 {
 bb0:
   %cmp = icmp eq i32 %x, 0
-; CHECK-IL: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 1, i32 0), !dbg ![[#FAKELINE:]]
+; CHECK-IL: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 1, i32 0, i64 -1), !dbg ![[#FAKELINE:]]
 ; CHECK-MIR: PSEUDO_PROBE [[#GUID:]], 1, 0, 0
 ; CHECK-ASM: .pseudoprobe	[[#GUID:]] 1 0 0
   br i1 %cmp, label %bb1, label %bb2
 
 bb1:
-; CHECK-IL: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 2, i32 0), !dbg ![[#FAKELINE]]
+; CHECK-IL: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 2, i32 0, i64 -1), !dbg ![[#FAKELINE]]
 ; CHECK-MIR: PSEUDO_PROBE [[#GUID]], 3, 0, 0
 ; CHECK-MIR: PSEUDO_PROBE [[#GUID]], 4, 0, 0
 ; CHECK-ASM: .pseudoprobe	[[#GUID]] 3 0 0
 ; CHECK-ASM: .pseudoprobe	[[#GUID]] 4 0 0
+  store i32 6, i32* @a, align 4
   br label %bb3
 
 bb2:
-; CHECK-IL: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 3, i32 0), !dbg ![[#FAKELINE]]
+; CHECK-IL: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 3, i32 0, i64 -1), !dbg ![[#FAKELINE]]
 ; CHECK-MIR: PSEUDO_PROBE [[#GUID]], 2, 0, 0
 ; CHECK-MIR: PSEUDO_PROBE [[#GUID]], 4, 0, 0
 ; CHECK-ASM: .pseudoprobe	[[#GUID]] 2 0 0
 ; CHECK-ASM: .pseudoprobe	[[#GUID]] 4 0 0
+  store i32 8, i32* @a, align 4
   br label %bb3
 
 bb3:
-; CHECK-IL: call void @llvm.pseudoprobe(i64 [[#GUID]], i64 4, i32 0), !dbg ![[#REALLINE:]]
+; CHECK-IL: call void @llvm.pseudoprobe(i64 [[#GUID]], i64 4, i32 0, i64 -1), !dbg ![[#REALLINE:]]
   ret void, !dbg !12
 }
 
@@ -44,7 +48,7 @@ declare void @bar(i32 %x)
 
 define internal void @foo2(void (i32)* %f) !dbg !4 {
 entry:
-; CHECK-IL: call void @llvm.pseudoprobe(i64 [[#GUID2:]], i64 1, i32 0)
+; CHECK-IL: call void @llvm.pseudoprobe(i64 [[#GUID2:]], i64 1, i32 0, i64 -1)
 ; CHECK-MIR: PSEUDO_PROBE [[#GUID2:]], 1, 0, 0
 ; CHECK-ASM: .pseudoprobe	[[#GUID2:]] 1 0 0
 ; Check pseudo_probe metadata attached to the indirect call instruction.
@@ -64,13 +68,13 @@ entry:
 ; CHECK-IL: ![[#FAKELINE]] = !DILocation(line: 0, scope: ![[#FOO]])
 ; CHECK-IL: ![[#REALLINE]] = !DILocation(line: 2, scope: ![[#FOO]])
 ; CHECK-IL: ![[#PROBE0]] = !DILocation(line: 2, column: 20, scope: ![[#SCOPE0:]])
-;; A discriminator of 67108887 which is 0x4000017 in hexdecimal, stands for a direct call probe
+;; A discriminator of 67108887 which is 0x7200017 in hexdecimal, stands for a direct call probe
 ;; with an index of 2.
-; CHECK-IL: ![[#SCOPE0]] = !DILexicalBlockFile(scope: ![[#]], file: ![[#]], discriminator: 67108887)
+; CHECK-IL: ![[#SCOPE0]] = !DILexicalBlockFile(scope: ![[#]], file: ![[#]], discriminator: 119537687)
 ; CHECK-IL: ![[#PROBE1]] = !DILocation(line: 0, scope: ![[#SCOPE1:]])
-;; A discriminator of 134217759 which is 0x800001f in hexdecimal, stands for a direct call probe
+;; A discriminator of 186646559 which is 0xb20001f in hexdecimal, stands for a direct call probe
 ;; with an index of 3.
-; CHECK-IL: ![[#SCOPE1]] = !DILexicalBlockFile(scope: ![[#]], file: ![[#]], discriminator: 134217759)
+; CHECK-IL: ![[#SCOPE1]] = !DILexicalBlockFile(scope: ![[#]], file: ![[#]], discriminator: 186646559)
 
 ; Check the generation of .pseudo_probe_desc section
 ; CHECK-ASM: .section .pseudo_probe_desc,"G",@progbits,.pseudo_probe_desc_foo,comdat
diff --git a/llvm/test/Transforms/SampleProfile/pseudo-probe-inline.ll b/llvm/test/Transforms/SampleProfile/pseudo-probe-inline.ll
index 5359fd4da067..055d41792290 100644
--- a/llvm/test/Transforms/SampleProfile/pseudo-probe-inline.ll
+++ b/llvm/test/Transforms/SampleProfile/pseudo-probe-inline.ll
@@ -12,18 +12,18 @@ target triple = "x86_64-unknown-linux-gnu"
 
 define dso_local i32 @foo(i32 %x) #0 !dbg !12 {
 entry:
-; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID1:]], i64 1, i32 0)
+; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID1:]], i64 1, i32 0, i64 -1)
   %add = add nsw i32 %x, 100000, !dbg !19
 ;; Check zen is fully inlined so there's no call to zen anymore.
 ;; Check code from the inlining of zen is properly annotated here.
-; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID2:]], i64 1, i32 0)
+; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID2:]], i64 1, i32 0, i64 -1)
 ; CHECK: br i1 %cmp.i, label %while.cond.i, label %while.cond2.i, !dbg ![[#]], !prof ![[PD1:[0-9]+]]
-; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID2]], i64 2, i32 0)
+; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID2]], i64 2, i32 0, i64 -1)
 ; CHECK: br i1 %cmp1.i, label %while.body.i, label %zen.exit, !dbg ![[#]], !prof ![[PD2:[0-9]+]]
-; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID2]], i64 3, i32 0)
-; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID2]], i64 4, i32 0)
-; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID2]], i64 5, i32 0)
-; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID2]], i64 6, i32 0)
+; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID2]], i64 3, i32 0, i64 -1)
+; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID2]], i64 4, i32 0, i64 -1)
+; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID2]], i64 5, i32 0, i64 -1)
+; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID2]], i64 6, i32 0, i64 -1)
 ; CHECK-NOT: call i32 @zen
   %call = call i32 @zen(i32 %add), !dbg !20
   ret i32 %call, !dbg !21
@@ -32,36 +32,36 @@ entry:
 ; CHECK: define dso_local i32 @zen
 define dso_local i32 @zen(i32 %x) #0 !dbg !22 {
 entry:
-; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID2]], i64 1, i32 0)
+; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID2]], i64 1, i32 0, i64 -1)
   %cmp = icmp sgt i32 %x, 0, !dbg !26
   br i1 %cmp, label %while.cond, label %while.cond2, !dbg !28
 
 while.cond:
-; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID2]], i64 2, i32 0)
+; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID2]], i64 2, i32 0, i64 -1)
   %x.addr.0 = phi i32 [ %x, %entry ], [ %sub, %while.body ]
   %cmp1 = icmp sgt i32 %x.addr.0, 0, !dbg !29
   br i1 %cmp1, label %while.body, label %if.end, !dbg !31
 
 while.body:
-; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID2]], i64 3, i32 0)
+; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID2]], i64 3, i32 0, i64 -1)
   %0 = load volatile i32, i32* @factor, align 4, !dbg !32
   %sub = sub nsw i32 %x.addr.0, %0, !dbg !39
   br label %while.cond, !dbg !31
 
 while.cond2:
-; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID2]], i64 4, i32 0)
+; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID2]], i64 4, i32 0, i64 -1)
   %x.addr.1 = phi i32 [ %x, %entry ], [ %add, %while.body4 ]
   %cmp3 = icmp slt i32 %x.addr.1, 0, !dbg !42
   br i1 %cmp3, label %while.body4, label %if.end, !dbg !44
 
 while.body4:
-; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID2]], i64 5, i32 0)
+; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID2]], i64 5, i32 0, i64 -1)
   %1 = load volatile i32, i32* @factor, align 4, !dbg !45
   %add = add nsw i32 %x.addr.1, %1, !dbg !48
   br label %while.cond2, !dbg !44
 
 if.end:
-; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID2]], i64 6, i32 0)
+; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID2]], i64 6, i32 0, i64 -1)
   %x.addr.2 = phi i32 [ %x.addr.0, %while.cond ], [ %x.addr.1, %while.cond2 ]
   ret i32 %x.addr.2, !dbg !51
 }
@@ -109,6 +109,10 @@ if.end:
 ;YAML-NEXT:    - NumSamples:      '23'
 ;YAML-NEXT:    - String:          ' samples from profile (ProbeId='
 ;YAML-NEXT:    - ProbeId:         '1'
+;YAML-NEXT:    - String:          ', Factor='
+;YAML-NEXT:    - Factor:          '1.000000e+00'
+;YAML-NEXT:    - String:          ', OriginalSamples='
+;YAML-NEXT:    - OriginalSamples: '23'
 ;YAML-NEXT:    - String:          ')'
 ;YAML-NEXT:  ...
 ;YAML:  --- !Analysis
@@ -121,6 +125,10 @@ if.end:
 ;YAML-NEXT:    - NumSamples:      '23'
 ;YAML-NEXT:    - String:          ' samples from profile (ProbeId='
 ;YAML-NEXT:    - ProbeId:         '1'
+;YAML-NEXT:    - String:          ', Factor='
+;YAML-NEXT:    - Factor:          '1.000000e+00'
+;YAML-NEXT:    - String:          ', OriginalSamples='
+;YAML-NEXT:    - OriginalSamples: '23'
 ;YAML-NEXT:    - String:          ')'
 ;YAML-NEXT:  ...
 ;YAML:  --- !Analysis
@@ -133,6 +141,10 @@ if.end:
 ;YAML-NEXT:    - NumSamples:      '382920'
 ;YAML-NEXT:    - String:          ' samples from profile (ProbeId='
 ;YAML-NEXT:    - ProbeId:         '2'
+;YAML-NEXT:    - String:          ', Factor='
+;YAML-NEXT:    - Factor:          '1.000000e+00'
+;YAML-NEXT:    - String:          ', OriginalSamples='
+;YAML-NEXT:    - OriginalSamples: '382920'
 ;YAML-NEXT:    - String:          ')'
 ;YAML-NEXT:  ...
 
diff --git a/llvm/test/Transforms/SampleProfile/pseudo-probe-profile.ll b/llvm/test/Transforms/SampleProfile/pseudo-probe-profile.ll
index 25fd04e9d710..34629a3743eb 100644
--- a/llvm/test/Transforms/SampleProfile/pseudo-probe-profile.ll
+++ b/llvm/test/Transforms/SampleProfile/pseudo-probe-profile.ll
@@ -8,26 +8,26 @@ entry:
   store i32 %x, i32* %x.addr, align 4
   %0 = load i32, i32* %x.addr, align 4
   %cmp = icmp eq i32 %0, 0
-  ; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 1, i32 0)
+  ; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 1, i32 0, i64 -1)
   br i1 %cmp, label %if.then, label %if.else
   ; CHECK: br i1 %cmp, label %if.then, label %if.else, !prof ![[PD1:[0-9]+]]
 
 if.then:
   ; CHECK: call {{.*}}, !dbg ![[#PROBE1:]], !prof ![[PROF1:[0-9]+]]
   call void %f(i32 1)
-  ; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 2, i32 0)
+  ; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 2, i32 0, i64 -1)
   store i32 1, i32* %retval, align 4
   br label %return
 
 if.else:
   ; CHECK: call {{.*}}, !dbg ![[#PROBE2:]], !prof ![[PROF2:[0-9]+]]
   call void %f(i32 2)
-  ; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 3, i32 0)
+  ; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 3, i32 0, i64 -1)
   store i32 2, i32* %retval, align 4
   br label %return
 
 return:
-  ; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 4, i32 0)
+  ; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 4, i32 0, i64 -1)
   %1 = load i32, i32* %retval, align 4
   ret i32 %1
 }
@@ -36,14 +36,14 @@ attributes #0 = {"use-sample-profile"}
 
 ; CHECK: ![[PD1]] = !{!"branch_weights", i32 8, i32 7}
 ; CHECK: ![[#PROBE1]] = !DILocation(line: 0, scope: ![[#SCOPE1:]])
-;; A discriminator of 119537711 which is 0x400002f in hexdecimal, stands for an indirect call probe
+;; A discriminator of 119537711 which is 0x720002f in hexdecimal, stands for an indirect call probe
 ;; with an index of 5.
-; CHECK: ![[#SCOPE1]] = !DILexicalBlockFile(scope: ![[#]], file: ![[#]], discriminator: 67108911)
+; CHECK: ![[#SCOPE1]] = !DILexicalBlockFile(scope: ![[#]], file: ![[#]], discriminator: 119537711)
 ; CHECK: ![[PROF1]] = !{!"VP", i32 0, i64 7, i64 9191153033785521275, i64 5, i64 -1069303473483922844, i64 2}
-; CHECK: ![[#PROBE2]] = !DILocation(line: 0, scope: ![[#SCOPE2:]])
-;; A discriminator of 119537719 which is 0x4000037 in hexdecimal, stands for an indirect call probe
+;; A discriminator of 119537719 which is 0x7200037 in hexdecimal, stands for an indirect call probe
 ;; with an index of 6.
-; CHECK: ![[#SCOPE2]] = !DILexicalBlockFile(scope: ![[#]], file: ![[#]], discriminator: 67108919)
+; CHECK: ![[#PROBE2]] = !DILocation(line: 0, scope: ![[#SCOPE2:]])
+; CHECK: ![[#SCOPE2]] = !DILexicalBlockFile(scope: ![[#]], file: ![[#]], discriminator: 119537719)
 ; CHECK: ![[PROF2]] = !{!"VP", i32 0, i64 6, i64 -1069303473483922844, i64 4, i64 9191153033785521275, i64 2}
 
 !llvm.module.flags = !{!9, !10}
@@ -69,6 +69,10 @@ attributes #0 = {"use-sample-profile"}
 ;YAML-NEXT:    - NumSamples:      '13'
 ;YAML-NEXT:    - String:          ' samples from profile (ProbeId='
 ;YAML-NEXT:    - ProbeId:         '1'
+;YAML-NEXT:    - String:          ', Factor='
+;YAML-NEXT:    - Factor:          '1.000000e+00'
+;YAML-NEXT:    - String:          ', OriginalSamples='
+;YAML-NEXT:    - OriginalSamples: '13'
 ;YAML-NEXT:    - String:          ')'
 ;YAML:  --- !Analysis
 ;YAML-NEXT:  Pass:            sample-profile
@@ -80,6 +84,10 @@ attributes #0 = {"use-sample-profile"}
 ;YAML-NEXT:    - NumSamples:      '7'
 ;YAML-NEXT:    - String:          ' samples from profile (ProbeId='
 ;YAML-NEXT:    - ProbeId:         '5'
+;YAML-NEXT:    - String:          ', Factor='
+;YAML-NEXT:    - Factor:          '1.000000e+00'
+;YAML-NEXT:    - String:          ', OriginalSamples='
+;YAML-NEXT:    - OriginalSamples: '7'
 ;YAML-NEXT:    - String:          ')'
 ;YAML:  --- !Analysis
 ;YAML-NEXT:  Pass:            sample-profile
@@ -91,6 +99,10 @@ attributes #0 = {"use-sample-profile"}
 ;YAML-NEXT:    - NumSamples:      '7'
 ;YAML-NEXT:    - String:          ' samples from profile (ProbeId='
 ;YAML-NEXT:    - ProbeId:         '2'
+;YAML-NEXT:    - String:          ', Factor='
+;YAML-NEXT:    - Factor:          '1.000000e+00'
+;YAML-NEXT:    - String:          ', OriginalSamples='
+;YAML-NEXT:    - OriginalSamples: '7'
 ;YAML-NEXT:    - String:          ')'
 ;YAML:  --- !Analysis
 ;YAML-NEXT:  Pass:            sample-profile
@@ -102,6 +114,10 @@ attributes #0 = {"use-sample-profile"}
 ;YAML-NEXT:    - NumSamples:      '6'
 ;YAML-NEXT:    - String:          ' samples from profile (ProbeId='
 ;YAML-NEXT:    - ProbeId:         '6'
+;YAML-NEXT:    - String:          ', Factor='
+;YAML-NEXT:    - Factor:          '1.000000e+00'
+;YAML-NEXT:    - String:          ', OriginalSamples='
+;YAML-NEXT:    - OriginalSamples: '6'
 ;YAML-NEXT:    - String:          ')'
 ;YAML:  --- !Analysis
 ;YAML-NEXT:  Pass:            sample-profile
@@ -113,6 +129,10 @@ attributes #0 = {"use-sample-profile"}
 ;YAML-NEXT:    - NumSamples:      '6'
 ;YAML-NEXT:    - String:          ' samples from profile (ProbeId='
 ;YAML-NEXT:    - ProbeId:         '3'
+;YAML-NEXT:    - String:          ', Factor='
+;YAML-NEXT:    - Factor:          '1.000000e+00'
+;YAML-NEXT:    - String:          ', OriginalSamples='
+;YAML-NEXT:    - OriginalSamples: '6'
 ;YAML-NEXT:    - String:          ')'
 ;YAML:  --- !Analysis
 ;YAML-NEXT:  Pass:            sample-profile
@@ -124,4 +144,8 @@ attributes #0 = {"use-sample-profile"}
 ;YAML-NEXT:    - NumSamples:      '13'
 ;YAML-NEXT:    - String:          ' samples from profile (ProbeId='
 ;YAML-NEXT:    - ProbeId:         '4'
+;YAML-NEXT:    - String:          ', Factor='
+;YAML-NEXT:    - Factor:          '1.000000e+00'
+;YAML-NEXT:    - String:          ', OriginalSamples='
+;YAML-NEXT:    - OriginalSamples: '13'
 ;YAML-NEXT:    - String:          ')'
diff --git a/llvm/test/Transforms/SampleProfile/pseudo-probe-update.ll b/llvm/test/Transforms/SampleProfile/pseudo-probe-update.ll
new file mode 100644
index 000000000000..992afedd14f7
--- /dev/null
+++ b/llvm/test/Transforms/SampleProfile/pseudo-probe-update.ll
@@ -0,0 +1,45 @@
+; RUN: opt < %s -passes='pseudo-probe,sample-profile,jump-threading,pseudo-probe-update' -sample-profile-file=%S/Inputs/pseudo-probe-update.prof -S  | FileCheck %s
+
+declare i32 @f1()
+declare i32 @f2()
+declare void @f3()
+
+
+;; This tests that the branch in 'merge' can be cloned up into T1.
+define i32 @foo(i1 %cond, i1 %cond2) #0 {
+; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 1, i32 0, i64 -1)
+	br i1 %cond, label %T1, label %F1
+T1:
+; CHECK: %v1 = call i32 @f1(), !prof ![[#PROF1:]]
+	%v1 = call i32 @f1()
+; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 2, i32 0, i64 -1)
+;; The distribution factor -8513881372706734080 stands for 53.85%, whic is from 7/6+7.
+; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 4, i32 0, i64 -8513881372706734080)
+    %cond3 = icmp eq i32 %v1, 412
+	br label %Merge
+F1:
+; CHECK: %v2 = call i32 @f2(), !prof ![[#PROF2:]]
+	%v2 = call i32 @f2()
+; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 3, i32 0, i64 -1)
+;; The distribution factor 8513881922462547968 stands for 46.25%, which is from 6/6+7.
+; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 4, i32 0, i64 8513881922462547968)
+	br label %Merge
+Merge:
+
+	%A = phi i1 [%cond3, %T1], [%cond2, %F1]
+	%B = phi i32 [%v1, %T1], [%v2, %F1]
+	br i1 %A, label %T2, label %F2
+T2:
+; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 5, i32 0, i64 -1)
+	call void @f3()
+	ret i32 %B
+F2:
+; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 6, i32 0, i64 -1)
+	ret i32 %B
+}
+
+; CHECK: ![[#PROF1]] = !{!"branch_weights", i32 7}
+; CHECK: ![[#PROF2]] = !{!"branch_weights", i32 6}
+
+attributes #0 = {"use-sample-profile"}
+
diff --git a/llvm/test/Transforms/SampleProfile/pseudo-probe-verify.ll b/llvm/test/Transforms/SampleProfile/pseudo-probe-verify.ll
new file mode 100644
index 000000000000..fd57dd8bc526
--- /dev/null
+++ b/llvm/test/Transforms/SampleProfile/pseudo-probe-verify.ll
@@ -0,0 +1,77 @@
+; REQUIRES: x86_64-linux
+; RUN: opt < %s -passes='pseudo-probe,loop-unroll-full' -verify-pseudo-probe -S -o %t 2>&1 | FileCheck %s --check-prefix=VERIFY
+; RUN: FileCheck %s < %t
+
+; VERIFY: *** Pseudo Probe Verification After LoopFullUnrollPass ***
+; VERIFY: Function foo:
+; VERIFY-DAG: Probe 6	previous factor 1.00	current factor 5.00
+; VERIFY-DAG: Probe 4	previous factor 1.00	current factor 5.00
+
+declare void @foo2() nounwind
+
+define void @foo(i32 %x) {
+bb:
+; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 1, i32 0, i64 -1)
+  %tmp = alloca [5 x i32*], align 16
+  br label %bb7.preheader
+
+bb3.loopexit:
+  %spec.select.lcssa = phi i32 [ %spec.select, %bb10 ]
+  %tmp5.not = icmp eq i32 %spec.select.lcssa, 0
+  br i1 %tmp5.not, label %bb24, label %bb7.preheader
+
+bb7.preheader:
+; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 3, i32 0, i64 -1)
+  %tmp1.06 = phi i32 [ 5, %bb ], [ %spec.select.lcssa, %bb3.loopexit ]
+  br label %bb10
+
+bb10:
+; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 4, i32 0, i64 -1)
+; CHECK: call void @foo2(), !dbg ![[#PROBE6:]] 
+; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 4, i32 0, i64 -1)
+; CHECK: call void @foo2(), !dbg ![[#PROBE6:]] 
+; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 4, i32 0, i64 -1)
+; CHECK: call void @foo2(), !dbg ![[#PROBE6:]] 
+; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 4, i32 0, i64 -1)
+; CHECK: call void @foo2(), !dbg ![[#PROBE6:]] 
+; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 4, i32 0, i64 -1)
+; CHECK: call void @foo2(), !dbg ![[#PROBE6:]] 
+; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 2, i32 0, i64 -1)
+  %indvars.iv = phi i64 [ 0, %bb7.preheader ], [ %indvars.iv.next, %bb10 ]
+  %tmp1.14 = phi i32 [ %tmp1.06, %bb7.preheader ], [ %spec.select, %bb10 ]
+  %tmp13 = getelementptr inbounds [5 x i32*], [5 x i32*]* %tmp, i64 0, i64 %indvars.iv
+  %tmp14 = load i32*, i32** %tmp13, align 8
+  %tmp15.not = icmp ne i32* %tmp14, null
+  %tmp18 = sext i1 %tmp15.not to i32
+  %spec.select = add nsw i32 %tmp1.14, %tmp18
+  call void @foo2(), !dbg !12
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, 5
+  br i1 %exitcond.not, label %bb3.loopexit, label %bb10, !llvm.loop !13
+
+bb24:
+; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 5, i32 0, i64 -1)
+  ret void
+}
+
+;; A discriminator of 186646583 which is 0xb200037 in hexdecimal, stands for a direct call probe
+;; with an index of 6 and a scale of -1%.
+; CHECK: ![[#PROBE6]] = !DILocation(line: 2, column: 20, scope: ![[#SCOPE:]])
+; CHECK: ![[#SCOPE]] = !DILexicalBlockFile(scope: ![[#]], file: ![[#]], discriminator: 186646583)
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!9, !10}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.9.0", isOptimized: false, runtimeVersion: 0, emissionKind: 1, enums: !2)
+!1 = !DIFile(filename: "test.c", directory: "")
+!2 = !{}
+!4 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 2, type: !5, isLocal: false, isDefinition: true, scopeLine: 2, isOptimized: false, unit: !0, retainedNodes: !2)
+!5 = !DISubroutineType(types: !6)
+!6 = !{!7}
+!7 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
+!9 = !{i32 2, !"Dwarf Version", i32 4}
+!10 = !{i32 2, !"Debug Info Version", i32 3}
+!11 = !{!"clang version 3.9.0"}
+!12 = !DILocation(line: 2, column: 20, scope: !4)
+!13 = distinct !{!13, !14}
+!14 = !{!"llvm.loop.unroll.full"}

From ad2086658df181369a09ad69dac260a41dbab814 Mon Sep 17 00:00:00 2001
From: Shilei Tian <tianshilei1992@gmail.com>
Date: Wed, 3 Feb 2021 20:57:59 -0500
Subject: [PATCH 058/318] [OpenMP][NVPTX] Take functions in `deviceRTLs` as
 `convergent`

OpenMP device compiler (similar to other SPMD compilers) assumes that
functions are convergent by default to avoid invalid transformations, such as
the bug (https://bugs.llvm.org/show_bug.cgi?id=49021).

Reviewed By: jdoerfert

Differential Revision: https://reviews.llvm.org/D95971

(cherry picked from commit 0f0ce3c12edefd25448e39c4d20718a10d3d42c1)
---
 clang/lib/Frontend/CompilerInvocation.cpp     |  2 +
 .../OpenMP/target_attribute_convergent.cpp    | 13 +++
 .../libomptarget/test/offloading/bug49021.cpp | 85 +++++++++++++++++++
 3 files changed, 100 insertions(+)
 create mode 100644 clang/test/OpenMP/target_attribute_convergent.cpp
 create mode 100644 openmp/libomptarget/test/offloading/bug49021.cpp

diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp
index d8be4ea14868..036388ebd355 100644
--- a/clang/lib/Frontend/CompilerInvocation.cpp
+++ b/clang/lib/Frontend/CompilerInvocation.cpp
@@ -2470,6 +2470,8 @@ void CompilerInvocation::ParseLangArgs(LangOptions &Opts, ArgList &Args,
   bool IsTargetSpecified =
       Opts.OpenMPIsDevice || Args.hasArg(options::OPT_fopenmp_targets_EQ);
 
+  Opts.ConvergentFunctions = Opts.ConvergentFunctions || Opts.OpenMPIsDevice;
+
   if (Opts.OpenMP || Opts.OpenMPSimd) {
     if (int Version = getLastArgIntValue(
             Args, OPT_fopenmp_version_EQ,
diff --git a/clang/test/OpenMP/target_attribute_convergent.cpp b/clang/test/OpenMP/target_attribute_convergent.cpp
new file mode 100644
index 000000000000..932214e987c8
--- /dev/null
+++ b/clang/test/OpenMP/target_attribute_convergent.cpp
@@ -0,0 +1,13 @@
+// RUN: %clang_cc1 -debug-info-kind=limited -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -o - | FileCheck %s
+// RUN: %clang_cc1 -debug-info-kind=limited -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -o - | FileCheck %s
+// expected-no-diagnostics
+
+#pragma omp declare target
+
+void foo() {}
+
+#pragma omp end declare target
+
+// CHECK: Function Attrs: {{.*}}convergent{{.*}}
+// CHECK: define hidden void @_Z3foov() [[ATTRIBUTE_NUMBER:#[0-9]+]]
+// CHECK: attributes [[ATTRIBUTE_NUMBER]] = { {{.*}}convergent{{.*}} }
diff --git a/openmp/libomptarget/test/offloading/bug49021.cpp b/openmp/libomptarget/test/offloading/bug49021.cpp
new file mode 100644
index 000000000000..bcdbf68b10e0
--- /dev/null
+++ b/openmp/libomptarget/test/offloading/bug49021.cpp
@@ -0,0 +1,85 @@
+// RUN: %libomptarget-compilexx-aarch64-unknown-linux-gnu -O3 && %libomptarget-run-aarch64-unknown-linux-gnu
+// RUN: %libomptarget-compilexx-powerpc64-ibm-linux-gnu -O3 && %libomptarget-run-powerpc64-ibm-linux-gnu
+// RUN: %libomptarget-compilexx-powerpc64le-ibm-linux-gnu -O3 && %libomptarget-run-powerpc64le-ibm-linux-gnu
+// RUN: %libomptarget-compilexx-x86_64-pc-linux-gnu -O3 && %libomptarget-run-x86_64-pc-linux-gnu
+// RUN: %libomptarget-compilexx-nvptx64-nvidia-cuda -O3 && %libomptarget-run-nvptx64-nvidia-cuda
+
+#include <iostream>
+
+template <typename T> int test_map() {
+  std::cout << "map(complex<>)" << std::endl;
+  T a(0.2), a_check;
+#pragma omp target map(from : a_check)
+  { a_check = a; }
+
+  if (a_check != a) {
+    std::cout << " wrong results";
+    return 1;
+  }
+
+  return 0;
+}
+
+template <typename T> int test_reduction() {
+  std::cout << "flat parallelism" << std::endl;
+  T sum(0), sum_host(0);
+  const int size = 100;
+  T array[size];
+  for (int i = 0; i < size; i++) {
+    array[i] = i;
+    sum_host += array[i];
+  }
+
+#pragma omp target teams distribute parallel for map(to: array[:size])         \
+                                                 reduction(+ : sum)
+  for (int i = 0; i < size; i++)
+    sum += array[i];
+
+  if (sum != sum_host)
+    std::cout << " wrong results " << sum << " host " << sum_host << std::endl;
+
+  std::cout << "hierarchical parallelism" << std::endl;
+  const int nblock(10), block_size(10);
+  T block_sum[nblock];
+#pragma omp target teams distribute map(to                                     \
+                                        : array[:size])                        \
+    map(from                                                                   \
+        : block_sum[:nblock])
+  for (int ib = 0; ib < nblock; ib++) {
+    T partial_sum = 0;
+    const int istart = ib * block_size;
+    const int iend = (ib + 1) * block_size;
+#pragma omp parallel for reduction(+ : partial_sum)
+    for (int i = istart; i < iend; i++)
+      partial_sum += array[i];
+    block_sum[ib] = partial_sum;
+  }
+
+  sum = 0;
+  for (int ib = 0; ib < nblock; ib++) {
+    sum += block_sum[ib];
+  }
+
+  if (sum != sum_host) {
+    std::cout << " wrong results " << sum << " host " << sum_host << std::endl;
+    return 1;
+  }
+
+  return 0;
+}
+
+template <typename T> int test_complex() {
+  int ret = 0;
+  ret |= test_map<T>();
+  ret |= test_reduction<T>();
+  return ret;
+}
+
+int main() {
+  int ret = 0;
+  std::cout << "Testing float" << std::endl;
+  ret |= test_complex<float>();
+  std::cout << "Testing double" << std::endl;
+  ret |= test_complex<double>();
+  return ret;
+}

From e8cdcaeae406527c9a76b3dc5c522391c81dfdfd Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@sifive.com>
Date: Mon, 1 Feb 2021 10:56:09 -0800
Subject: [PATCH 059/318] [X86] Accept 64-bit GPRs for vextractps when using a
 register that requires EVEX.

This is consistent with the VEX version. It also fixes a sorting
issue in the matching table that caused the EVEX version to be
prioritized over VEX in intel syntax.

Fixes issue [2] from PR48991.

(cherry picked from commit c691fe14da93a7c9eff466231515d6d4d16124fa)
---
 llvm/lib/Target/X86/X86InstrAVX512.td             | 4 ++--
 llvm/test/MC/X86/intel-syntax-x86-64-avx.s        | 4 ++++
 llvm/test/MC/X86/intel-syntax-x86-64-avx512f_vl.s | 3 +++
 3 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td
index 0c2b278fdd7b..19012797ae9a 100644
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -1123,10 +1123,10 @@ defm : vextract_for_mask_cast<"VEXTRACTI64x4Z", v64i8_info, v32i8x_info,
                               EXTRACT_get_vextract256_imm, [HasAVX512]>;
 
 // vextractps - extract 32 bits from XMM
-def VEXTRACTPSZrr : AVX512AIi8<0x17, MRMDestReg, (outs GR32:$dst),
+def VEXTRACTPSZrr : AVX512AIi8<0x17, MRMDestReg, (outs GR32orGR64:$dst),
       (ins VR128X:$src1, u8imm:$src2),
       "vextractps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
-      [(set GR32:$dst, (extractelt (bc_v4i32 (v4f32 VR128X:$src1)), imm:$src2))]>,
+      [(set GR32orGR64:$dst, (extractelt (bc_v4i32 (v4f32 VR128X:$src1)), imm:$src2))]>,
       EVEX, VEX_WIG, Sched<[WriteVecExtract]>;
 
 def VEXTRACTPSZmr : AVX512AIi8<0x17, MRMDestMem, (outs),
diff --git a/llvm/test/MC/X86/intel-syntax-x86-64-avx.s b/llvm/test/MC/X86/intel-syntax-x86-64-avx.s
index bb57cb287f38..c1f20d204a8c 100644
--- a/llvm/test/MC/X86/intel-syntax-x86-64-avx.s
+++ b/llvm/test/MC/X86/intel-syntax-x86-64-avx.s
@@ -167,3 +167,7 @@
 // CHECK: vpmaddwd ymm1, ymm2, ymmword ptr [rcx + 8*r14 - 536870910]
 // CHECK: encoding: [0xc4,0xa1,0x6d,0xf5,0x8c,0xf1,0x02,0x00,0x00,0xe0]
           vpmaddwd ymm1, ymm2, ymmword ptr [rcx + 8*r14 - 536870910]
+
+// CHECK: vextractps ecx, xmm2, 1
+// CHECK: encoding: [0xc4,0xe3,0x79,0x17,0xd1,0x01]
+          vextractps ecx, xmm2, 1
diff --git a/llvm/test/MC/X86/intel-syntax-x86-64-avx512f_vl.s b/llvm/test/MC/X86/intel-syntax-x86-64-avx512f_vl.s
index 29bde03c5860..31c43afe5017 100644
--- a/llvm/test/MC/X86/intel-syntax-x86-64-avx512f_vl.s
+++ b/llvm/test/MC/X86/intel-syntax-x86-64-avx512f_vl.s
@@ -1260,3 +1260,6 @@
 // CHECK: encoding: [0x62,0xf1,0x7e,0x89,0xe6,0x11]
           vcvtdq2pd xmm2 {k1} {z}, qword ptr [rcx]
 
+// CHECK: vextractps ecx, xmm17, 1
+// CHECK: encoding: [0x62,0xe3,0x7d,0x08,0x17,0xc9,0x01]
+          vextractps rcx, xmm17, 1

From 7fad20eccc4f9fe5d03b2e381e26e8eb13a3e3be Mon Sep 17 00:00:00 2001
From: Shilei Tian <tianshilei1992@gmail.com>
Date: Thu, 4 Feb 2021 08:44:20 -0500
Subject: [PATCH 060/318] Revert "[OpenMP] Disabled profiling in `libomp` by
 default to unblock link errors"

This reverts commit f5602e0bf31ab590da19fa357980a753dbfd666e.
---
 openmp/CMakeLists.txt                 |  6 ------
 openmp/docs/design/Runtimes.rst       |  5 +----
 openmp/runtime/CMakeLists.txt         |  6 +++---
 openmp/runtime/src/CMakeLists.txt     | 12 +-----------
 openmp/runtime/src/kmp_config.h.cmake |  4 ++--
 openmp/runtime/src/kmp_runtime.cpp    |  6 +++---
 6 files changed, 10 insertions(+), 29 deletions(-)

diff --git a/openmp/CMakeLists.txt b/openmp/CMakeLists.txt
index 4787d4b5a321..67600bebdafb 100644
--- a/openmp/CMakeLists.txt
+++ b/openmp/CMakeLists.txt
@@ -86,12 +86,6 @@ option(OPENMP_ENABLE_LIBOMPTARGET "Enable building libomptarget for offloading."
        ${ENABLE_LIBOMPTARGET})
 option(OPENMP_ENABLE_LIBOMPTARGET_PROFILING "Enable time profiling for libomptarget."
        ${ENABLE_LIBOMPTARGET})
-option(OPENMP_ENABLE_LIBOMP_PROFILING "Enable time profiling for libomp." OFF)
-
-# Build host runtime library, after LIBOMPTARGET variables are set since they are needed
-# to enable time profiling support in the OpenMP runtime.
-add_subdirectory(runtime)
-
 if (OPENMP_ENABLE_LIBOMPTARGET)
   # Check that the library can actually be built.
   if (APPLE OR WIN32)
diff --git a/openmp/docs/design/Runtimes.rst b/openmp/docs/design/Runtimes.rst
index ad36e43eccdc..016b88ba324b 100644
--- a/openmp/docs/design/Runtimes.rst
+++ b/openmp/docs/design/Runtimes.rst
@@ -48,10 +48,7 @@ similar to Clang's ``-ftime-trace`` option. This generates a JSON file based on
 `Speedscope App`_. Building this feature depends on the `LLVM Support Library`_
 for time trace output. Using this library is enabled by default when building
 using the CMake option ``OPENMP_ENABLE_LIBOMPTARGET_PROFILING``. The output will
-be saved to the filename specified by the environment variable. For multi-threaded
-applications, profiling in ``libomp`` is also needed. Setting the CMake option
-``OPENMP_ENABLE_LIBOMP_PROFILING=ON`` to enable the feature. Note that this will
-turn ``libomp`` into a C++ library.
+be saved to the filename specified by the environment variable.
 
 .. _`Chrome Tracing`: https://www.chromium.org/developers/how-tos/trace-event-profiling-tool
 
diff --git a/openmp/runtime/CMakeLists.txt b/openmp/runtime/CMakeLists.txt
index 8828ff8ef455..9fdd04f41646 100644
--- a/openmp/runtime/CMakeLists.txt
+++ b/openmp/runtime/CMakeLists.txt
@@ -34,6 +34,7 @@ if(${OPENMP_STANDALONE_BUILD})
   # Should assertions be enabled?  They are on by default.
   set(LIBOMP_ENABLE_ASSERTIONS TRUE CACHE BOOL
     "enable assertions?")
+  set(LIBOMPTARGET_PROFILING_SUPPORT FALSE)
 else() # Part of LLVM build
   # Determine the native architecture from LLVM.
   string(TOLOWER "${LLVM_TARGET_ARCH}" LIBOMP_NATIVE_ARCH)
@@ -65,11 +66,10 @@ else() # Part of LLVM build
     libomp_get_architecture(LIBOMP_ARCH)
   endif ()
   set(LIBOMP_ENABLE_ASSERTIONS ${LLVM_ENABLE_ASSERTIONS})
+  # Time profiling support
+  set(LIBOMPTARGET_PROFILING_SUPPORT ${OPENMP_ENABLE_LIBOMPTARGET_PROFILING})
 endif()
 
-# Time profiling support
-set(LIBOMP_PROFILING_SUPPORT ${OPENMP_ENABLE_LIBOMP_PROFILING})
-
 # FUJITSU A64FX is a special processor because its cache line size is 256.
 # We need to pass this information into kmp_config.h.
 if(LIBOMP_ARCH STREQUAL "aarch64")
diff --git a/openmp/runtime/src/CMakeLists.txt b/openmp/runtime/src/CMakeLists.txt
index 822f9ca2b825..2e927df84f5c 100644
--- a/openmp/runtime/src/CMakeLists.txt
+++ b/openmp/runtime/src/CMakeLists.txt
@@ -50,14 +50,6 @@ if(${LIBOMP_USE_HWLOC})
   include_directories(${LIBOMP_HWLOC_INSTALL_DIR}/include)
 endif()
 
-# Building with time profiling support requires LLVM directory includes.
-if(LIBOMP_PROFILING_SUPPORT)
-  include_directories(
-    ${LLVM_MAIN_INCLUDE_DIR}
-    ${LLVM_INCLUDE_DIR}
-  )
-endif()
-
 # Getting correct source files to build library
 set(LIBOMP_CXXFILES)
 set(LIBOMP_ASMFILES)
@@ -143,7 +135,7 @@ libomp_get_ldflags(LIBOMP_CONFIGURED_LDFLAGS)
 
 libomp_get_libflags(LIBOMP_CONFIGURED_LIBFLAGS)
 # Build libomp library. Add LLVMSupport dependency if building in-tree with libomptarget profiling enabled.
-if(OPENMP_STANDALONE_BUILD OR (NOT OPENMP_ENABLE_LIBOMP_PROFILING))
+if(OPENMP_STANDALONE_BUILD OR (NOT OPENMP_ENABLE_LIBOMPTARGET_PROFILING))
   add_library(omp ${LIBOMP_LIBRARY_KIND} ${LIBOMP_SOURCE_FILES})
   # Linking command will include libraries in LIBOMP_CONFIGURED_LIBFLAGS
   target_link_libraries(omp ${LIBOMP_CONFIGURED_LIBFLAGS} ${CMAKE_DL_LIBS})
@@ -152,8 +144,6 @@ else()
     LINK_LIBS ${LIBOMP_CONFIGURED_LIBFLAGS} ${CMAKE_DL_LIBS}
     LINK_COMPONENTS Support
     )
-  # libomp must be a C++ library such that it can link libLLVMSupport
-  set(LIBOMP_LINKER_LANGUAGE CXX)
 endif()
 
 set_target_properties(omp PROPERTIES
diff --git a/openmp/runtime/src/kmp_config.h.cmake b/openmp/runtime/src/kmp_config.h.cmake
index f6aee7197ee8..3d682c690fc7 100644
--- a/openmp/runtime/src/kmp_config.h.cmake
+++ b/openmp/runtime/src/kmp_config.h.cmake
@@ -44,8 +44,8 @@
 #define OMPT_DEBUG LIBOMP_OMPT_DEBUG
 #cmakedefine01 LIBOMP_OMPT_SUPPORT
 #define OMPT_SUPPORT LIBOMP_OMPT_SUPPORT
-#cmakedefine01 LIBOMP_PROFILING_SUPPORT
-#define OMP_PROFILING_SUPPORT LIBOMP_PROFILING_SUPPORT
+#cmakedefine01 LIBOMPTARGET_PROFILING_SUPPORT
+#define OMPTARGET_PROFILING_SUPPORT LIBOMPTARGET_PROFILING_SUPPORT
 #cmakedefine01 LIBOMP_OMPT_OPTIONAL
 #define OMPT_OPTIONAL LIBOMP_OMPT_OPTIONAL
 #cmakedefine01 LIBOMP_USE_ADAPTIVE_LOCKS
diff --git a/openmp/runtime/src/kmp_runtime.cpp b/openmp/runtime/src/kmp_runtime.cpp
index a6e32bd008e1..4a0634d59cff 100644
--- a/openmp/runtime/src/kmp_runtime.cpp
+++ b/openmp/runtime/src/kmp_runtime.cpp
@@ -32,7 +32,7 @@
 #include "ompt-specific.h"
 #endif
 
-#if OMP_PROFILING_SUPPORT
+#if OMPTARGET_PROFILING_SUPPORT
 #include "llvm/Support/TimeProfiler.h"
 static char *ProfileTraceFile = nullptr;
 #endif
@@ -5740,7 +5740,7 @@ void __kmp_free_thread(kmp_info_t *this_th) {
 /* ------------------------------------------------------------------------ */
 
 void *__kmp_launch_thread(kmp_info_t *this_thr) {
-#if OMP_PROFILING_SUPPORT
+#if OMPTARGET_PROFILING_SUPPORT
   ProfileTraceFile = getenv("LIBOMPTARGET_PROFILE");
   // TODO: add a configuration option for time granularity
   if (ProfileTraceFile)
@@ -5848,7 +5848,7 @@ void *__kmp_launch_thread(kmp_info_t *this_thr) {
   KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid));
   KMP_MB();
 
-#if OMP_PROFILING_SUPPORT
+#if OMPTARGET_PROFILING_SUPPORT
   llvm::timeTraceProfilerFinishThread();
 #endif
   return this_thr;

From bc2dad1671598a87423c61c355d03db49ce76907 Mon Sep 17 00:00:00 2001
From: Peter Waller <peter.waller@arm.com>
Date: Tue, 26 Jan 2021 11:55:24 +0000
Subject: [PATCH 061/318] [clang][aarch64][WOA64][docs] Release note for
 longjmp crash with /guard:cf

Add a release note workaround for PR47463.

Bug: https://bugs.llvm.org/show_bug.cgi?id=47463

Differential Revision: https://reviews.llvm.org/D95435
---
 clang/docs/ReleaseNotes.rst | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 9efd4c01f053..c17d84de320c 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -153,6 +153,11 @@ Windows Support
 - Implicitly add ``.exe`` suffix for MinGW targets, even when cross compiling.
   (This matches a change from GCC 8.)
 
+- Windows on Arm64: programs using the C standard library's setjmp and longjmp
+  functions may crash with a "Security check failure or stack buffer overrun"
+  exception. To workaround (with reduced security), compile with
+  /guard:cf,nolongjmp.
+
 C Language Changes in Clang
 ---------------------------
 

From 66c7b449acf402bdc87b69db5778b7b43958d217 Mon Sep 17 00:00:00 2001
From: Giorgis Georgakoudis <georgakoudis1@llnl.gov>
Date: Mon, 25 Jan 2021 14:10:50 -0800
Subject: [PATCH 062/318] [OpenMP] Fix building using LLVM_ENABLE_RUNTIMES

Fix when time profiling is enabled.

Related to: D94855

Reviewed By: JonChesterfield

Differential Revision: https://reviews.llvm.org/D95398

(cherry picked from commit bb40e6731843de92f1c73ad6efceb8a89e045ea6)
---
 openmp/CMakeLists.txt             | 10 +++++-----
 openmp/runtime/src/CMakeLists.txt |  9 +++++++++
 2 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/openmp/CMakeLists.txt b/openmp/CMakeLists.txt
index 67600bebdafb..f89857dc98d6 100644
--- a/openmp/CMakeLists.txt
+++ b/openmp/CMakeLists.txt
@@ -55,11 +55,6 @@ set(OPENMP_TEST_FLAGS "" CACHE STRING
 set(OPENMP_TEST_OPENMP_FLAGS ${OPENMP_TEST_COMPILER_OPENMP_FLAGS} CACHE STRING
   "OpenMP compiler flag to use for testing OpenMP runtime libraries.")
 
-
-# Build host runtime library.
-add_subdirectory(runtime)
-
-
 set(ENABLE_LIBOMPTARGET ON)
 # Currently libomptarget cannot be compiled on Windows or MacOS X.
 # Since the device plugins are only supported on Linux anyway,
@@ -86,6 +81,11 @@ option(OPENMP_ENABLE_LIBOMPTARGET "Enable building libomptarget for offloading."
        ${ENABLE_LIBOMPTARGET})
 option(OPENMP_ENABLE_LIBOMPTARGET_PROFILING "Enable time profiling for libomptarget."
        ${ENABLE_LIBOMPTARGET})
+
+# Build host runtime library, after LIBOMPTARGET variables are set since they are needed
+# to enable time profiling support in the OpenMP runtime.
+add_subdirectory(runtime)
+
 if (OPENMP_ENABLE_LIBOMPTARGET)
   # Check that the library can actually be built.
   if (APPLE OR WIN32)
diff --git a/openmp/runtime/src/CMakeLists.txt b/openmp/runtime/src/CMakeLists.txt
index 2e927df84f5c..9c5dba55b705 100644
--- a/openmp/runtime/src/CMakeLists.txt
+++ b/openmp/runtime/src/CMakeLists.txt
@@ -50,6 +50,15 @@ if(${LIBOMP_USE_HWLOC})
   include_directories(${LIBOMP_HWLOC_INSTALL_DIR}/include)
 endif()
 
+# Building with time profiling support for libomptarget requires
+# LLVM directory includes.
+if(LIBOMPTARGET_PROFILING_SUPPORT)
+  include_directories(
+    ${LLVM_MAIN_INCLUDE_DIR}
+    ${LLVM_INCLUDE_DIR}
+  )
+endif()
+
 # Getting correct source files to build library
 set(LIBOMP_CXXFILES)
 set(LIBOMP_ASMFILES)

From 92a5106e8055bab7da46095a832904444862728b Mon Sep 17 00:00:00 2001
From: Shilei Tian <tianshilei1992@gmail.com>
Date: Thu, 28 Jan 2021 07:24:19 -0500
Subject: [PATCH 063/318] [OpenMP] Disabled profiling in `libomp` by default to
 unblock link errors

Link error occurred when time profiling in libomp is enabled by default
because `libomp` is assumed to be a C library but the dependence on
`libLLVMSupport` for profiling is a C++ library. Currently the issue blocks all
OpenMP tests in Phabricator.

This patch set a new CMake option `OPENMP_ENABLE_LIBOMP_PROFILING` to
enable/disable the feature. By default it is disabled. Note that once time
profiling is enabled for `libomp`, it becomes a C++ library.

Reviewed By: jdoerfert

Differential Revision: https://reviews.llvm.org/D95585

(cherry picked from commit c571b168349fdf22d1dc8b920bcffa3d5161f0a2)
---
 openmp/CMakeLists.txt                 | 1 +
 openmp/docs/design/Runtimes.rst       | 5 ++++-
 openmp/runtime/CMakeLists.txt         | 6 +++---
 openmp/runtime/src/CMakeLists.txt     | 9 +++++----
 openmp/runtime/src/kmp_config.h.cmake | 4 ++--
 openmp/runtime/src/kmp_runtime.cpp    | 6 +++---
 6 files changed, 18 insertions(+), 13 deletions(-)

diff --git a/openmp/CMakeLists.txt b/openmp/CMakeLists.txt
index f89857dc98d6..b8a2822877e3 100644
--- a/openmp/CMakeLists.txt
+++ b/openmp/CMakeLists.txt
@@ -81,6 +81,7 @@ option(OPENMP_ENABLE_LIBOMPTARGET "Enable building libomptarget for offloading."
        ${ENABLE_LIBOMPTARGET})
 option(OPENMP_ENABLE_LIBOMPTARGET_PROFILING "Enable time profiling for libomptarget."
        ${ENABLE_LIBOMPTARGET})
+option(OPENMP_ENABLE_LIBOMP_PROFILING "Enable time profiling for libomp." OFF)
 
 # Build host runtime library, after LIBOMPTARGET variables are set since they are needed
 # to enable time profiling support in the OpenMP runtime.
diff --git a/openmp/docs/design/Runtimes.rst b/openmp/docs/design/Runtimes.rst
index 016b88ba324b..ad36e43eccdc 100644
--- a/openmp/docs/design/Runtimes.rst
+++ b/openmp/docs/design/Runtimes.rst
@@ -48,7 +48,10 @@ similar to Clang's ``-ftime-trace`` option. This generates a JSON file based on
 `Speedscope App`_. Building this feature depends on the `LLVM Support Library`_
 for time trace output. Using this library is enabled by default when building
 using the CMake option ``OPENMP_ENABLE_LIBOMPTARGET_PROFILING``. The output will
-be saved to the filename specified by the environment variable.
+be saved to the filename specified by the environment variable. For multi-threaded
+applications, profiling in ``libomp`` is also needed. Setting the CMake option
+``OPENMP_ENABLE_LIBOMP_PROFILING=ON`` to enable the feature. Note that this will
+turn ``libomp`` into a C++ library.
 
 .. _`Chrome Tracing`: https://www.chromium.org/developers/how-tos/trace-event-profiling-tool
 
diff --git a/openmp/runtime/CMakeLists.txt b/openmp/runtime/CMakeLists.txt
index 9fdd04f41646..8828ff8ef455 100644
--- a/openmp/runtime/CMakeLists.txt
+++ b/openmp/runtime/CMakeLists.txt
@@ -34,7 +34,6 @@ if(${OPENMP_STANDALONE_BUILD})
   # Should assertions be enabled?  They are on by default.
   set(LIBOMP_ENABLE_ASSERTIONS TRUE CACHE BOOL
     "enable assertions?")
-  set(LIBOMPTARGET_PROFILING_SUPPORT FALSE)
 else() # Part of LLVM build
   # Determine the native architecture from LLVM.
   string(TOLOWER "${LLVM_TARGET_ARCH}" LIBOMP_NATIVE_ARCH)
@@ -66,10 +65,11 @@ else() # Part of LLVM build
     libomp_get_architecture(LIBOMP_ARCH)
   endif ()
   set(LIBOMP_ENABLE_ASSERTIONS ${LLVM_ENABLE_ASSERTIONS})
-  # Time profiling support
-  set(LIBOMPTARGET_PROFILING_SUPPORT ${OPENMP_ENABLE_LIBOMPTARGET_PROFILING})
 endif()
 
+# Time profiling support
+set(LIBOMP_PROFILING_SUPPORT ${OPENMP_ENABLE_LIBOMP_PROFILING})
+
 # FUJITSU A64FX is a special processor because its cache line size is 256.
 # We need to pass this information into kmp_config.h.
 if(LIBOMP_ARCH STREQUAL "aarch64")
diff --git a/openmp/runtime/src/CMakeLists.txt b/openmp/runtime/src/CMakeLists.txt
index 9c5dba55b705..822f9ca2b825 100644
--- a/openmp/runtime/src/CMakeLists.txt
+++ b/openmp/runtime/src/CMakeLists.txt
@@ -50,9 +50,8 @@ if(${LIBOMP_USE_HWLOC})
   include_directories(${LIBOMP_HWLOC_INSTALL_DIR}/include)
 endif()
 
-# Building with time profiling support for libomptarget requires
-# LLVM directory includes.
-if(LIBOMPTARGET_PROFILING_SUPPORT)
+# Building with time profiling support requires LLVM directory includes.
+if(LIBOMP_PROFILING_SUPPORT)
   include_directories(
     ${LLVM_MAIN_INCLUDE_DIR}
     ${LLVM_INCLUDE_DIR}
@@ -144,7 +143,7 @@ libomp_get_ldflags(LIBOMP_CONFIGURED_LDFLAGS)
 
 libomp_get_libflags(LIBOMP_CONFIGURED_LIBFLAGS)
 # Build libomp library. Add LLVMSupport dependency if building in-tree with libomptarget profiling enabled.
-if(OPENMP_STANDALONE_BUILD OR (NOT OPENMP_ENABLE_LIBOMPTARGET_PROFILING))
+if(OPENMP_STANDALONE_BUILD OR (NOT OPENMP_ENABLE_LIBOMP_PROFILING))
   add_library(omp ${LIBOMP_LIBRARY_KIND} ${LIBOMP_SOURCE_FILES})
   # Linking command will include libraries in LIBOMP_CONFIGURED_LIBFLAGS
   target_link_libraries(omp ${LIBOMP_CONFIGURED_LIBFLAGS} ${CMAKE_DL_LIBS})
@@ -153,6 +152,8 @@ else()
     LINK_LIBS ${LIBOMP_CONFIGURED_LIBFLAGS} ${CMAKE_DL_LIBS}
     LINK_COMPONENTS Support
     )
+  # libomp must be a C++ library such that it can link libLLVMSupport
+  set(LIBOMP_LINKER_LANGUAGE CXX)
 endif()
 
 set_target_properties(omp PROPERTIES
diff --git a/openmp/runtime/src/kmp_config.h.cmake b/openmp/runtime/src/kmp_config.h.cmake
index 3d682c690fc7..f6aee7197ee8 100644
--- a/openmp/runtime/src/kmp_config.h.cmake
+++ b/openmp/runtime/src/kmp_config.h.cmake
@@ -44,8 +44,8 @@
 #define OMPT_DEBUG LIBOMP_OMPT_DEBUG
 #cmakedefine01 LIBOMP_OMPT_SUPPORT
 #define OMPT_SUPPORT LIBOMP_OMPT_SUPPORT
-#cmakedefine01 LIBOMPTARGET_PROFILING_SUPPORT
-#define OMPTARGET_PROFILING_SUPPORT LIBOMPTARGET_PROFILING_SUPPORT
+#cmakedefine01 LIBOMP_PROFILING_SUPPORT
+#define OMP_PROFILING_SUPPORT LIBOMP_PROFILING_SUPPORT
 #cmakedefine01 LIBOMP_OMPT_OPTIONAL
 #define OMPT_OPTIONAL LIBOMP_OMPT_OPTIONAL
 #cmakedefine01 LIBOMP_USE_ADAPTIVE_LOCKS
diff --git a/openmp/runtime/src/kmp_runtime.cpp b/openmp/runtime/src/kmp_runtime.cpp
index 4a0634d59cff..a6e32bd008e1 100644
--- a/openmp/runtime/src/kmp_runtime.cpp
+++ b/openmp/runtime/src/kmp_runtime.cpp
@@ -32,7 +32,7 @@
 #include "ompt-specific.h"
 #endif
 
-#if OMPTARGET_PROFILING_SUPPORT
+#if OMP_PROFILING_SUPPORT
 #include "llvm/Support/TimeProfiler.h"
 static char *ProfileTraceFile = nullptr;
 #endif
@@ -5740,7 +5740,7 @@ void __kmp_free_thread(kmp_info_t *this_th) {
 /* ------------------------------------------------------------------------ */
 
 void *__kmp_launch_thread(kmp_info_t *this_thr) {
-#if OMPTARGET_PROFILING_SUPPORT
+#if OMP_PROFILING_SUPPORT
   ProfileTraceFile = getenv("LIBOMPTARGET_PROFILE");
   // TODO: add a configuration option for time granularity
   if (ProfileTraceFile)
@@ -5848,7 +5848,7 @@ void *__kmp_launch_thread(kmp_info_t *this_thr) {
   KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid));
   KMP_MB();
 
-#if OMPTARGET_PROFILING_SUPPORT
+#if OMP_PROFILING_SUPPORT
   llvm::timeTraceProfilerFinishThread();
 #endif
   return this_thr;

From 72f12467ded52160d52025e13a6217f00fe25f68 Mon Sep 17 00:00:00 2001
From: Hans Wennborg <hans@chromium.org>
Date: Thu, 4 Feb 2021 13:26:59 +0100
Subject: [PATCH 064/318] Add a release note about deprecating the clang-cl
 /fallback flag

As discussed in
https://lists.llvm.org/pipermail/cfe-dev/2021-January/067524.html

The flag has been removed on the main branch in D95876.

Differential revision: https://reviews.llvm.org/D96016
---
 clang/docs/ReleaseNotes.rst | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index c17d84de320c..f4ca8a855142 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -109,6 +109,10 @@ Deprecated Compiler Flags
 The following options are deprecated and ignored. They will be removed in
 future versions of Clang.
 
+- The clang-cl ``/fallback`` flag, which made clang-cl invoke Microsoft Visual
+  C++ on files it couldn't compile itself, has been deprecated. It will be
+  removed in Clang 13.
+
 - ...
 
 Modified Compiler Flags

From 4e7933905578456a30b281bbbe832d8d938feed0 Mon Sep 17 00:00:00 2001
From: Tom Stellard <tstellar@redhat.com>
Date: Fri, 5 Feb 2021 01:40:33 +0000
Subject: [PATCH 065/318] workflows: Update libclang-abi-tests to work with
 minor release baselines

---
 .github/workflows/libclang-abi-tests.yml | 24 ++++++++++++++++++++++--
 1 file changed, 22 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/libclang-abi-tests.yml b/.github/workflows/libclang-abi-tests.yml
index 5681c7c8166e..320a88c1d407 100644
--- a/.github/workflows/libclang-abi-tests.yml
+++ b/.github/workflows/libclang-abi-tests.yml
@@ -20,6 +20,7 @@ jobs:
       ABI_HEADERS: ${{ steps.vars.outputs.ABI_HEADERS }}
       ABI_LIBS: ${{ steps.vars.outputs.ABI_LIBS }}
       BASELINE_VERSION_MAJOR: ${{ steps.vars.outputs.BASELINE_VERSION_MAJOR }}
+      BASELINE_VERSION_MINOR: ${{ steps.vars.outputs.BASELINE_VERSION_MINOR }}
       LLVM_VERSION_MAJOR: ${{ steps.version.outputs.LLVM_VERSION_MAJOR }}
       LLVM_VERSION_MINOR: ${{ steps.version.outputs.LLVM_VERSION_MINOR }}
       LLVM_VERSION_PATCH: ${{ steps.version.outputs.LLVM_VERSION_PATCH }}
@@ -36,16 +37,35 @@ jobs:
       - name: Setup Variables
         id: vars
         run: |
+          minor_version=0
+          remote_repo='https://github.com/llvm/llvm-project'
           if [ ${{ steps.version.outputs.LLVM_VERSION_MINOR }} -ne 0 -o ${{ steps.version.outputs.LLVM_VERSION_PATCH }} -eq 0 ]; then
-            echo ::set-output name=BASELINE_VERSION_MAJOR::$(( ${{ steps.version.outputs.LLVM_VERSION_MAJOR }} - 1))
+            major_version=$(( ${{ steps.version.outputs.LLVM_VERSION_MAJOR }} - 1))
+            baseline_ref="$major_version.0.0"
+
+            # If there is a minor release, we want to use that as the base line.
+            minor_ref=`git ls-remote --refs -t $remote_repo llvmorg-$major_version.[1-9].[0-9] | tail -n1 | grep -o 'llvmorg-.\+' || true`
+            if [ -n "$minor_ref" ]; then
+               baseline_ref=$minor_ref
+            else
+              # Check if we have a release candidate
+              rc_ref=`git ls-remote --refs -t $remote_repo llvmorg-$major_version.[1-9].[0-9]-rc* | tail -n1 | grep -o 'llvmorg-.\+' || true`
+              if [ -n "$rc_ref" ]; then
+                baseline_ref=$rc_ref
+              fi
+            fi
+            echo ::set-output name=BASELINE_VERSION_MAJOR::$major_version
+            echo ::set-output name=BASELINE_REF::$baseline_ref
             echo ::set-output name=ABI_HEADERS::clang-c
             echo ::set-output name=ABI_LIBS::libclang.so
           else
             echo ::set-output name=BASELINE_VERSION_MAJOR::${{ steps.version.outputs.LLVM_VERSION_MAJOR }}
+            echo ::set-output name=BASELINE_REF::${{ steps.version.outputs.LLVM_VERSION_MAJOR }}.0.0
             echo ::set-output name=ABI_HEADERS::.
             echo ::set-output name=ABI_LIBS::libclang.so libclang-cpp.so
           fi
 
+
   abi-dump:
     needs: abi-dump-setup
     runs-on: ubuntu-latest
@@ -57,7 +77,7 @@ jobs:
         include:
           - name: build-baseline
             llvm_version_major: ${{ needs.abi-dump-setup.outputs.BASELINE_VERSION_MAJOR }}
-            ref: llvmorg-${{ needs.abi-dump-setup.outputs.BASELINE_VERSION_MAJOR }}.0.0
+            ref: ${{ needs.abi-dump-setup.outputs.BASELINE_REF }}
             repo: llvm/llvm-project
           - name: build-latest
             llvm_version_major: ${{ needs.abi-dump-setup.outputs.LLVM_VERSION_MAJOR }}

From 81febec8a327ecbe83575ac280c2931718ab5e33 Mon Sep 17 00:00:00 2001
From: Nikita Popov <nikita.ppv@gmail.com>
Date: Fri, 29 Jan 2021 12:56:23 +0100
Subject: [PATCH 066/318] [MemCpyOpt] Add test for incorrect optimization
 across lifetime (NFC)

This only affects the MemorySSA-based implementation.
---
 llvm/test/Transforms/MemCpyOpt/lifetime.ll | 43 ++++++++++++++++++++--
 1 file changed, 39 insertions(+), 4 deletions(-)

diff --git a/llvm/test/Transforms/MemCpyOpt/lifetime.ll b/llvm/test/Transforms/MemCpyOpt/lifetime.ll
index 1d2b699ee96d..5dc13ca10054 100644
--- a/llvm/test/Transforms/MemCpyOpt/lifetime.ll
+++ b/llvm/test/Transforms/MemCpyOpt/lifetime.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -O2 -S -enable-memcpyopt-memoryssa=0 | FileCheck %s
-; RUN: opt < %s -O2 -S -enable-memcpyopt-memoryssa=1 -verify-memoryssa | FileCheck %s
+; RUN: opt < %s -O2 -S -enable-memcpyopt-memoryssa=0 | FileCheck %s --check-prefixes=CHECK,NO_MSSA
+; RUN: opt < %s -O2 -S -enable-memcpyopt-memoryssa=1 -verify-memoryssa | FileCheck %s --check-prefixes=CHECK,MSSA
 
 ; performCallSlotOptzn in MemCpy should not exchange the calls to
 ; @llvm.lifetime.start and @llvm.memcpy.
@@ -9,8 +9,8 @@ declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i
 declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #1
 declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #1
 
-define void @_ZN4CordC2EOS_(i8* nocapture dereferenceable(16) %arg1) {
-; CHECK-LABEL: @_ZN4CordC2EOS_(
+define void @call_slot(i8* nocapture dereferenceable(16) %arg1) {
+; CHECK-LABEL: @call_slot(
 ; CHECK-NEXT:  bb:
 ; CHECK-NEXT:    [[TMP_SROA_3_0_ARG1_SROA_RAW_IDX:%.*]] = getelementptr inbounds i8, i8* [[ARG1:%.*]], i64 7
 ; CHECK-NEXT:    store i8 0, i8* [[TMP_SROA_3_0_ARG1_SROA_RAW_IDX]], align 1
@@ -27,4 +27,39 @@ bb:
   ret void
 }
 
+; FIXME: Miscompile.
+define void @memcpy_memcpy_across_lifetime(i8* noalias %p1, i8* noalias %p2, i8* noalias %p3) {
+; NO_MSSA-LABEL: @memcpy_memcpy_across_lifetime(
+; NO_MSSA-NEXT:    [[A:%.*]] = alloca [16 x i8], align 1
+; NO_MSSA-NEXT:    [[A8:%.*]] = getelementptr inbounds [16 x i8], [16 x i8]* [[A]], i64 0, i64 0
+; NO_MSSA-NEXT:    call void @llvm.lifetime.start.p0i8(i64 16, i8* nonnull [[A8]])
+; NO_MSSA-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 1 dereferenceable(16) [[A8]], i8* nonnull align 1 dereferenceable(16) [[P1:%.*]], i64 16, i1 false)
+; NO_MSSA-NEXT:    tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 1 dereferenceable(16) [[P1]], i8* nonnull align 1 dereferenceable(16) [[P2:%.*]], i64 16, i1 false)
+; NO_MSSA-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 1 dereferenceable(16) [[P2]], i8* nonnull align 1 dereferenceable(16) [[A8]], i64 16, i1 false)
+; NO_MSSA-NEXT:    call void @llvm.lifetime.end.p0i8(i64 16, i8* nonnull [[A8]])
+; NO_MSSA-NEXT:    tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 1 dereferenceable(16) [[P3:%.*]], i8* nonnull align 1 dereferenceable(16) [[P2]], i64 16, i1 false)
+; NO_MSSA-NEXT:    ret void
+;
+; MSSA-LABEL: @memcpy_memcpy_across_lifetime(
+; MSSA-NEXT:    [[A:%.*]] = alloca [16 x i8], align 1
+; MSSA-NEXT:    [[A8:%.*]] = getelementptr inbounds [16 x i8], [16 x i8]* [[A]], i64 0, i64 0
+; MSSA-NEXT:    call void @llvm.lifetime.start.p0i8(i64 16, i8* nonnull [[A8]])
+; MSSA-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 1 dereferenceable(16) [[A8]], i8* nonnull align 1 dereferenceable(16) [[P1:%.*]], i64 16, i1 false)
+; MSSA-NEXT:    tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 1 dereferenceable(16) [[P1]], i8* nonnull align 1 dereferenceable(16) [[P2:%.*]], i64 16, i1 false)
+; MSSA-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 1 dereferenceable(16) [[P2]], i8* nonnull align 1 dereferenceable(16) [[A8]], i64 16, i1 false)
+; MSSA-NEXT:    call void @llvm.lifetime.end.p0i8(i64 16, i8* nonnull [[A8]])
+; MSSA-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 1 dereferenceable(16) [[P3:%.*]], i8* nonnull align 1 dereferenceable(16) [[A8]], i64 16, i1 false)
+; MSSA-NEXT:    ret void
+;
+  %a = alloca [16 x i8]
+  %a8 = bitcast [16 x i8]* %a to i8*
+  call void @llvm.lifetime.start.p0i8(i64 16, i8* %a8)
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %a8, i8* %p1, i64 16, i1 false)
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %p1, i8* %p2, i64 16, i1 false)
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %p2, i8* %a8, i64 16, i1 false)
+  call void @llvm.lifetime.end.p0i8(i64 16, i8* %a8)
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %p3, i8* %p2, i64 16, i1 false)
+  ret void
+}
+
 attributes #1 = { argmemonly nounwind }

From 12a772b1a09a1b5c3f43d08c2804973506b8a859 Mon Sep 17 00:00:00 2001
From: Nikita Popov <nikita.ppv@gmail.com>
Date: Sun, 31 Jan 2021 17:55:24 +0100
Subject: [PATCH 067/318] [MemorySSA] Don't treat lifetime.end as NoAlias

MemorySSA currently treats lifetime.end intrinsics as not aliasing
anything. This breaks MemorySSA-based MemCpyOpt, because we'll happily
move a read of a pointer below a lifetime.end intrinsic, as no clobber
is reported.

I think the MemorySSA modelling here isn't correct: lifetime.end(p)
has approximately the same effect as doing a memcpy(p, undef), and
should be treated as a clobber.

This patch removes the special handling of lifetime.end, leaving
alias analysis to handle it appropriately.

Differential Revision: https://reviews.llvm.org/D95763
---
 llvm/lib/Analysis/MemorySSA.cpp               | 26 --------------
 .../Analysis/MemorySSA/lifetime-simple.ll     |  9 +++--
 llvm/test/Transforms/MemCpyOpt/lifetime.ll    | 36 +++++++------------
 3 files changed, 16 insertions(+), 55 deletions(-)

diff --git a/llvm/lib/Analysis/MemorySSA.cpp b/llvm/lib/Analysis/MemorySSA.cpp
index 52dca7d378e1..4722b68e20e9 100644
--- a/llvm/lib/Analysis/MemorySSA.cpp
+++ b/llvm/lib/Analysis/MemorySSA.cpp
@@ -281,7 +281,6 @@ instructionClobbersQuery(const MemoryDef *MD, const MemoryLocation &UseLoc,
     // clobbers where they don't really exist at all. Please see D43269 for
     // context.
     switch (II->getIntrinsicID()) {
-    case Intrinsic::lifetime_end:
     case Intrinsic::invariant_start:
     case Intrinsic::invariant_end:
     case Intrinsic::assume:
@@ -358,22 +357,6 @@ struct UpwardsMemoryQuery {
 
 } // end anonymous namespace
 
-static bool lifetimeEndsAt(MemoryDef *MD, const MemoryLocation &Loc,
-                           BatchAAResults &AA) {
-  Instruction *Inst = MD->getMemoryInst();
-  if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
-    switch (II->getIntrinsicID()) {
-    case Intrinsic::lifetime_end: {
-      MemoryLocation ArgLoc = MemoryLocation::getAfter(II->getArgOperand(1));
-      return AA.alias(ArgLoc, Loc) == MustAlias;
-    }
-    default:
-      return false;
-    }
-  }
-  return false;
-}
-
 template <typename AliasAnalysisType>
 static bool isUseTriviallyOptimizableToLiveOnEntry(AliasAnalysisType &AA,
                                                    const Instruction *I) {
@@ -1465,15 +1448,6 @@ void MemorySSA::OptimizeUses::optimizeUsesInBlock(
       }
 
       MemoryDef *MD = cast<MemoryDef>(VersionStack[UpperBound]);
-      // If the lifetime of the pointer ends at this instruction, it's live on
-      // entry.
-      if (!UseMLOC.IsCall && lifetimeEndsAt(MD, UseMLOC.getLoc(), *AA)) {
-        // Reset UpperBound to liveOnEntryDef's place in the stack
-        UpperBound = 0;
-        FoundClobberResult = true;
-        LocInfo.AR = MustAlias;
-        break;
-      }
       ClobberAlias CA = instructionClobbersQuery(MD, MU, UseMLOC, *AA);
       if (CA.IsClobber) {
         FoundClobberResult = true;
diff --git a/llvm/test/Analysis/MemorySSA/lifetime-simple.ll b/llvm/test/Analysis/MemorySSA/lifetime-simple.ll
index 33327c5539f6..2d0481c18415 100644
--- a/llvm/test/Analysis/MemorySSA/lifetime-simple.ll
+++ b/llvm/test/Analysis/MemorySSA/lifetime-simple.ll
@@ -1,8 +1,7 @@
 ; RUN: opt -basic-aa -print-memoryssa -verify-memoryssa -enable-new-pm=0 -analyze < %s 2>&1 | FileCheck %s
 ; RUN: opt -aa-pipeline=basic-aa -passes='print<memoryssa>,verify<memoryssa>' -disable-output < %s 2>&1 | FileCheck %s
-; This test checks a number of things:
-; First, the lifetime markers should not clobber any uses of Q or P.
-; Second, the loads of P are MemoryUse(LiveOnEntry) due to the placement of the markers vs the loads.
+; This test checks that lifetime markers are considered clobbers of %P,
+; and due to lack of noalias information, of %Q as well.
 
 define i8 @test(i8* %P, i8* %Q) {
 entry:
@@ -18,10 +17,10 @@ entry:
 ; CHECK:  3 = MemoryDef(2)
 ; CHECK-NEXT:   call void @llvm.lifetime.end.p0i8(i64 32, i8* %P)
   call void @llvm.lifetime.end.p0i8(i64 32, i8* %P)
-; CHECK:  MemoryUse(liveOnEntry)
+; CHECK:  MemoryUse(3)
 ; CHECK-NEXT:   %1 = load i8, i8* %P
   %1 = load i8, i8* %P
-; CHECK:  MemoryUse(2)
+; CHECK:  MemoryUse(3)
 ; CHECK-NEXT:   %2 = load i8, i8* %Q
   %2 = load i8, i8* %Q
   ret i8 %1
diff --git a/llvm/test/Transforms/MemCpyOpt/lifetime.ll b/llvm/test/Transforms/MemCpyOpt/lifetime.ll
index 5dc13ca10054..c7e7666307ab 100644
--- a/llvm/test/Transforms/MemCpyOpt/lifetime.ll
+++ b/llvm/test/Transforms/MemCpyOpt/lifetime.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -O2 -S -enable-memcpyopt-memoryssa=0 | FileCheck %s --check-prefixes=CHECK,NO_MSSA
-; RUN: opt < %s -O2 -S -enable-memcpyopt-memoryssa=1 -verify-memoryssa | FileCheck %s --check-prefixes=CHECK,MSSA
+; RUN: opt < %s -O2 -S -enable-memcpyopt-memoryssa=0 | FileCheck %s
+; RUN: opt < %s -O2 -S -enable-memcpyopt-memoryssa=1 -verify-memoryssa | FileCheck %s
 
 ; performCallSlotOptzn in MemCpy should not exchange the calls to
 ; @llvm.lifetime.start and @llvm.memcpy.
@@ -27,29 +27,17 @@ bb:
   ret void
 }
 
-; FIXME: Miscompile.
 define void @memcpy_memcpy_across_lifetime(i8* noalias %p1, i8* noalias %p2, i8* noalias %p3) {
-; NO_MSSA-LABEL: @memcpy_memcpy_across_lifetime(
-; NO_MSSA-NEXT:    [[A:%.*]] = alloca [16 x i8], align 1
-; NO_MSSA-NEXT:    [[A8:%.*]] = getelementptr inbounds [16 x i8], [16 x i8]* [[A]], i64 0, i64 0
-; NO_MSSA-NEXT:    call void @llvm.lifetime.start.p0i8(i64 16, i8* nonnull [[A8]])
-; NO_MSSA-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 1 dereferenceable(16) [[A8]], i8* nonnull align 1 dereferenceable(16) [[P1:%.*]], i64 16, i1 false)
-; NO_MSSA-NEXT:    tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 1 dereferenceable(16) [[P1]], i8* nonnull align 1 dereferenceable(16) [[P2:%.*]], i64 16, i1 false)
-; NO_MSSA-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 1 dereferenceable(16) [[P2]], i8* nonnull align 1 dereferenceable(16) [[A8]], i64 16, i1 false)
-; NO_MSSA-NEXT:    call void @llvm.lifetime.end.p0i8(i64 16, i8* nonnull [[A8]])
-; NO_MSSA-NEXT:    tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 1 dereferenceable(16) [[P3:%.*]], i8* nonnull align 1 dereferenceable(16) [[P2]], i64 16, i1 false)
-; NO_MSSA-NEXT:    ret void
-;
-; MSSA-LABEL: @memcpy_memcpy_across_lifetime(
-; MSSA-NEXT:    [[A:%.*]] = alloca [16 x i8], align 1
-; MSSA-NEXT:    [[A8:%.*]] = getelementptr inbounds [16 x i8], [16 x i8]* [[A]], i64 0, i64 0
-; MSSA-NEXT:    call void @llvm.lifetime.start.p0i8(i64 16, i8* nonnull [[A8]])
-; MSSA-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 1 dereferenceable(16) [[A8]], i8* nonnull align 1 dereferenceable(16) [[P1:%.*]], i64 16, i1 false)
-; MSSA-NEXT:    tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 1 dereferenceable(16) [[P1]], i8* nonnull align 1 dereferenceable(16) [[P2:%.*]], i64 16, i1 false)
-; MSSA-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 1 dereferenceable(16) [[P2]], i8* nonnull align 1 dereferenceable(16) [[A8]], i64 16, i1 false)
-; MSSA-NEXT:    call void @llvm.lifetime.end.p0i8(i64 16, i8* nonnull [[A8]])
-; MSSA-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 1 dereferenceable(16) [[P3:%.*]], i8* nonnull align 1 dereferenceable(16) [[A8]], i64 16, i1 false)
-; MSSA-NEXT:    ret void
+; CHECK-LABEL: @memcpy_memcpy_across_lifetime(
+; CHECK-NEXT:    [[A:%.*]] = alloca [16 x i8], align 1
+; CHECK-NEXT:    [[A8:%.*]] = getelementptr inbounds [16 x i8], [16 x i8]* [[A]], i64 0, i64 0
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 16, i8* nonnull [[A8]])
+; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 1 dereferenceable(16) [[A8]], i8* nonnull align 1 dereferenceable(16) [[P1:%.*]], i64 16, i1 false)
+; CHECK-NEXT:    tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 1 dereferenceable(16) [[P1]], i8* nonnull align 1 dereferenceable(16) [[P2:%.*]], i64 16, i1 false)
+; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 1 dereferenceable(16) [[P2]], i8* nonnull align 1 dereferenceable(16) [[A8]], i64 16, i1 false)
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 16, i8* nonnull [[A8]])
+; CHECK-NEXT:    tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 1 dereferenceable(16) [[P3:%.*]], i8* nonnull align 1 dereferenceable(16) [[P2]], i64 16, i1 false)
+; CHECK-NEXT:    ret void
 ;
   %a = alloca [16 x i8]
   %a8 = bitcast [16 x i8]* %a to i8*

From 716eef9ad5b367e5cbcc22c8ac53395f9bdbe7a5 Mon Sep 17 00:00:00 2001
From: Shilei Tian <tianshilei1992@gmail.com>
Date: Thu, 4 Feb 2021 20:14:14 -0500
Subject: [PATCH 068/318] [OpenMP][libomptarget] Fixed an issue that device
 sync is skipped if the kernel doesn't have any argument

Currently if there is not kernel argument, device synchronization will
be skipped. This can lead to two issues:
1. If there is any device error, it will not be captured;
2. The target region might end before the kernel is done, which is not spec
   conformant.

The test added in this patch only runs on NVPTX platform, although it will not
be executed by Phab at all. It also requires `not` which is not available on most
systems.

Reviewed By: jdoerfert

Differential Revision: https://reviews.llvm.org/D96067

(cherry picked from commit b68a6b09e60a24733b923a0fc282746a855852da)
---
 openmp/libomptarget/src/omptarget.cpp         | 22 +++++++++++++++----
 .../libomptarget/test/offloading/assert.cpp   |  8 +++++++
 2 files changed, 26 insertions(+), 4 deletions(-)
 create mode 100644 openmp/libomptarget/test/offloading/assert.cpp

diff --git a/openmp/libomptarget/src/omptarget.cpp b/openmp/libomptarget/src/omptarget.cpp
index 90966d25fb26..e4b7b18bc70b 100644
--- a/openmp/libomptarget/src/omptarget.cpp
+++ b/openmp/libomptarget/src/omptarget.cpp
@@ -451,6 +451,17 @@ struct DeallocTgtPtrInfo {
       : HstPtrBegin(HstPtr), DataSize(Size), ForceDelete(ForceDelete),
         HasCloseModifier(HasCloseModifier) {}
 };
+
+/// Synchronize device
+static int syncDevice(DeviceTy &Device, __tgt_async_info *AsyncInfo) {
+  assert(AsyncInfo && AsyncInfo->Queue && "Invalid AsyncInfo");
+  if (Device.synchronize(AsyncInfo) != OFFLOAD_SUCCESS) {
+    REPORT("Failed to synchronize device.\n");
+    return OFFLOAD_FAIL;
+  }
+
+  return OFFLOAD_SUCCESS;
+}
 } // namespace
 
 /// Internal function to undo the mapping and retrieve the data from the device.
@@ -631,11 +642,9 @@ int targetDataEnd(ident_t *loc, DeviceTy &Device, int32_t ArgNum,
   // AsyncInfo->Queue will not be nullptr, so again, we don't need to
   // synchronize.
   if (AsyncInfo && AsyncInfo->Queue) {
-    Ret = Device.synchronize(AsyncInfo);
-    if (Ret != OFFLOAD_SUCCESS) {
-      REPORT("Failed to synchronize device.\n");
+    Ret = syncDevice(Device, AsyncInfo);
+    if (Ret != OFFLOAD_SUCCESS)
       return OFFLOAD_FAIL;
-    }
   }
 
   // Deallocate target pointer
@@ -1307,6 +1316,11 @@ int target(ident_t *loc, int64_t DeviceId, void *HostPtr, int32_t ArgNum,
       REPORT("Failed to process data after launching the kernel.\n");
       return OFFLOAD_FAIL;
     }
+  } else if (AsyncInfo.Queue) {
+    // If ArgNum is zero, but AsyncInfo.Queue is valid, then the kernel doesn't
+    // hava any argument, and the device supports async operations, so we need a
+    // sync at this point.
+    return syncDevice(Device, &AsyncInfo);
   }
 
   return OFFLOAD_SUCCESS;
diff --git a/openmp/libomptarget/test/offloading/assert.cpp b/openmp/libomptarget/test/offloading/assert.cpp
new file mode 100644
index 000000000000..00112dd92cc6
--- /dev/null
+++ b/openmp/libomptarget/test/offloading/assert.cpp
@@ -0,0 +1,8 @@
+// RUN: %libomptarget-compilexx-nvptx64-nvidia-cuda && %libomptarget-run-fail-nvptx64-nvidia-cuda
+
+int main(int argc, char *argv[]) {
+#pragma omp target
+  { __builtin_trap(); }
+
+  return 0;
+}

From 395ef8d5c67905646b72dd5ef2d8eb60cabb8634 Mon Sep 17 00:00:00 2001
From: Louis Dionne <ldionne.2@gmail.com>
Date: Tue, 2 Feb 2021 16:58:38 -0500
Subject: [PATCH 069/318] =?UTF-8?q?[=F0=9F=8D=92][libc++]=20Rename=20inclu?=
 =?UTF-8?q?de/support=20to=20include/=5F=5Fsupport?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

We do ship those headers, so the directory name should not be something
that can potentially conflict with user-defined directories.

This is a cherry-pick of b51756819a85563ae063e98eeb3d6af8e44c8f64.

Differential Revision: https://reviews.llvm.org/D96059
---
 libcxx/include/CMakeLists.txt                 | 38 +++++++++----------
 libcxx/include/__locale                       | 20 +++++-----
 .../android/locale_bionic.h                   |  6 +--
 .../{support => __support}/fuchsia/xlocale.h  |  6 +--
 .../{support => __support}/ibm/limits.h       |  2 +-
 .../ibm/locale_mgmt_aix.h                     |  2 +-
 .../{support => __support}/ibm/nanosleep.h    |  0
 .../{support => __support}/ibm/support.h      |  2 +-
 .../{support => __support}/ibm/xlocale.h      |  7 ++--
 .../{support => __support}/musl/xlocale.h     |  2 +-
 .../{support => __support}/newlib/xlocale.h   |  6 +--
 .../{support => __support}/nuttx/xlocale.h    |  6 +--
 .../{support => __support}/openbsd/xlocale.h  |  4 +-
 .../solaris/floatingpoint.h                   |  0
 .../{support => __support}/solaris/wchar.h    |  0
 .../{support => __support}/solaris/xlocale.h  |  0
 .../win32/limits_msvc_win32.h                 |  2 +-
 .../win32/locale_win32.h                      |  2 +-
 .../xlocale/__nop_locale_mgmt.h               |  2 +-
 .../xlocale/__posix_l_fallback.h              |  2 +-
 .../xlocale/__strtonum_fallback.h             |  2 +-
 libcxx/include/__threading_support            |  2 +-
 libcxx/include/bit                            |  2 +-
 libcxx/include/limits                         |  4 +-
 libcxx/src/CMakeLists.txt                     |  2 +-
 libcxx/src/locale.cpp                         |  2 +-
 libcxx/src/support/solaris/xlocale.cpp        |  2 +-
 libcxx/src/support/win32/locale_win32.cpp     |  2 +-
 libcxx/src/support/win32/support.cpp          |  2 +-
 libcxx/src/support/win32/thread_win32.cpp     |  2 +-
 .../gn/secondary/libcxx/include/BUILD.gn      | 38 +++++++++----------
 31 files changed, 85 insertions(+), 84 deletions(-)
 rename libcxx/include/{support => __support}/android/locale_bionic.h (90%)
 rename libcxx/include/{support => __support}/fuchsia/xlocale.h (74%)
 rename libcxx/include/{support => __support}/ibm/limits.h (97%)
 rename libcxx/include/{support => __support}/ibm/locale_mgmt_aix.h (96%)
 rename libcxx/include/{support => __support}/ibm/nanosleep.h (100%)
 rename libcxx/include/{support => __support}/ibm/support.h (95%)
 rename libcxx/include/{support => __support}/ibm/xlocale.h (97%)
 rename libcxx/include/{support => __support}/musl/xlocale.h (95%)
 rename libcxx/include/{support => __support}/newlib/xlocale.h (82%)
 rename libcxx/include/{support => __support}/nuttx/xlocale.h (70%)
 rename libcxx/include/{support => __support}/openbsd/xlocale.h (78%)
 rename libcxx/include/{support => __support}/solaris/floatingpoint.h (100%)
 rename libcxx/include/{support => __support}/solaris/wchar.h (100%)
 rename libcxx/include/{support => __support}/solaris/xlocale.h (100%)
 rename libcxx/include/{support => __support}/win32/limits_msvc_win32.h (96%)
 rename libcxx/include/{support => __support}/win32/locale_win32.h (99%)
 rename libcxx/include/{support => __support}/xlocale/__nop_locale_mgmt.h (94%)
 rename libcxx/include/{support => __support}/xlocale/__posix_l_fallback.h (98%)
 rename libcxx/include/{support => __support}/xlocale/__strtonum_fallback.h (96%)

diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt
index 77e5e556d684..29a317b8ae9a 100644
--- a/libcxx/include/CMakeLists.txt
+++ b/libcxx/include/CMakeLists.txt
@@ -150,25 +150,25 @@ set(files
   string.h
   string_view
   strstream
-  support/android/locale_bionic.h
-  support/fuchsia/xlocale.h
-  support/ibm/limits.h
-  support/ibm/locale_mgmt_aix.h
-  support/ibm/nanosleep.h
-  support/ibm/support.h
-  support/ibm/xlocale.h
-  support/musl/xlocale.h
-  support/newlib/xlocale.h
-  support/nuttx/xlocale.h
-  support/openbsd/xlocale.h
-  support/solaris/floatingpoint.h
-  support/solaris/wchar.h
-  support/solaris/xlocale.h
-  support/win32/limits_msvc_win32.h
-  support/win32/locale_win32.h
-  support/xlocale/__nop_locale_mgmt.h
-  support/xlocale/__posix_l_fallback.h
-  support/xlocale/__strtonum_fallback.h
+  __support/android/locale_bionic.h
+  __support/fuchsia/xlocale.h
+  __support/ibm/limits.h
+  __support/ibm/locale_mgmt_aix.h
+  __support/ibm/nanosleep.h
+  __support/ibm/support.h
+  __support/ibm/xlocale.h
+  __support/musl/xlocale.h
+  __support/newlib/xlocale.h
+  __support/nuttx/xlocale.h
+  __support/openbsd/xlocale.h
+  __support/solaris/floatingpoint.h
+  __support/solaris/wchar.h
+  __support/solaris/xlocale.h
+  __support/win32/limits_msvc_win32.h
+  __support/win32/locale_win32.h
+  __support/xlocale/__nop_locale_mgmt.h
+  __support/xlocale/__posix_l_fallback.h
+  __support/xlocale/__strtonum_fallback.h
   system_error
   tgmath.h
   thread
diff --git a/libcxx/include/__locale b/libcxx/include/__locale
index a2da7d78049f..77e5faab2676 100644
--- a/libcxx/include/__locale
+++ b/libcxx/include/__locale
@@ -21,30 +21,30 @@
 #include <locale.h>
 #if defined(_LIBCPP_MSVCRT_LIKE)
 # include <cstring>
-# include <support/win32/locale_win32.h>
+# include <__support/win32/locale_win32.h>
 #elif defined(__NuttX__)
-# include <support/nuttx/xlocale.h>
+# include <__support/nuttx/xlocale.h>
 #elif defined(_AIX) || defined(__MVS__)
-# include <support/ibm/xlocale.h>
+# include <__support/ibm/xlocale.h>
 #elif defined(__ANDROID__)
-# include <support/android/locale_bionic.h>
+# include <__support/android/locale_bionic.h>
 #elif defined(__sun__)
 # include <xlocale.h>
-# include <support/solaris/xlocale.h>
+# include <__support/solaris/xlocale.h>
 #elif defined(_NEWLIB_VERSION)
-# include <support/newlib/xlocale.h>
+# include <__support/newlib/xlocale.h>
 #elif defined(__OpenBSD__)
-# include <support/openbsd/xlocale.h>
+# include <__support/openbsd/xlocale.h>
 #elif (defined(__APPLE__)      || defined(__FreeBSD__) \
     || defined(__EMSCRIPTEN__) || defined(__IBMCPP__))
 # include <xlocale.h>
 #elif defined(__Fuchsia__)
-# include <support/fuchsia/xlocale.h>
+# include <__support/fuchsia/xlocale.h>
 #elif defined(__wasi__)
 // WASI libc uses musl's locales support.
-# include <support/musl/xlocale.h>
+# include <__support/musl/xlocale.h>
 #elif defined(_LIBCPP_HAS_MUSL_LIBC)
-# include <support/musl/xlocale.h>
+# include <__support/musl/xlocale.h>
 #endif
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
diff --git a/libcxx/include/support/android/locale_bionic.h b/libcxx/include/__support/android/locale_bionic.h
similarity index 90%
rename from libcxx/include/support/android/locale_bionic.h
rename to libcxx/include/__support/android/locale_bionic.h
index f05a6a0522ca..8c6d4bd0dc32 100644
--- a/libcxx/include/support/android/locale_bionic.h
+++ b/libcxx/include/__support/android/locale_bionic.h
@@ -1,5 +1,5 @@
 // -*- C++ -*-
-//===------------------- support/android/locale_bionic.h ------------------===//
+//===-----------------------------------------------------------------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -28,13 +28,13 @@ extern "C" {
 #include <android/api-level.h>
 #include <android/ndk-version.h>
 #if __ANDROID_API__ < 21
-#include <support/xlocale/__posix_l_fallback.h>
+#include <__support/xlocale/__posix_l_fallback.h>
 #endif
 // In NDK versions later than 16, locale-aware functions are provided by
 // legacy_stdlib_inlines.h
 #if __NDK_MAJOR__ <= 16
 #if __ANDROID_API__ < 21
-#include <support/xlocale/__strtonum_fallback.h>
+#include <__support/xlocale/__strtonum_fallback.h>
 #elif __ANDROID_API__ < 26
 
 #if defined(__cplusplus)
diff --git a/libcxx/include/support/fuchsia/xlocale.h b/libcxx/include/__support/fuchsia/xlocale.h
similarity index 74%
rename from libcxx/include/support/fuchsia/xlocale.h
rename to libcxx/include/__support/fuchsia/xlocale.h
index b86ce9efbd11..e8def81480ea 100644
--- a/libcxx/include/support/fuchsia/xlocale.h
+++ b/libcxx/include/__support/fuchsia/xlocale.h
@@ -1,5 +1,5 @@
 // -*- C++ -*-
-//===------------------- support/fuchsia/xlocale.h ------------------------===//
+//===-----------------------------------------------------------------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -14,8 +14,8 @@
 
 #include <cstdlib>
 #include <cwchar>
-#include <support/xlocale/__posix_l_fallback.h>
-#include <support/xlocale/__strtonum_fallback.h>
+#include <__support/xlocale/__posix_l_fallback.h>
+#include <__support/xlocale/__strtonum_fallback.h>
 
 #endif // defined(__Fuchsia__)
 
diff --git a/libcxx/include/support/ibm/limits.h b/libcxx/include/__support/ibm/limits.h
similarity index 97%
rename from libcxx/include/support/ibm/limits.h
rename to libcxx/include/__support/ibm/limits.h
index d1c59f066a87..45f1f1e3684c 100644
--- a/libcxx/include/support/ibm/limits.h
+++ b/libcxx/include/__support/ibm/limits.h
@@ -1,5 +1,5 @@
 // -*- C++ -*-
-//===--------------------- support/ibm/limits.h ---------------------------===//
+//===-----------------------------------------------------------------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
diff --git a/libcxx/include/support/ibm/locale_mgmt_aix.h b/libcxx/include/__support/ibm/locale_mgmt_aix.h
similarity index 96%
rename from libcxx/include/support/ibm/locale_mgmt_aix.h
rename to libcxx/include/__support/ibm/locale_mgmt_aix.h
index e452dc32529d..4f658c3eee30 100644
--- a/libcxx/include/support/ibm/locale_mgmt_aix.h
+++ b/libcxx/include/__support/ibm/locale_mgmt_aix.h
@@ -1,5 +1,5 @@
 // -*- C++ -*-
-//===------------------- support/ibm/locale_mgmt_aix.h --------------------===//
+//===-----------------------------------------------------------------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
diff --git a/libcxx/include/support/ibm/nanosleep.h b/libcxx/include/__support/ibm/nanosleep.h
similarity index 100%
rename from libcxx/include/support/ibm/nanosleep.h
rename to libcxx/include/__support/ibm/nanosleep.h
diff --git a/libcxx/include/support/ibm/support.h b/libcxx/include/__support/ibm/support.h
similarity index 95%
rename from libcxx/include/support/ibm/support.h
rename to libcxx/include/__support/ibm/support.h
index 0569cbe7460d..a7751b017666 100644
--- a/libcxx/include/support/ibm/support.h
+++ b/libcxx/include/__support/ibm/support.h
@@ -1,5 +1,5 @@
 // -*- C++ -*-
-//===----------------------- support/ibm/support.h ----------------------===//
+//===-----------------------------------------------------------------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
diff --git a/libcxx/include/support/ibm/xlocale.h b/libcxx/include/__support/ibm/xlocale.h
similarity index 97%
rename from libcxx/include/support/ibm/xlocale.h
rename to libcxx/include/__support/ibm/xlocale.h
index fde137cde260..ad07a255fc95 100644
--- a/libcxx/include/support/ibm/xlocale.h
+++ b/libcxx/include/__support/ibm/xlocale.h
@@ -1,5 +1,5 @@
 // -*- C++ -*-
-//===--------------------- support/ibm/xlocale.h -------------------===//
+//===-----------------------------------------------------------------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -9,7 +9,8 @@
 
 #ifndef _LIBCPP_SUPPORT_IBM_XLOCALE_H
 #define _LIBCPP_SUPPORT_IBM_XLOCALE_H
-#include <support/ibm/locale_mgmt_aix.h>
+
+#include <__support/ibm/locale_mgmt_aix.h>
 
 #include "cstdlib"
 
@@ -218,7 +219,7 @@ size_t strftime_l(char *__s, size_t __size, const char *__fmt,
 #elif defined(__MVS__)
 #include <wctype.h>
 // POSIX routines
-#include <support/xlocale/__posix_l_fallback.h>
+#include <__support/xlocale/__posix_l_fallback.h>
 #endif // defined(__MVS__)
 
 // The following are not POSIX routines.  These are quick-and-dirty hacks
diff --git a/libcxx/include/support/musl/xlocale.h b/libcxx/include/__support/musl/xlocale.h
similarity index 95%
rename from libcxx/include/support/musl/xlocale.h
rename to libcxx/include/__support/musl/xlocale.h
index 722d13fa1d66..2508a8e8e0ca 100644
--- a/libcxx/include/support/musl/xlocale.h
+++ b/libcxx/include/__support/musl/xlocale.h
@@ -1,5 +1,5 @@
 // -*- C++ -*-
-//===------------------- support/musl/xlocale.h ------------------------===//
+//===-----------------------------------------------------------------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
diff --git a/libcxx/include/support/newlib/xlocale.h b/libcxx/include/__support/newlib/xlocale.h
similarity index 82%
rename from libcxx/include/support/newlib/xlocale.h
rename to libcxx/include/__support/newlib/xlocale.h
index 25fa798b6d02..b75f9263a4c4 100644
--- a/libcxx/include/support/newlib/xlocale.h
+++ b/libcxx/include/__support/newlib/xlocale.h
@@ -17,9 +17,9 @@
 #include <ctype.h>
 #if !defined(__NEWLIB__) || __NEWLIB__ < 2 || \
     __NEWLIB__ == 2 && __NEWLIB_MINOR__ < 5
-#include <support/xlocale/__nop_locale_mgmt.h>
-#include <support/xlocale/__posix_l_fallback.h>
-#include <support/xlocale/__strtonum_fallback.h>
+#include <__support/xlocale/__nop_locale_mgmt.h>
+#include <__support/xlocale/__posix_l_fallback.h>
+#include <__support/xlocale/__strtonum_fallback.h>
 #endif
 
 #endif // _NEWLIB_VERSION
diff --git a/libcxx/include/support/nuttx/xlocale.h b/libcxx/include/__support/nuttx/xlocale.h
similarity index 70%
rename from libcxx/include/support/nuttx/xlocale.h
rename to libcxx/include/__support/nuttx/xlocale.h
index b70d62005046..be738e3b64e4 100644
--- a/libcxx/include/support/nuttx/xlocale.h
+++ b/libcxx/include/__support/nuttx/xlocale.h
@@ -1,5 +1,5 @@
 // -*- C++ -*-
-//===-------------------- support/nuttx/xlocale.h -------------------------===//
+//===-----------------------------------------------------------------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -11,8 +11,8 @@
 #define _LIBCPP_SUPPORT_NUTTX_XLOCALE_H
 
 #if defined(__NuttX__)
-#include <support/xlocale/__posix_l_fallback.h>
-#include <support/xlocale/__strtonum_fallback.h>
+#include <__support/xlocale/__posix_l_fallback.h>
+#include <__support/xlocale/__strtonum_fallback.h>
 #endif // __NuttX__
 
 #endif
diff --git a/libcxx/include/support/openbsd/xlocale.h b/libcxx/include/__support/openbsd/xlocale.h
similarity index 78%
rename from libcxx/include/support/openbsd/xlocale.h
rename to libcxx/include/__support/openbsd/xlocale.h
index fbfaedd127c6..1136fa327fac 100644
--- a/libcxx/include/support/openbsd/xlocale.h
+++ b/libcxx/include/__support/openbsd/xlocale.h
@@ -1,5 +1,5 @@
 // -*- C++ -*-
-//===-------------------- support/openbsd/xlocale.h -----------------------===//
+//===-----------------------------------------------------------------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -14,6 +14,6 @@
 #include <clocale>
 #include <cwctype>
 #include <ctype.h>
-#include <support/xlocale/__strtonum_fallback.h>
+#include <__support/xlocale/__strtonum_fallback.h>
 
 #endif
diff --git a/libcxx/include/support/solaris/floatingpoint.h b/libcxx/include/__support/solaris/floatingpoint.h
similarity index 100%
rename from libcxx/include/support/solaris/floatingpoint.h
rename to libcxx/include/__support/solaris/floatingpoint.h
diff --git a/libcxx/include/support/solaris/wchar.h b/libcxx/include/__support/solaris/wchar.h
similarity index 100%
rename from libcxx/include/support/solaris/wchar.h
rename to libcxx/include/__support/solaris/wchar.h
diff --git a/libcxx/include/support/solaris/xlocale.h b/libcxx/include/__support/solaris/xlocale.h
similarity index 100%
rename from libcxx/include/support/solaris/xlocale.h
rename to libcxx/include/__support/solaris/xlocale.h
diff --git a/libcxx/include/support/win32/limits_msvc_win32.h b/libcxx/include/__support/win32/limits_msvc_win32.h
similarity index 96%
rename from libcxx/include/support/win32/limits_msvc_win32.h
rename to libcxx/include/__support/win32/limits_msvc_win32.h
index 7bb835559a3b..758d24647b1b 100644
--- a/libcxx/include/support/win32/limits_msvc_win32.h
+++ b/libcxx/include/__support/win32/limits_msvc_win32.h
@@ -1,5 +1,5 @@
 // -*- C++ -*-
-//===------------------ support/win32/limits_msvc_win32.h -----------------===//
+//===-----------------------------------------------------------------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
diff --git a/libcxx/include/support/win32/locale_win32.h b/libcxx/include/__support/win32/locale_win32.h
similarity index 99%
rename from libcxx/include/support/win32/locale_win32.h
rename to libcxx/include/__support/win32/locale_win32.h
index 897c36be70c6..d32a7a8ad304 100644
--- a/libcxx/include/support/win32/locale_win32.h
+++ b/libcxx/include/__support/win32/locale_win32.h
@@ -1,5 +1,5 @@
 // -*- C++ -*-
-//===--------------------- support/win32/locale_win32.h -------------------===//
+//===-----------------------------------------------------------------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
diff --git a/libcxx/include/support/xlocale/__nop_locale_mgmt.h b/libcxx/include/__support/xlocale/__nop_locale_mgmt.h
similarity index 94%
rename from libcxx/include/support/xlocale/__nop_locale_mgmt.h
rename to libcxx/include/__support/xlocale/__nop_locale_mgmt.h
index f33d3894c3a9..57b18842ff45 100644
--- a/libcxx/include/support/xlocale/__nop_locale_mgmt.h
+++ b/libcxx/include/__support/xlocale/__nop_locale_mgmt.h
@@ -1,5 +1,5 @@
 // -*- C++ -*-
-//===------------  support/xlocale/__nop_locale_mgmt.h -----------------===//
+//===-----------------------------------------------------------------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
diff --git a/libcxx/include/support/xlocale/__posix_l_fallback.h b/libcxx/include/__support/xlocale/__posix_l_fallback.h
similarity index 98%
rename from libcxx/include/support/xlocale/__posix_l_fallback.h
rename to libcxx/include/__support/xlocale/__posix_l_fallback.h
index f3df6c46fbab..00d69d19e8c8 100644
--- a/libcxx/include/support/xlocale/__posix_l_fallback.h
+++ b/libcxx/include/__support/xlocale/__posix_l_fallback.h
@@ -1,5 +1,5 @@
 // -*- C++ -*-
-//===--------------- support/xlocale/__posix_l_fallback.h -----------------===//
+//===-----------------------------------------------------------------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
diff --git a/libcxx/include/support/xlocale/__strtonum_fallback.h b/libcxx/include/__support/xlocale/__strtonum_fallback.h
similarity index 96%
rename from libcxx/include/support/xlocale/__strtonum_fallback.h
rename to libcxx/include/__support/xlocale/__strtonum_fallback.h
index df38598056a6..1172a5d57236 100644
--- a/libcxx/include/support/xlocale/__strtonum_fallback.h
+++ b/libcxx/include/__support/xlocale/__strtonum_fallback.h
@@ -1,5 +1,5 @@
 // -*- C++ -*-
-//===-------------- support/xlocale/__strtonum_fallback.h -----------------===//
+//===-----------------------------------------------------------------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
diff --git a/libcxx/include/__threading_support b/libcxx/include/__threading_support
index 473c9c3bbe49..de572f3ff84d 100644
--- a/libcxx/include/__threading_support
+++ b/libcxx/include/__threading_support
@@ -17,7 +17,7 @@
 #include <errno.h>
 
 #ifdef __MVS__
-# include <support/ibm/nanosleep.h>
+# include <__support/ibm/nanosleep.h>
 #endif
 
 #ifndef _LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER
diff --git a/libcxx/include/bit b/libcxx/include/bit
index fe360179c5ca..f8c37c3d6bbf 100644
--- a/libcxx/include/bit
+++ b/libcxx/include/bit
@@ -62,7 +62,7 @@ namespace std {
 #include <__debug>
 
 #if defined(__IBMCPP__)
-#include "support/ibm/support.h"
+#include "__support/ibm/support.h"
 #endif
 #if defined(_LIBCPP_COMPILER_MSVC)
 #include <intrin.h>
diff --git a/libcxx/include/limits b/libcxx/include/limits
index 6d5d1e1aca75..8f97cd10a8b1 100644
--- a/libcxx/include/limits
+++ b/libcxx/include/limits
@@ -105,11 +105,11 @@ template<> class numeric_limits<cv long double>;
 #include <type_traits>
 
 #if defined(_LIBCPP_COMPILER_MSVC)
-#include "support/win32/limits_msvc_win32.h"
+#include "__support/win32/limits_msvc_win32.h"
 #endif // _LIBCPP_MSVCRT
 
 #if defined(__IBMCPP__)
-#include "support/ibm/limits.h"
+#include "__support/ibm/limits.h"
 #endif // __IBMCPP__
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
diff --git a/libcxx/src/CMakeLists.txt b/libcxx/src/CMakeLists.txt
index c482068fa99a..9965104cb5b2 100644
--- a/libcxx/src/CMakeLists.txt
+++ b/libcxx/src/CMakeLists.txt
@@ -107,7 +107,7 @@ endif()
 if (LIBCXX_CONFIGURE_IDE)
   file(GLOB_RECURSE LIBCXX_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/../include/*)
   if(WIN32)
-    file( GLOB LIBCXX_WIN32_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/../include/support/win32/*.h)
+    file( GLOB LIBCXX_WIN32_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/../include/__support/win32/*.h)
     list(APPEND LIBCXX_HEADERS ${LIBCXX_WIN32_HEADERS})
   endif()
   # Force them all into the headers dir on MSVC, otherwise they end up at
diff --git a/libcxx/src/locale.cpp b/libcxx/src/locale.cpp
index f109389f68f3..a0209d0ce8cf 100644
--- a/libcxx/src/locale.cpp
+++ b/libcxx/src/locale.cpp
@@ -29,7 +29,7 @@
 #include "cwctype"
 #include "__sso_allocator"
 #if defined(_LIBCPP_MSVCRT) || defined(__MINGW32__)
-#include "support/win32/locale_win32.h"
+#include "__support/win32/locale_win32.h"
 #elif !defined(__BIONIC__) && !defined(__NuttX__)
 #include <langinfo.h>
 #endif
diff --git a/libcxx/src/support/solaris/xlocale.cpp b/libcxx/src/support/solaris/xlocale.cpp
index d68a39f4dfe5..d25adcd21d30 100644
--- a/libcxx/src/support/solaris/xlocale.cpp
+++ b/libcxx/src/support/solaris/xlocale.cpp
@@ -8,7 +8,7 @@
 
 #ifdef __sun__
 
-#include "support/solaris/xlocale.h"
+#include "__support/solaris/xlocale.h"
 #include <stdarg.h>
 #include <stdio.h>
 #include <sys/localedef.h>
diff --git a/libcxx/src/support/win32/locale_win32.cpp b/libcxx/src/support/win32/locale_win32.cpp
index b7062db352ad..e7c6005fc1a3 100644
--- a/libcxx/src/support/win32/locale_win32.cpp
+++ b/libcxx/src/support/win32/locale_win32.cpp
@@ -1,5 +1,5 @@
 // -*- C++ -*-
-//===-------------------- support/win32/locale_win32.cpp ------------------===//
+//===----------------------------------------------------------------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
diff --git a/libcxx/src/support/win32/support.cpp b/libcxx/src/support/win32/support.cpp
index d156e02e3e84..52453f547926 100644
--- a/libcxx/src/support/win32/support.cpp
+++ b/libcxx/src/support/win32/support.cpp
@@ -1,5 +1,5 @@
 // -*- C++ -*-
-//===----------------------- support/win32/support.h ----------------------===//
+//===----------------------------------------------------------------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
diff --git a/libcxx/src/support/win32/thread_win32.cpp b/libcxx/src/support/win32/thread_win32.cpp
index 83e7e9f6ce5b..35c4c871457d 100644
--- a/libcxx/src/support/win32/thread_win32.cpp
+++ b/libcxx/src/support/win32/thread_win32.cpp
@@ -1,5 +1,5 @@
 // -*- C++ -*-
-//===-------------------- support/win32/thread_win32.cpp ------------------===//
+//===----------------------------------------------------------------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
diff --git a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn
index 644f0a767558..2ca495b08fba 100644
--- a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn
+++ b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn
@@ -217,25 +217,25 @@ copy("include") {
     "string.h",
     "string_view",
     "strstream",
-    "support/android/locale_bionic.h",
-    "support/fuchsia/xlocale.h",
-    "support/ibm/limits.h",
-    "support/ibm/locale_mgmt_aix.h",
-    "support/ibm/nanosleep.h",
-    "support/ibm/support.h",
-    "support/ibm/xlocale.h",
-    "support/musl/xlocale.h",
-    "support/newlib/xlocale.h",
-    "support/nuttx/xlocale.h",
-    "support/openbsd/xlocale.h",
-    "support/solaris/floatingpoint.h",
-    "support/solaris/wchar.h",
-    "support/solaris/xlocale.h",
-    "support/win32/limits_msvc_win32.h",
-    "support/win32/locale_win32.h",
-    "support/xlocale/__nop_locale_mgmt.h",
-    "support/xlocale/__posix_l_fallback.h",
-    "support/xlocale/__strtonum_fallback.h",
+    "__support/android/locale_bionic.h",
+    "__support/fuchsia/xlocale.h",
+    "__support/ibm/limits.h",
+    "__support/ibm/locale_mgmt_aix.h",
+    "__support/ibm/nanosleep.h",
+    "__support/ibm/support.h",
+    "__support/ibm/xlocale.h",
+    "__support/musl/xlocale.h",
+    "__support/newlib/xlocale.h",
+    "__support/nuttx/xlocale.h",
+    "__support/openbsd/xlocale.h",
+    "__support/solaris/floatingpoint.h",
+    "__support/solaris/wchar.h",
+    "__support/solaris/xlocale.h",
+    "__support/win32/limits_msvc_win32.h",
+    "__support/win32/locale_win32.h",
+    "__support/xlocale/__nop_locale_mgmt.h",
+    "__support/xlocale/__posix_l_fallback.h",
+    "__support/xlocale/__strtonum_fallback.h",
     "system_error",
     "tgmath.h",
     "thread",

From bc39d53d9a4f1ed7c903648f3fd408296fd55c95 Mon Sep 17 00:00:00 2001
From: Reid Kleckner <rnk@google.com>
Date: Mon, 1 Feb 2021 15:18:42 -0800
Subject: [PATCH 070/318] =?UTF-8?q?[=F0=9F=8D=92]Disable=20CFI=20in=20=5F?=
 =?UTF-8?q?=5Fget=5Felem=20to=20allow=20casting=20a=20pointer=20to=20unini?=
 =?UTF-8?q?tialized=20memory?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fixes usage of shared_ptr with CFI enabled, which is llvm.org/pr48993.

(cherry pick of commit bab74864168bb5e28ecbc0294fe1095d8da7f569)

Differential Revision: https://reviews.llvm.org/D96063
---
 libcxx/include/memory | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libcxx/include/memory b/libcxx/include/memory
index a00916c8c03f..39d0f5bee6a5 100644
--- a/libcxx/include/memory
+++ b/libcxx/include/memory
@@ -2647,7 +2647,7 @@ private:
             _Alloc *__alloc = reinterpret_cast<_Alloc*>(__first);
             return __alloc;
         }
-        _Tp* __get_elem() _NOEXCEPT {
+        _LIBCPP_NO_CFI _Tp* __get_elem() _NOEXCEPT {
             _CompressedPair *__as_pair = reinterpret_cast<_CompressedPair*>(__blob_);
             typename _CompressedPair::_Base2* __second = _CompressedPair::__get_second_base(__as_pair);
             _Tp *__elem = reinterpret_cast<_Tp*>(__second);

From 251f3295b498b699aa2b926167a788a6b6dbc033 Mon Sep 17 00:00:00 2001
From: Louis Dionne <ldionne.2@gmail.com>
Date: Wed, 3 Feb 2021 17:00:20 -0500
Subject: [PATCH 071/318] =?UTF-8?q?[=F0=9F=8D=92][libc++]=20Fix=20libcxx?=
 =?UTF-8?q?=20build=20on=2032bit=20architectures=20with=2064bit=20time=5Ft?=
 =?UTF-8?q?=20defaults=20e.g.=20riscv32?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Patch by Khem Raj.

(cherry pick of commit 85b9c5ccc172a1e61c7ecaaec4752587cb6f1e26)

Differential Revision: https://reviews.llvm.org/D96062
---
 libcxx/src/atomic.cpp | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/libcxx/src/atomic.cpp b/libcxx/src/atomic.cpp
index 6b73ed771cd1..9ae1fb5199bf 100644
--- a/libcxx/src/atomic.cpp
+++ b/libcxx/src/atomic.cpp
@@ -19,6 +19,12 @@
 #include <linux/futex.h>
 #include <sys/syscall.h>
 
+// libc++ uses SYS_futex as a universal syscall name. However, on 32 bit architectures
+// with a 64 bit time_t, we need to specify SYS_futex_time64.
+#if !defined(SYS_futex) && defined(SYS_futex_time64)
+# define SYS_futex SYS_futex_time64
+#endif
+
 #else // <- Add other operating systems here
 
 // Baseline needs no new headers

From d7d818c3615e4ff6bb283df0c1ddbb2b2cd50075 Mon Sep 17 00:00:00 2001
From: Walter Erquinigo <a20012251@gmail.com>
Date: Wed, 27 Jan 2021 13:02:45 -0800
Subject: [PATCH 072/318] Fix runInTerminal failures on Windows

stella.stemenova mentioned in https://reviews.llvm.org/D93951 failures on Windows for this test.

I'm fixing the macro definitions and disabling the tests for python
versions lower than 3.7. I'll figure out that actual issue with
python3.6 after the buildbots are fine again.

(cherry picked from commit ab5591e1d8f5abcfa9e75193d3e8a29087b61425)
---
 .../runInTerminal/TestVSCode_runInTerminal.py | 34 +++++++++++++++----
 lldb/tools/lldb-vscode/FifoFiles.cpp          | 10 +++---
 lldb/tools/lldb-vscode/FifoFiles.h            |  1 +
 lldb/tools/lldb-vscode/lldb-vscode.cpp        |  4 +--
 4 files changed, 36 insertions(+), 13 deletions(-)

diff --git a/lldb/test/API/tools/lldb-vscode/runInTerminal/TestVSCode_runInTerminal.py b/lldb/test/API/tools/lldb-vscode/runInTerminal/TestVSCode_runInTerminal.py
index 055b5a5bed87..047cc317596f 100644
--- a/lldb/test/API/tools/lldb-vscode/runInTerminal/TestVSCode_runInTerminal.py
+++ b/lldb/test/API/tools/lldb-vscode/runInTerminal/TestVSCode_runInTerminal.py
@@ -33,20 +33,30 @@ def readErrorMessage(self, fifo_file):
         with open(fifo_file, "r") as file:
             return file.readline()
 
+    def isTestSupported(self):
+        # For some strange reason, this test fails on python3.6
+        if not (sys.version_info.major == 3 and sys.version_info.minor >= 7):
+            return False
+        try:
+            # We skip this test for debug builds because it takes too long parsing lldb's own
+            # debug info. Release builds are fine.
+            # Checking the size of the lldb-vscode binary seems to be a decent proxy for a quick
+            # detection. It should be far less than 1 MB in Release builds.
+            if os.path.getsize(os.environ["LLDBVSCODE_EXEC"]) < 1000000:
+                return True
+        except:
+            return False
+
     @skipIfWindows
     @skipIfRemote
     @skipIf(archs=no_match(['x86_64']))
     def test_runInTerminal(self):
+        if not self.isTestSupported():
+            return
         '''
             Tests the "runInTerminal" reverse request. It makes sure that the IDE can
             launch the inferior with the correct environment variables and arguments.
         '''
-        if "debug" in str(os.environ["LLDBVSCODE_EXEC"]).lower():
-            # We skip this test for debug builds because it takes too long parsing lldb's own
-            # debug info. Release builds are fine.
-            # Checking this environment variable seems to be a decent proxy for a quick
-            # detection
-            return
         program = self.getBuildArtifact("a.out")
         source = 'main.c'
         self.build_and_launch(
@@ -77,6 +87,8 @@ def test_runInTerminal(self):
     @skipIfRemote
     @skipIf(archs=no_match(['x86_64']))
     def test_runInTerminalInvalidTarget(self):
+        if not self.isTestSupported():
+            return
         self.build_and_create_debug_adaptor()
         response = self.launch(
             "INVALIDPROGRAM", stopOnEntry=True, runInTerminal=True, args=["foobar"], env=["FOO=bar"], expectFailure=True)
@@ -88,6 +100,8 @@ def test_runInTerminalInvalidTarget(self):
     @skipIfRemote
     @skipIf(archs=no_match(['x86_64']))
     def test_missingArgInRunInTerminalLauncher(self):
+        if not self.isTestSupported():
+            return
         proc = subprocess.run([self.lldbVSCodeExec,  "--launch-target", "INVALIDPROGRAM"],
             capture_output=True, universal_newlines=True)
         self.assertTrue(proc.returncode != 0)
@@ -97,6 +111,8 @@ def test_missingArgInRunInTerminalLauncher(self):
     @skipIfRemote
     @skipIf(archs=no_match(['x86_64']))
     def test_FakeAttachedRunInTerminalLauncherWithInvalidProgram(self):
+        if not self.isTestSupported():
+            return
         comm_file = os.path.join(self.getBuildDir(), "comm-file")
         os.mkfifo(comm_file)
 
@@ -115,6 +131,8 @@ def test_FakeAttachedRunInTerminalLauncherWithInvalidProgram(self):
     @skipIfRemote
     @skipIf(archs=no_match(['x86_64']))
     def test_FakeAttachedRunInTerminalLauncherWithValidProgram(self):
+        if not self.isTestSupported():
+            return
         comm_file = os.path.join(self.getBuildDir(), "comm-file")
         os.mkfifo(comm_file)
 
@@ -132,6 +150,8 @@ def test_FakeAttachedRunInTerminalLauncherWithValidProgram(self):
     @skipIfRemote
     @skipIf(archs=no_match(['x86_64']))
     def test_FakeAttachedRunInTerminalLauncherAndCheckEnvironment(self):
+        if not self.isTestSupported():
+            return
         comm_file = os.path.join(self.getBuildDir(), "comm-file")
         os.mkfifo(comm_file)
 
@@ -150,6 +170,8 @@ def test_FakeAttachedRunInTerminalLauncherAndCheckEnvironment(self):
     @skipIfRemote
     @skipIf(archs=no_match(['x86_64']))
     def test_NonAttachedRunInTerminalLauncher(self):
+        if not self.isTestSupported():
+            return
         comm_file = os.path.join(self.getBuildDir(), "comm-file")
         os.mkfifo(comm_file)
 
diff --git a/lldb/tools/lldb-vscode/FifoFiles.cpp b/lldb/tools/lldb-vscode/FifoFiles.cpp
index b69970ec0168..0a36c87d4a94 100644
--- a/lldb/tools/lldb-vscode/FifoFiles.cpp
+++ b/lldb/tools/lldb-vscode/FifoFiles.cpp
@@ -6,7 +6,9 @@
 //
 //===----------------------------------------------------------------------===//
 
-#if !defined(WIN32)
+#include "FifoFiles.h"
+
+#if LLVM_ON_UNIX
 #include <sys/stat.h>
 #include <sys/types.h>
 #include <unistd.h>
@@ -21,8 +23,6 @@
 
 #include "lldb/lldb-defines.h"
 
-#include "FifoFiles.h"
-
 using namespace llvm;
 
 namespace lldb_vscode {
@@ -30,13 +30,13 @@ namespace lldb_vscode {
 FifoFile::FifoFile(StringRef path) : m_path(path) {}
 
 FifoFile::~FifoFile() {
-#if !defined(WIN32)
+#if LLVM_ON_UNIX
   unlink(m_path.c_str());
 #endif
 };
 
 Expected<std::shared_ptr<FifoFile>> CreateFifoFile(StringRef path) {
-#if defined(WIN32)
+#if !LLVM_ON_UNIX
   return createStringError(inconvertibleErrorCode(), "Unimplemented");
 #else
   if (int err = mkfifo(path.data(), 0600))
diff --git a/lldb/tools/lldb-vscode/FifoFiles.h b/lldb/tools/lldb-vscode/FifoFiles.h
index 891b6f574601..f186f65e86c4 100644
--- a/lldb/tools/lldb-vscode/FifoFiles.h
+++ b/lldb/tools/lldb-vscode/FifoFiles.h
@@ -9,6 +9,7 @@
 #ifndef LLDB_TOOLS_LLDB_VSCODE_FIFOFILES_H
 #define LLDB_TOOLS_LLDB_VSCODE_FIFOFILES_H
 
+#include "llvm/Config/llvm-config.h" // for LLVM_ON_UNIX
 #include "llvm/Support/Error.h"
 
 #include "JSONUtils.h"
diff --git a/lldb/tools/lldb-vscode/lldb-vscode.cpp b/lldb/tools/lldb-vscode/lldb-vscode.cpp
index c581b9b4a9a0..69eb2e70aa6d 100644
--- a/lldb/tools/lldb-vscode/lldb-vscode.cpp
+++ b/lldb/tools/lldb-vscode/lldb-vscode.cpp
@@ -3002,8 +3002,8 @@ static void printHelp(LLDBVSCodeOptTable &table, llvm::StringRef tool_name) {
 // emitted to the debug adaptor.
 void LaunchRunInTerminalTarget(llvm::opt::Arg &target_arg,
                                llvm::StringRef comm_file, char *argv[]) {
-#if defined(WIN_32)
-  llvm::errs() << "runInTerminal is not supported on Windows\n";
+#if !LLVM_ON_UNIX
+  llvm::errs() << "runInTerminal is only supported on POSIX systems\n";
   exit(EXIT_FAILURE);
 #else
   RunInTerminalLauncherCommChannel comm_channel(comm_file);

From 27aff2aa2ade9d78d0081445eadacd5b5006143e Mon Sep 17 00:00:00 2001
From: Walter Erquinigo <a20012251@gmail.com>
Date: Thu, 28 Jan 2021 09:24:30 -0800
Subject: [PATCH 073/318] Fix lldb-vscode builds on Windows targeting POSIX

@stella.stamenova found out that lldb-vscode's Win32 macros were failing
when building on windows targetings POSIX platforms.

I'm changing these macros for LLVM_ON_UNIX, which should be more
accurate.

(cherry picked from commit 0bca9a7ce2eeaa9f1d732ffbc17769560a2b236e)
---
 lldb/tools/lldb-vscode/IOStream.cpp      |  6 +++---
 lldb/tools/lldb-vscode/IOStream.h        |  4 +++-
 lldb/tools/lldb-vscode/RunInTerminal.cpp |  6 +++---
 lldb/tools/lldb-vscode/VSCode.cpp        |  4 ++--
 lldb/tools/lldb-vscode/VSCode.h          |  2 ++
 lldb/tools/lldb-vscode/lldb-vscode.cpp   | 11 ++++++-----
 6 files changed, 19 insertions(+), 14 deletions(-)

diff --git a/lldb/tools/lldb-vscode/IOStream.cpp b/lldb/tools/lldb-vscode/IOStream.cpp
index 4b11b90b4c2e..fdbfb554aedb 100644
--- a/lldb/tools/lldb-vscode/IOStream.cpp
+++ b/lldb/tools/lldb-vscode/IOStream.cpp
@@ -8,7 +8,7 @@
 
 #include "IOStream.h"
 
-#if defined(_WIN32)
+#if !LLVM_ON_UNIX
 #include <io.h>
 #else
 #include <netinet/in.h>
@@ -33,7 +33,7 @@ StreamDescriptor::~StreamDescriptor() {
     return;
 
   if (m_is_socket)
-#if defined(_WIN32)
+#if !LLVM_ON_UNIX
     ::closesocket(m_socket);
 #else
     ::close(m_socket);
@@ -108,7 +108,7 @@ bool InputStream::read_full(std::ofstream *log, size_t length,
     }
     if (bytes_read < 0) {
       int reason = 0;
-#if defined(_WIN32)
+#if !LLVM_ON_UNIX
       if (descriptor.m_is_socket)
         reason = WSAGetLastError();
       else
diff --git a/lldb/tools/lldb-vscode/IOStream.h b/lldb/tools/lldb-vscode/IOStream.h
index 603ae9adcc2a..1ec7ac3ed0f9 100644
--- a/lldb/tools/lldb-vscode/IOStream.h
+++ b/lldb/tools/lldb-vscode/IOStream.h
@@ -9,7 +9,9 @@
 #ifndef LLDB_TOOLS_LLDB_VSCODE_IOSTREAM_H
 #define LLDB_TOOLS_LLDB_VSCODE_IOSTREAM_H
 
-#if defined(_WIN32)
+#include "llvm/Config/llvm-config.h" // for LLVM_ON_UNIX
+
+#if !LLVM_ON_UNIX
 // We need to #define NOMINMAX in order to skip `min()` and `max()` macro
 // definitions that conflict with other system headers.
 // We also need to #undef GetObject (which is defined to GetObjectW) because
diff --git a/lldb/tools/lldb-vscode/RunInTerminal.cpp b/lldb/tools/lldb-vscode/RunInTerminal.cpp
index 4db2806924ca..29edf5ca381d 100644
--- a/lldb/tools/lldb-vscode/RunInTerminal.cpp
+++ b/lldb/tools/lldb-vscode/RunInTerminal.cpp
@@ -6,7 +6,9 @@
 //
 //===----------------------------------------------------------------------===//
 
-#if !defined(WIN32)
+#include "RunInTerminal.h"
+
+#if LLVM_ON_UNIX
 #include <sys/stat.h>
 #include <sys/types.h>
 #include <unistd.h>
@@ -21,8 +23,6 @@
 
 #include "lldb/lldb-defines.h"
 
-#include "RunInTerminal.h"
-
 using namespace llvm;
 
 namespace lldb_vscode {
diff --git a/lldb/tools/lldb-vscode/VSCode.cpp b/lldb/tools/lldb-vscode/VSCode.cpp
index e9fdc17f4147..4d0e281c1b8d 100644
--- a/lldb/tools/lldb-vscode/VSCode.cpp
+++ b/lldb/tools/lldb-vscode/VSCode.cpp
@@ -14,7 +14,7 @@
 #include "VSCode.h"
 #include "llvm/Support/FormatVariadic.h"
 
-#if defined(_WIN32)
+#if !LLVM_ON_UNIX
 #define NOMINMAX
 #include <fcntl.h>
 #include <io.h>
@@ -41,7 +41,7 @@ VSCode::VSCode()
       stop_at_entry(false), is_attach(false),
       reverse_request_seq(0), waiting_for_run_in_terminal(false) {
   const char *log_file_path = getenv("LLDBVSCODE_LOG");
-#if defined(_WIN32)
+#if !LLVM_ON_UNIX
   // Windows opens stdout and stdin in text mode which converts \n to 13,10
   // while the value is just 10 on Darwin/Linux. Setting the file mode to binary
   // fixes this.
diff --git a/lldb/tools/lldb-vscode/VSCode.h b/lldb/tools/lldb-vscode/VSCode.h
index 8e7dfc078934..a2e1cac8ecf9 100644
--- a/lldb/tools/lldb-vscode/VSCode.h
+++ b/lldb/tools/lldb-vscode/VSCode.h
@@ -9,6 +9,8 @@
 #ifndef LLDB_TOOLS_LLDB_VSCODE_VSCODE_H
 #define LLDB_TOOLS_LLDB_VSCODE_VSCODE_H
 
+#include "llvm/Config/llvm-config.h" // for LLVM_ON_UNIX
+
 #include <condition_variable>
 #include <iosfwd>
 #include <map>
diff --git a/lldb/tools/lldb-vscode/lldb-vscode.cpp b/lldb/tools/lldb-vscode/lldb-vscode.cpp
index 69eb2e70aa6d..b7f39cbb1cb5 100644
--- a/lldb/tools/lldb-vscode/lldb-vscode.cpp
+++ b/lldb/tools/lldb-vscode/lldb-vscode.cpp
@@ -6,6 +6,8 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "VSCode.h"
+
 #include <assert.h>
 #include <limits.h>
 #include <stdarg.h>
@@ -14,7 +16,7 @@
 #include <string.h>
 #include <sys/stat.h>
 #include <sys/types.h>
-#if defined(_WIN32)
+#if !LLVM_ON_UNIX
 // We need to #define NOMINMAX in order to skip `min()` and `max()` macro
 // definitions that conflict with other system headers.
 // We also need to #undef GetObject (which is defined to GetObjectW) because
@@ -52,9 +54,8 @@
 
 #include "JSONUtils.h"
 #include "LLDBUtils.h"
-#include "VSCode.h"
 
-#if defined(_WIN32)
+#if !LLVM_ON_UNIX
 #ifndef PATH_MAX
 #define PATH_MAX MAX_PATH
 #endif
@@ -131,7 +132,7 @@ SOCKET AcceptConnection(int portno) {
           *g_vsc.log << "error: accept (" << strerror(errno) << ")"
                      << std::endl;
     }
-#if defined(_WIN32)
+#if !LLVM_ON_UNIX
     closesocket(sockfd);
 #else
     close(sockfd);
@@ -3084,7 +3085,7 @@ int main(int argc, char *argv[]) {
     }
   }
 
-#if !defined(_WIN32)
+#if LLVM_ON_UNIX
   if (input_args.hasArg(OPT_wait_for_debugger)) {
     printf("Paused waiting for debugger to attach (pid = %i)...\n", getpid());
     pause();

From 1cb6551edb94eea1fc087b346b1e8d13775dc692 Mon Sep 17 00:00:00 2001
From: Walter Erquinigo <a20012251@gmail.com>
Date: Thu, 4 Feb 2021 10:07:07 -0800
Subject: [PATCH 074/318] [lldb-vscode] correctly use Windows macros

@mstorsjo found a mistake that I made when trying to fix some Windows
compilation errors encountered by @stella.stamenova.

I was incorrectly using the LLVM_ON_UNIX macro. In any case, proper use
of

  #if defined(_WIN32)

should be the actual fix.

Differential Revision: https://reviews.llvm.org/D96060

(cherry picked from commit 36496cc2992d6fa26e6024971efcfc7d15f69888)
---
 lldb/tools/lldb-vscode/FifoFiles.cpp     |  6 +++---
 lldb/tools/lldb-vscode/IOStream.cpp      |  6 +++---
 lldb/tools/lldb-vscode/IOStream.h        |  2 +-
 lldb/tools/lldb-vscode/RunInTerminal.cpp |  2 +-
 lldb/tools/lldb-vscode/VSCode.cpp        |  4 ++--
 lldb/tools/lldb-vscode/lldb-vscode.cpp   | 10 +++++-----
 6 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/lldb/tools/lldb-vscode/FifoFiles.cpp b/lldb/tools/lldb-vscode/FifoFiles.cpp
index 0a36c87d4a94..4b14fb16f96c 100644
--- a/lldb/tools/lldb-vscode/FifoFiles.cpp
+++ b/lldb/tools/lldb-vscode/FifoFiles.cpp
@@ -8,7 +8,7 @@
 
 #include "FifoFiles.h"
 
-#if LLVM_ON_UNIX
+#if !defined(_WIN32)
 #include <sys/stat.h>
 #include <sys/types.h>
 #include <unistd.h>
@@ -30,13 +30,13 @@ namespace lldb_vscode {
 FifoFile::FifoFile(StringRef path) : m_path(path) {}
 
 FifoFile::~FifoFile() {
-#if LLVM_ON_UNIX
+#if !defined(_WIN32)
   unlink(m_path.c_str());
 #endif
 };
 
 Expected<std::shared_ptr<FifoFile>> CreateFifoFile(StringRef path) {
-#if !LLVM_ON_UNIX
+#if defined(_WIN32)
   return createStringError(inconvertibleErrorCode(), "Unimplemented");
 #else
   if (int err = mkfifo(path.data(), 0600))
diff --git a/lldb/tools/lldb-vscode/IOStream.cpp b/lldb/tools/lldb-vscode/IOStream.cpp
index fdbfb554aedb..cd22d906c14c 100644
--- a/lldb/tools/lldb-vscode/IOStream.cpp
+++ b/lldb/tools/lldb-vscode/IOStream.cpp
@@ -8,7 +8,7 @@
 
 #include "IOStream.h"
 
-#if !LLVM_ON_UNIX
+#if defined(_WIN32) 
 #include <io.h>
 #else
 #include <netinet/in.h>
@@ -33,7 +33,7 @@ StreamDescriptor::~StreamDescriptor() {
     return;
 
   if (m_is_socket)
-#if !LLVM_ON_UNIX
+#if defined(_WIN32)
     ::closesocket(m_socket);
 #else
     ::close(m_socket);
@@ -108,7 +108,7 @@ bool InputStream::read_full(std::ofstream *log, size_t length,
     }
     if (bytes_read < 0) {
       int reason = 0;
-#if !LLVM_ON_UNIX
+#if defined(_WIN32)
       if (descriptor.m_is_socket)
         reason = WSAGetLastError();
       else
diff --git a/lldb/tools/lldb-vscode/IOStream.h b/lldb/tools/lldb-vscode/IOStream.h
index 1ec7ac3ed0f9..0eb9b6fefb0d 100644
--- a/lldb/tools/lldb-vscode/IOStream.h
+++ b/lldb/tools/lldb-vscode/IOStream.h
@@ -11,7 +11,7 @@
 
 #include "llvm/Config/llvm-config.h" // for LLVM_ON_UNIX
 
-#if !LLVM_ON_UNIX
+#if defined(_WIN32)
 // We need to #define NOMINMAX in order to skip `min()` and `max()` macro
 // definitions that conflict with other system headers.
 // We also need to #undef GetObject (which is defined to GetObjectW) because
diff --git a/lldb/tools/lldb-vscode/RunInTerminal.cpp b/lldb/tools/lldb-vscode/RunInTerminal.cpp
index 29edf5ca381d..2126563d9e96 100644
--- a/lldb/tools/lldb-vscode/RunInTerminal.cpp
+++ b/lldb/tools/lldb-vscode/RunInTerminal.cpp
@@ -8,7 +8,7 @@
 
 #include "RunInTerminal.h"
 
-#if LLVM_ON_UNIX
+#if !defined(_WIN32)
 #include <sys/stat.h>
 #include <sys/types.h>
 #include <unistd.h>
diff --git a/lldb/tools/lldb-vscode/VSCode.cpp b/lldb/tools/lldb-vscode/VSCode.cpp
index 4d0e281c1b8d..e9fdc17f4147 100644
--- a/lldb/tools/lldb-vscode/VSCode.cpp
+++ b/lldb/tools/lldb-vscode/VSCode.cpp
@@ -14,7 +14,7 @@
 #include "VSCode.h"
 #include "llvm/Support/FormatVariadic.h"
 
-#if !LLVM_ON_UNIX
+#if defined(_WIN32)
 #define NOMINMAX
 #include <fcntl.h>
 #include <io.h>
@@ -41,7 +41,7 @@ VSCode::VSCode()
       stop_at_entry(false), is_attach(false),
       reverse_request_seq(0), waiting_for_run_in_terminal(false) {
   const char *log_file_path = getenv("LLDBVSCODE_LOG");
-#if !LLVM_ON_UNIX
+#if defined(_WIN32)
   // Windows opens stdout and stdin in text mode which converts \n to 13,10
   // while the value is just 10 on Darwin/Linux. Setting the file mode to binary
   // fixes this.
diff --git a/lldb/tools/lldb-vscode/lldb-vscode.cpp b/lldb/tools/lldb-vscode/lldb-vscode.cpp
index b7f39cbb1cb5..9469690cd7db 100644
--- a/lldb/tools/lldb-vscode/lldb-vscode.cpp
+++ b/lldb/tools/lldb-vscode/lldb-vscode.cpp
@@ -16,7 +16,7 @@
 #include <string.h>
 #include <sys/stat.h>
 #include <sys/types.h>
-#if !LLVM_ON_UNIX
+#if defined(_WIN32)
 // We need to #define NOMINMAX in order to skip `min()` and `max()` macro
 // definitions that conflict with other system headers.
 // We also need to #undef GetObject (which is defined to GetObjectW) because
@@ -55,7 +55,7 @@
 #include "JSONUtils.h"
 #include "LLDBUtils.h"
 
-#if !LLVM_ON_UNIX
+#if defined(_WIN32)
 #ifndef PATH_MAX
 #define PATH_MAX MAX_PATH
 #endif
@@ -132,7 +132,7 @@ SOCKET AcceptConnection(int portno) {
           *g_vsc.log << "error: accept (" << strerror(errno) << ")"
                      << std::endl;
     }
-#if !LLVM_ON_UNIX
+#if defined(_WIN32)
     closesocket(sockfd);
 #else
     close(sockfd);
@@ -3003,7 +3003,7 @@ static void printHelp(LLDBVSCodeOptTable &table, llvm::StringRef tool_name) {
 // emitted to the debug adaptor.
 void LaunchRunInTerminalTarget(llvm::opt::Arg &target_arg,
                                llvm::StringRef comm_file, char *argv[]) {
-#if !LLVM_ON_UNIX
+#if defined(_WIN32)
   llvm::errs() << "runInTerminal is only supported on POSIX systems\n";
   exit(EXIT_FAILURE);
 #else
@@ -3085,7 +3085,7 @@ int main(int argc, char *argv[]) {
     }
   }
 
-#if LLVM_ON_UNIX
+#if !defined(_WIN32)
   if (input_args.hasArg(OPT_wait_for_debugger)) {
     printf("Paused waiting for debugger to attach (pid = %i)...\n", getpid());
     pause();

From c9fb4a947e32abfaa73b0b91a58ef71c73316322 Mon Sep 17 00:00:00 2001
From: Zequan Wu <zequanwu@google.com>
Date: Thu, 4 Feb 2021 17:00:09 -0800
Subject: [PATCH 075/318] [AST] Update LVal before evaluating lambda decl
 fields.

Differential Revision: https://reviews.llvm.org/D96092

(cherry picked from commit 96fb49c3ff8e08680127ddd4ec45a0e6c199243b)
---
 clang/lib/AST/ExprConstant.cpp                   |  8 +++++++-
 clang/test/SemaCXX/constant-expression-cxx2a.cpp | 10 ++++++++++
 2 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index 56181bbe1166..1c4caa2c1fc0 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -10009,6 +10009,7 @@ bool RecordExprEvaluator::VisitLambdaExpr(const LambdaExpr *E) {
   auto *CaptureInitIt = E->capture_init_begin();
   const LambdaCapture *CaptureIt = ClosureClass->captures_begin();
   bool Success = true;
+  const ASTRecordLayout &Layout = Info.Ctx.getASTRecordLayout(ClosureClass);
   for (const auto *Field : ClosureClass->fields()) {
     assert(CaptureInitIt != E->capture_init_end());
     // Get the initializer for this field
@@ -10019,8 +10020,13 @@ bool RecordExprEvaluator::VisitLambdaExpr(const LambdaExpr *E) {
     if (!CurFieldInit)
       return Error(E);
 
+    LValue Subobject = This;
+
+    if (!HandleLValueMember(Info, E, Subobject, Field, &Layout))
+      return false;
+
     APValue &FieldVal = Result.getStructField(Field->getFieldIndex());
-    if (!EvaluateInPlace(FieldVal, Info, This, CurFieldInit)) {
+    if (!EvaluateInPlace(FieldVal, Info, Subobject, CurFieldInit)) {
       if (!Info.keepEvaluatingAfterFailure())
         return false;
       Success = false;
diff --git a/clang/test/SemaCXX/constant-expression-cxx2a.cpp b/clang/test/SemaCXX/constant-expression-cxx2a.cpp
index 4adadc9988ab..86020a09db44 100644
--- a/clang/test/SemaCXX/constant-expression-cxx2a.cpp
+++ b/clang/test/SemaCXX/constant-expression-cxx2a.cpp
@@ -1437,3 +1437,13 @@ constexpr bool destroy_at_test() {
   return true;
 }
 static_assert(destroy_at_test());
+
+namespace PR48582 {
+  struct S {
+    void *p = this;
+    constexpr S() {}
+    constexpr S(const S&) {}
+  };
+  constexpr bool b = [a = S(), b = S()] { return a.p == b.p; }();
+  static_assert(!b);
+}

From 8153dee37272a73b1ed74ac1bc12422fac8ef033 Mon Sep 17 00:00:00 2001
From: Richard Smith <richard@metafoo.co.uk>
Date: Mon, 8 Feb 2021 17:58:05 -0800
Subject: [PATCH 076/318] PR48606: The lifetime of a constexpr heap allocation
 always started during the same evaluation.

It looks like the only case for which this matters is determining
whether mutable subobjects of a heap allocation can be modified during
constant evaluation.

(cherry picked from commit 21e8bb83253e1a2f4b6fad9b53cafe8c530a38e2)
---
 clang/lib/AST/ExprConstant.cpp                |  4 +--
 .../test/SemaCXX/cxx2a-constexpr-dynalloc.cpp | 34 +++++++++++++++++++
 2 files changed, 36 insertions(+), 2 deletions(-)

diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index 1c4caa2c1fc0..cd2b5141ebe8 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -3497,8 +3497,8 @@ static bool diagnoseMutableFields(EvalInfo &Info, const Expr *E, AccessKinds AK,
 static bool lifetimeStartedInEvaluation(EvalInfo &Info,
                                         APValue::LValueBase Base,
                                         bool MutableSubobject = false) {
-  // A temporary we created.
-  if (Base.getCallIndex())
+  // A temporary or transient heap allocation we created.
+  if (Base.getCallIndex() || Base.is<DynamicAllocLValue>())
     return true;
 
   switch (Info.IsEvaluatingDecl) {
diff --git a/clang/test/SemaCXX/cxx2a-constexpr-dynalloc.cpp b/clang/test/SemaCXX/cxx2a-constexpr-dynalloc.cpp
index 3647526ff0af..097ca00640e9 100644
--- a/clang/test/SemaCXX/cxx2a-constexpr-dynalloc.cpp
+++ b/clang/test/SemaCXX/cxx2a-constexpr-dynalloc.cpp
@@ -176,3 +176,37 @@ constexpr bool construct_after_lifetime_2() {
   return true;
 }
 static_assert(construct_after_lifetime_2()); // expected-error {{}} expected-note {{in call}}
+
+namespace PR48606 {
+  struct A { mutable int n = 0; };
+
+  constexpr bool f() {
+    A a;
+    A *p = &a;
+    p->~A();
+    std::construct_at<A>(p);
+    return true;
+  }
+  static_assert(f());
+
+  constexpr bool g() {
+    A *p = new A;
+    p->~A();
+    std::construct_at<A>(p);
+    delete p;
+    return true;
+  }
+  static_assert(g());
+
+  constexpr bool h() {
+    std::allocator<A> alloc;
+    A *p = alloc.allocate(1);
+    std::construct_at<A>(p);
+    p->~A();
+    std::construct_at<A>(p);
+    p->~A();
+    alloc.deallocate(p);
+    return true;
+  }
+  static_assert(h());
+}

From b46924ee5afe234526220c29a497794bf65f8f7f Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Wed, 27 Jan 2021 10:14:54 +0000
Subject: [PATCH 077/318] Fix "not all control paths return a value" warning.
 NFCI.

---
 clang/lib/Basic/ProfileList.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/clang/lib/Basic/ProfileList.cpp b/clang/lib/Basic/ProfileList.cpp
index 56bc37a79301..2cb05c1c3c07 100644
--- a/clang/lib/Basic/ProfileList.cpp
+++ b/clang/lib/Basic/ProfileList.cpp
@@ -82,6 +82,7 @@ static StringRef getSectionName(CodeGenOptions::ProfileInstrKind Kind) {
   case CodeGenOptions::ProfileCSIRInstr:
     return "csllvm";
   }
+  llvm_unreachable("Unhandled CodeGenOptions::ProfileInstrKind enum");
 }
 
 llvm::Optional<bool>

From 8d20c14a8a3dd0f83d4066f957ba4c006d29942b Mon Sep 17 00:00:00 2001
From: Nathan James <n.james93@hotmail.co.uk>
Date: Fri, 12 Feb 2021 16:55:44 +0000
Subject: [PATCH 078/318] [clangd] Fix clang tidy provider when multiple config
 files exist in directory tree

Currently Clang tidy provider searches from the root directory up to the target directory, this is the opposite of how clang-tidy searches for config files.
The result of this is .clang-tidy files are ignored in any subdirectory of a directory containing a .clang-tidy file.

Reviewed By: sammccall

Differential Revision: https://reviews.llvm.org/D96204

(cherry picked from commit ba3ea9c60f0f259f0ccc47e47daf8253a5885531)
---
 clang-tools-extra/clangd/TidyProvider.cpp     |  2 +-
 .../clangd/unittests/CMakeLists.txt           |  1 +
 .../clangd/unittests/TidyProviderTests.cpp    | 60 +++++++++++++++++++
 3 files changed, 62 insertions(+), 1 deletion(-)
 create mode 100644 clang-tools-extra/clangd/unittests/TidyProviderTests.cpp

diff --git a/clang-tools-extra/clangd/TidyProvider.cpp b/clang-tools-extra/clangd/TidyProvider.cpp
index c26c59fd347d..bcf1cd5a6183 100644
--- a/clang-tools-extra/clangd/TidyProvider.cpp
+++ b/clang-tools-extra/clangd/TidyProvider.cpp
@@ -106,7 +106,7 @@ class DotClangTidyTree {
     llvm::SmallVector<DotClangTidyCache *> Caches;
     {
       std::lock_guard<std::mutex> Lock(Mu);
-      for (auto I = path::begin(Parent), E = path::end(Parent); I != E; ++I) {
+      for (auto I = path::rbegin(Parent), E = path::rend(Parent); I != E; ++I) {
         assert(I->end() >= Parent.begin() && I->end() <= Parent.end() &&
                "Canonical path components should be substrings");
         llvm::StringRef Ancestor(Parent.begin(), I->end() - Parent.begin());
diff --git a/clang-tools-extra/clangd/unittests/CMakeLists.txt b/clang-tools-extra/clangd/unittests/CMakeLists.txt
index adf4ac827cce..f4d364720eaf 100644
--- a/clang-tools-extra/clangd/unittests/CMakeLists.txt
+++ b/clang-tools-extra/clangd/unittests/CMakeLists.txt
@@ -93,6 +93,7 @@ add_unittest(ClangdUnitTests ClangdTests
   TestIndex.cpp
   TestTU.cpp
   TestWorkspace.cpp
+  TidyProviderTests.cpp
   TypeHierarchyTests.cpp
   URITests.cpp
   XRefsTests.cpp
diff --git a/clang-tools-extra/clangd/unittests/TidyProviderTests.cpp b/clang-tools-extra/clangd/unittests/TidyProviderTests.cpp
new file mode 100644
index 000000000000..a16c87456a1a
--- /dev/null
+++ b/clang-tools-extra/clangd/unittests/TidyProviderTests.cpp
@@ -0,0 +1,60 @@
+//===-- TidyProviderTests.cpp - Clang tidy configuration provider tests ---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "TestFS.h"
+#include "TidyProvider.h"
+#include "gtest/gtest.h"
+
+namespace clang {
+namespace clangd {
+
+namespace {
+
+TEST(TidyProvider, NestedDirectories) {
+  MockFS FS;
+  FS.Files[testPath(".clang-tidy")] = R"yaml(
+  Checks: 'llvm-*'
+  CheckOptions:
+    - key: TestKey
+      value: 1
+)yaml";
+  FS.Files[testPath("sub1/.clang-tidy")] = R"yaml(
+  Checks: 'misc-*'
+  CheckOptions:
+    - key: TestKey
+      value: 2
+)yaml";
+  FS.Files[testPath("sub1/sub2/.clang-tidy")] = R"yaml(
+  Checks: 'bugprone-*'
+  CheckOptions:
+    - key: TestKey
+      value: 3
+  InheritParentConfig: true
+)yaml";
+
+  TidyProvider Provider = provideClangTidyFiles(FS);
+
+  auto BaseOptions = getTidyOptionsForFile(Provider, testPath("File.cpp"));
+  ASSERT_TRUE(BaseOptions.Checks.hasValue());
+  EXPECT_EQ(*BaseOptions.Checks, "llvm-*");
+  EXPECT_EQ(BaseOptions.CheckOptions.lookup("TestKey").Value, "1");
+
+  auto Sub1Options = getTidyOptionsForFile(Provider, testPath("sub1/File.cpp"));
+  ASSERT_TRUE(Sub1Options.Checks.hasValue());
+  EXPECT_EQ(*Sub1Options.Checks, "misc-*");
+  EXPECT_EQ(Sub1Options.CheckOptions.lookup("TestKey").Value, "2");
+
+  auto Sub2Options =
+      getTidyOptionsForFile(Provider, testPath("sub1/sub2/File.cpp"));
+  ASSERT_TRUE(Sub2Options.Checks.hasValue());
+  EXPECT_EQ(*Sub2Options.Checks, "misc-*,bugprone-*");
+  EXPECT_EQ(Sub2Options.CheckOptions.lookup("TestKey").Value, "3");
+}
+} // namespace
+} // namespace clangd
+} // namespace clang

From 6604c3050948d602ef24b3d3efbf9f4410494833 Mon Sep 17 00:00:00 2001
From: Jessica Paquette <jpaquette@apple.com>
Date: Tue, 2 Feb 2021 14:21:33 -0800
Subject: [PATCH 079/318] [GlobalISel] Check if branches use the same MBB in
 matchOptBrCondByInvertingCond

If the G_BR + G_BRCOND in this combine use the same MBB, then it will infinite
loop. Don't allow that to happen.

Differential Revision: https://reviews.llvm.org/D95895

(cherry picked from commit 02d4b365bf4f8c2cb56e5612902f6c3bb4316493)
---
 .../lib/CodeGen/GlobalISel/CombinerHelper.cpp |  9 +++----
 .../GlobalISel/prelegalizercombiner-br.mir    | 24 +++++++++++++++++++
 2 files changed, 29 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index df0219fcfa64..a9353bdfb780 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -968,10 +968,11 @@ bool CombinerHelper::matchOptBrCondByInvertingCond(MachineInstr &MI) {
   if (BrCond->getOpcode() != TargetOpcode::G_BRCOND)
     return false;
 
-  // Check that the next block is the conditional branch target.
-  if (!MBB->isLayoutSuccessor(BrCond->getOperand(1).getMBB()))
-    return false;
-  return true;
+  // Check that the next block is the conditional branch target. Also make sure
+  // that it isn't the same as the G_BR's target (otherwise, this will loop.)
+  MachineBasicBlock *BrCondTarget = BrCond->getOperand(1).getMBB();
+  return BrCondTarget != MI.getOperand(0).getMBB() &&
+         MBB->isLayoutSuccessor(BrCondTarget);
 }
 
 void CombinerHelper::applyOptBrCondByInvertingCond(MachineInstr &MI) {
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-br.mir b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-br.mir
index 0631ff89ade0..0647de44c4b8 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-br.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-br.mir
@@ -29,6 +29,7 @@
     ret i32 %retval.0
   }
 
+  define void @dont_combine_same_block() { ret void }
 
 ...
 ---
@@ -87,3 +88,26 @@ body:             |
     RET_ReallyLR implicit $w0
 
 ...
+---
+name:            dont_combine_same_block
+tracksRegLiveness: true
+body:             |
+  ; CHECK-LABEL: name: dont_combine_same_block
+  ; CHECK: bb.0:
+  ; CHECK:   successors: %bb.1(0x80000000)
+  ; CHECK:   liveins: $w0, $w1
+  ; CHECK:   %cond:_(s1) = G_IMPLICIT_DEF
+  ; CHECK:   G_BRCOND %cond(s1), %bb.1
+  ; CHECK:   G_BR %bb.1
+  ; CHECK: bb.1:
+  ; CHECK:   RET_ReallyLR
+  bb.0:
+    liveins: $w0, $w1
+    %cond:_(s1) = G_IMPLICIT_DEF
+
+    ; The G_BRCOND and G_BR have the same target here. Don't change anything.
+    G_BRCOND %cond(s1), %bb.1
+    G_BR %bb.1
+  bb.1:
+    RET_ReallyLR
+...

From 04cb6b5ea8bd2b52e3d11f4cb970fd2d144eee6a Mon Sep 17 00:00:00 2001
From: Richard Smith <richard@metafoo.co.uk>
Date: Mon, 8 Feb 2021 17:32:52 -0800
Subject: [PATCH 080/318] PR48587: is_constant_evaluated() should not evaluate
 to true during a variable's destruction if it didn't do so during
 construction.

The standard doesn't give any guidance as to what to do here, but this
approach seems reasonable and conservative, and has been proposed to the
standard committee.

(cherry picked from commit c945dc4a5023d6a17d11fcda76509b94b36e34fc)
---
 clang/lib/AST/ExprConstant.cpp                | 19 +++-
 .../builtin-is-constant-evaluated.cpp         | 92 +++++++++++++++++++
 2 files changed, 106 insertions(+), 5 deletions(-)

diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index cd2b5141ebe8..1bdad771a923 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -14792,11 +14792,14 @@ bool Expr::EvaluateAsLValue(EvalResult &Result, const ASTContext &Ctx,
 
 static bool EvaluateDestruction(const ASTContext &Ctx, APValue::LValueBase Base,
                                 APValue DestroyedValue, QualType Type,
-                                SourceLocation Loc, Expr::EvalStatus &EStatus) {
-  EvalInfo Info(Ctx, EStatus, EvalInfo::EM_ConstantExpression);
+                                SourceLocation Loc, Expr::EvalStatus &EStatus,
+                                bool IsConstantDestruction) {
+  EvalInfo Info(Ctx, EStatus,
+                IsConstantDestruction ? EvalInfo::EM_ConstantExpression
+                                      : EvalInfo::EM_ConstantFold);
   Info.setEvaluatingDecl(Base, DestroyedValue,
                          EvalInfo::EvaluatingDeclKind::Dtor);
-  Info.InConstantContext = true;
+  Info.InConstantContext = IsConstantDestruction;
 
   LValue LVal;
   LVal.set(Base);
@@ -14850,7 +14853,8 @@ bool Expr::EvaluateAsConstantExpr(EvalResult &Result, const ASTContext &Ctx,
   // If this is a class template argument, it's required to have constant
   // destruction too.
   if (Kind == ConstantExprKind::ClassTemplateArgument &&
-      (!EvaluateDestruction(Ctx, Base, Result.Val, T, getBeginLoc(), Result) ||
+      (!EvaluateDestruction(Ctx, Base, Result.Val, T, getBeginLoc(), Result,
+                            true) ||
        Result.HasSideEffects)) {
     // FIXME: Prefix a note to indicate that the problem is lack of constant
     // destruction.
@@ -14916,6 +14920,10 @@ bool VarDecl::evaluateDestruction(
   Expr::EvalStatus EStatus;
   EStatus.Diag = &Notes;
 
+  // Only treat the destruction as constant destruction if we formally have
+  // constant initialization (or are usable in a constant expression).
+  bool IsConstantDestruction = hasConstantInitialization();
+
   // Make a copy of the value for the destructor to mutate, if we know it.
   // Otherwise, treat the value as default-initialized; if the destructor works
   // anyway, then the destruction is constant (and must be essentially empty).
@@ -14926,7 +14934,8 @@ bool VarDecl::evaluateDestruction(
     return false;
 
   if (!EvaluateDestruction(getASTContext(), this, std::move(DestroyedValue),
-                           getType(), getLocation(), EStatus) ||
+                           getType(), getLocation(), EStatus,
+                           IsConstantDestruction) ||
       EStatus.HasSideEffects)
     return false;
 
diff --git a/clang/test/CodeGenCXX/builtin-is-constant-evaluated.cpp b/clang/test/CodeGenCXX/builtin-is-constant-evaluated.cpp
index 967c83496ab9..d30fefe55b4f 100644
--- a/clang/test/CodeGenCXX/builtin-is-constant-evaluated.cpp
+++ b/clang/test/CodeGenCXX/builtin-is-constant-evaluated.cpp
@@ -4,6 +4,7 @@
 // RUN: FileCheck -check-prefix=CHECK-DYN -input-file=%t.ll %s
 // RUN: FileCheck -check-prefix=CHECK-ARR -input-file=%t.ll %s
 // RUN: FileCheck -check-prefix=CHECK-FOLD -input-file=%t.ll %s
+// RUN: FileCheck -check-prefix=CHECK-DTOR -input-file=%t.ll %s
 
 using size_t = decltype(sizeof(int));
 
@@ -131,3 +132,94 @@ void test_ref_to_static_var() {
   // CHECK-FOLD: store i32* @_ZZ22test_ref_to_static_varvE10i_constant, i32** %r,
   int &r = __builtin_is_constant_evaluated() ? i_constant : i_non_constant;
 }
+
+int not_constexpr;
+
+// __builtin_is_constant_evaluated() should never evaluate to true during
+// destruction if it would not have done so during construction.
+//
+// FIXME: The standard doesn't say that it should ever return true when
+// evaluating a destructor call, even for a constexpr variable. That seems
+// obviously wrong.
+struct DestructorBCE {
+  int n;
+  constexpr DestructorBCE(int n) : n(n) {}
+  constexpr ~DestructorBCE() {
+    if (!__builtin_is_constant_evaluated())
+      not_constexpr = 1;
+  }
+};
+
+// CHECK-DTOR-NOT: @_ZN13DestructorBCED{{.*}}@global_dtor_bce_1
+DestructorBCE global_dtor_bce_1(101);
+
+// CHECK-DTOR: load i32, i32* @not_constexpr
+// CHECK-DTOR: call {{.*}} @_ZN13DestructorBCEC1Ei({{.*}} @global_dtor_bce_2, i32
+// CHECK-DTOR: atexit{{.*}} @_ZN13DestructorBCED{{.*}} @global_dtor_bce_2
+// CHECK-DTOR: }
+DestructorBCE global_dtor_bce_2(not_constexpr);
+
+// CHECK-DTOR-NOT: @_ZN13DestructorBCED{{.*}}@global_dtor_bce_3
+constexpr DestructorBCE global_dtor_bce_3(103);
+
+// CHECK-DTOR-LABEL: define {{.*}} @_Z15test_dtor_bce_1v(
+void test_dtor_bce_1() {
+  // Variable is neither constant initialized (because it has automatic storage
+  // duration) nor usable in constant expressions, so BCE should not return
+  // true during destruction. It would be OK if we replaced the constructor
+  // call with a direct store, but we should emit the destructor call.
+
+  // CHECK-DTOR: call {{.*}} @_ZN13DestructorBCEC1Ei({{.*}}, i32 201)
+  DestructorBCE local(201);
+  // CHECK-DTOR: call {{.*}} @_ZN13DestructorBCED
+  // CHECK-DTOR: }
+}
+
+// CHECK-DTOR-LABEL: define {{.*}} @_Z15test_dtor_bce_2v(
+void test_dtor_bce_2() {
+  // Non-constant init => BCE is false in destructor.
+
+  // CHECK-DTOR: call {{.*}} @_ZN13DestructorBCEC1Ei({{.*}}
+  DestructorBCE local(not_constexpr);
+  // CHECK-DTOR: call {{.*}} @_ZN13DestructorBCED
+  // CHECK-DTOR: }
+}
+
+// CHECK-DTOR-LABEL: define {{.*}} @_Z15test_dtor_bce_3v(
+void test_dtor_bce_3() {
+  // Should never call dtor for a constexpr variable.
+
+  // CHECK-DTOR-NOT: call {{.*}} @_ZN13DestructorBCEC1Ei(
+  constexpr DestructorBCE local(203);
+  // CHECK-DTOR-NOT: @_ZN13DestructorBCED
+  // CHECK-DTOR: }
+}
+
+// CHECK-DTOR-LABEL: define {{.*}} @_Z22test_dtor_bce_static_1v(
+void test_dtor_bce_static_1() {
+  // Variable is constant initialized, so BCE returns true during constant
+  // destruction.
+
+  // CHECK: store i32 301
+  // CHECK-DTOR-NOT: @_ZN13DestructorBCEC1Ei({{.*}}
+  static DestructorBCE local(301);
+  // CHECK-DTOR-NOT: @_ZN13DestructorBCED
+  // CHECK-DTOR: }
+}
+
+// CHECK-DTOR-LABEL: define {{.*}} @_Z22test_dtor_bce_static_2v(
+void test_dtor_bce_static_2() {
+  // CHECK-DTOR: call {{.*}} @_ZN13DestructorBCEC1Ei({{.*}}
+  static DestructorBCE local(not_constexpr);
+  // CHECK-DTOR: call {{.*}}atexit{{.*}} @_ZN13DestructorBCED
+  // CHECK-DTOR: }
+}
+
+// CHECK-DTOR-LABEL: define {{.*}} @_Z22test_dtor_bce_static_3v(
+void test_dtor_bce_static_3() {
+  // CHECK: store i32 303
+  // CHECK-DTOR-NOT: @_ZN13DestructorBCEC1Ei({{.*}}
+  static constexpr DestructorBCE local(303);
+  // CHECK-DTOR-NOT: @_ZN13DestructorBCED
+  // CHECK-DTOR: }
+}

From 205ecd9b79c6915a85050246c961f167b494df43 Mon Sep 17 00:00:00 2001
From: Nemanja Ivanovic <nemanja.i.ibm@gmail.com>
Date: Tue, 9 Feb 2021 06:33:48 -0600
Subject: [PATCH 081/318] [DAGCombine] Do not remove masking argument to
 FP16_TO_FP for some targets

As of commit 284f2bffc9bc5, the DAG Combiner gets rid of the masking of the
input to this node if the mask only keeps the bottom 16 bits. This is because
the underlying library function does not use the high order bits. However, on
PowerPC's ELFv2 ABI, it is the caller that is responsible for clearing the bits
from the register. Therefore, the library implementation of __gnu_h2f_ieee will
return an incorrect result if the bits aren't cleared.

This combine is desired for ARM (and possibly other targets) so this patch adds
a query to Target Lowering to check if this zeroing needs to be kept.

Fixes: https://bugs.llvm.org/show_bug.cgi?id=49092

Differential revision: https://reviews.llvm.org/D96283

(cherry picked from commit a5222aa0858a42660629c410a5b669dee16a4359)
---
 llvm/include/llvm/CodeGen/TargetLowering.h    |  4 ++
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp |  2 +-
 llvm/lib/Target/PowerPC/PPCISelLowering.h     |  3 ++
 .../PowerPC/handle-f16-storage-type.ll        |  4 ++
 llvm/test/CodeGen/PowerPC/pr48519.ll          |  2 +
 llvm/test/CodeGen/PowerPC/pr49092.ll          | 39 +++++++++++++++++++
 6 files changed, 53 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/CodeGen/PowerPC/pr49092.ll

diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index c3221aac8eea..40115fbd2f15 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -2785,6 +2785,10 @@ class TargetLoweringBase {
     return false;
   }
 
+  /// Does this target require the clearing of high-order bits in a register
+  /// passed to the fp16 to fp conversion library function.
+  virtual bool shouldKeepZExtForFP16Conv() const { return false; }
+
   //===--------------------------------------------------------------------===//
   // Runtime Library hooks
   //
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 615bea2a4905..89670d708264 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -21174,7 +21174,7 @@ SDValue DAGCombiner::visitFP16_TO_FP(SDNode *N) {
   SDValue N0 = N->getOperand(0);
 
   // fold fp16_to_fp(op & 0xffff) -> fp16_to_fp(op)
-  if (N0->getOpcode() == ISD::AND) {
+  if (!TLI.shouldKeepZExtForFP16Conv() && N0->getOpcode() == ISD::AND) {
     ConstantSDNode *AndConst = getAsNonOpaqueConstant(N0.getOperand(1));
     if (AndConst && AndConst->getAPIntValue() == 0xffff) {
       return DAG.getNode(ISD::FP16_TO_FP, SDLoc(N), N->getValueType(0),
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h
index 477105bd03ac..0dda2c181572 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.h
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h
@@ -987,6 +987,9 @@ namespace llvm {
     shouldExpandBuildVectorWithShuffles(EVT VT,
                                         unsigned DefinedValues) const override;
 
+    // Keep the zero-extensions for arguments to libcalls.
+    bool shouldKeepZExtForFP16Conv() const override { return true; }
+
     /// createFastISel - This method returns a target-specific FastISel object,
     /// or null if the target does not support "fast" instruction selection.
     FastISel *createFastISel(FunctionLoweringInfo &FuncInfo,
diff --git a/llvm/test/CodeGen/PowerPC/handle-f16-storage-type.ll b/llvm/test/CodeGen/PowerPC/handle-f16-storage-type.ll
index 9977b6b33560..ab19afa2beb5 100644
--- a/llvm/test/CodeGen/PowerPC/handle-f16-storage-type.ll
+++ b/llvm/test/CodeGen/PowerPC/handle-f16-storage-type.ll
@@ -1156,6 +1156,7 @@ define float @test_sitofp_fadd_i32(i32 %a, half* %b) #0 {
 ; P8-NEXT:    xscvsxdsp f1, f0
 ; P8-NEXT:    bl __gnu_f2h_ieee
 ; P8-NEXT:    nop
+; P8-NEXT:    clrldi r3, r3, 48
 ; P8-NEXT:    bl __gnu_h2f_ieee
 ; P8-NEXT:    nop
 ; P8-NEXT:    xsaddsp f1, f31, f1
@@ -1175,6 +1176,7 @@ define float @test_sitofp_fadd_i32(i32 %a, half* %b) #0 {
 ; CHECK-NEXT:    xscvhpdp f0, f0
 ; CHECK-NEXT:    xscvdphp f1, f1
 ; CHECK-NEXT:    mffprwz r3, f1
+; CHECK-NEXT:    clrlwi r3, r3, 16
 ; CHECK-NEXT:    mtfprwz f1, r3
 ; CHECK-NEXT:    xscvhpdp f1, f1
 ; CHECK-NEXT:    xsaddsp f1, f0, f1
@@ -1225,6 +1227,7 @@ define half @PR40273(half) #0 {
 ; P8-NEXT:    stdu r1, -32(r1)
 ; P8-NEXT:    bl __gnu_f2h_ieee
 ; P8-NEXT:    nop
+; P8-NEXT:    clrldi r3, r3, 48
 ; P8-NEXT:    bl __gnu_h2f_ieee
 ; P8-NEXT:    nop
 ; P8-NEXT:    xxlxor f0, f0, f0
@@ -1245,6 +1248,7 @@ define half @PR40273(half) #0 {
 ; CHECK-NEXT:    xscvdphp f0, f1
 ; CHECK-NEXT:    xxlxor f1, f1, f1
 ; CHECK-NEXT:    mffprwz r3, f0
+; CHECK-NEXT:    clrlwi r3, r3, 16
 ; CHECK-NEXT:    mtfprwz f0, r3
 ; CHECK-NEXT:    xscvhpdp f0, f0
 ; CHECK-NEXT:    fcmpu cr0, f0, f1
diff --git a/llvm/test/CodeGen/PowerPC/pr48519.ll b/llvm/test/CodeGen/PowerPC/pr48519.ll
index 50970cb185d8..035cc49b93e6 100644
--- a/llvm/test/CodeGen/PowerPC/pr48519.ll
+++ b/llvm/test/CodeGen/PowerPC/pr48519.ll
@@ -22,6 +22,7 @@ define void @julia__typed_vcat_20() #0 {
 ; CHECK-NEXT:    xscvsxdsp f1, f0
 ; CHECK-NEXT:    bl __gnu_f2h_ieee
 ; CHECK-NEXT:    nop
+; CHECK-NEXT:    clrldi r3, r3, 48
 ; CHECK-NEXT:    bl __gnu_h2f_ieee
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    addi r30, r30, -1
@@ -46,6 +47,7 @@ define void @julia__typed_vcat_20() #0 {
 ; CHECK-P9-NEXT:    xscvsxdsp f0, f0
 ; CHECK-P9-NEXT:    xscvdphp f0, f0
 ; CHECK-P9-NEXT:    mffprwz r3, f0
+; CHECK-P9-NEXT:    clrlwi r3, r3, 16
 ; CHECK-P9-NEXT:    mtfprwz f0, r3
 ; CHECK-P9-NEXT:    li r3, 0
 ; CHECK-P9-NEXT:    xscvhpdp f0, f0
diff --git a/llvm/test/CodeGen/PowerPC/pr49092.ll b/llvm/test/CodeGen/PowerPC/pr49092.ll
new file mode 100644
index 000000000000..2fce58418515
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/pr49092.ll
@@ -0,0 +1,39 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mcpu=pwr8 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \
+; RUN:   -mtriple=powerpc64le-unknown-unknown < %s | FileCheck %s
+; RUN: llc -mcpu=pwr9 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \
+; RUN:   -mtriple=powerpc64le-unknown-unknown < %s | FileCheck %s \
+; RUN:   -check-prefix=CHECK-P9
+
+define dso_local half @test2(i64 %a, i64 %b) local_unnamed_addr #0 {
+; CHECK-LABEL: test2:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    mflr r0
+; CHECK-NEXT:    std r0, 16(r1)
+; CHECK-NEXT:    stdu r1, -32(r1)
+; CHECK-NEXT:    add r3, r4, r3
+; CHECK-NEXT:    addi r3, r3, 11
+; CHECK-NEXT:    clrlwi r3, r3, 16
+; CHECK-NEXT:    bl __gnu_h2f_ieee
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    addi r1, r1, 32
+; CHECK-NEXT:    ld r0, 16(r1)
+; CHECK-NEXT:    mtlr r0
+; CHECK-NEXT:    blr
+;
+; CHECK-P9-LABEL: test2:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    add r3, r4, r3
+; CHECK-P9-NEXT:    addi r3, r3, 11
+; CHECK-P9-NEXT:    clrlwi r3, r3, 16
+; CHECK-P9-NEXT:    mtfprwz f0, r3
+; CHECK-P9-NEXT:    xscvhpdp f1, f0
+; CHECK-P9-NEXT:    blr
+entry:
+  %add = add i64 %b, %a
+  %0 = trunc i64 %add to i16
+  %conv = add i16 %0, 11
+  %call = bitcast i16 %conv to half
+  ret half %call
+}
+attributes #0 = { nounwind }

From 34cda01e235c549b56ffe30a7b09df0414d56ea0 Mon Sep 17 00:00:00 2001
From: Fraser Cormack <fraser@codeplay.com>
Date: Tue, 2 Feb 2021 14:40:52 +0000
Subject: [PATCH 082/318] [RISCV] Fix incorrect RVV sdiv/udiv lowering

Due to a clerical error, the sdiv operation was mapping to vdivu and
udiv to vdiv, when the opposite mapping is the correct one.

Reviewed By: craig.topper

Differential Revision: https://reviews.llvm.org/D95869

(cherry picked from commit b4106f9c7b8c498d109301ced7bf9aca32027168)
---
 .../Target/RISCV/RISCVInstrInfoVSDPatterns.td |  4 +-
 .../CodeGen/RISCV/rvv/vdiv-sdnode-rv32.ll     | 88 +++++++++----------
 .../CodeGen/RISCV/rvv/vdiv-sdnode-rv64.ll     | 88 +++++++++----------
 .../CodeGen/RISCV/rvv/vdivu-sdnode-rv32.ll    | 88 +++++++++----------
 .../CodeGen/RISCV/rvv/vdivu-sdnode-rv64.ll    | 88 +++++++++----------
 5 files changed, 178 insertions(+), 178 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td
index 79a1e6ddc8a2..dee67708bed1 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td
@@ -384,8 +384,8 @@ defm "" : VPatBinarySDNode_VV_VX<mulhs, "PseudoVMULH">;
 defm "" : VPatBinarySDNode_VV_VX<mulhu, "PseudoVMULHU">;
 
 // 12.11. Vector Integer Divide Instructions
-defm "" : VPatBinarySDNode_VV_VX<sdiv, "PseudoVDIVU">;
-defm "" : VPatBinarySDNode_VV_VX<udiv, "PseudoVDIV">;
+defm "" : VPatBinarySDNode_VV_VX<udiv, "PseudoVDIVU">;
+defm "" : VPatBinarySDNode_VV_VX<sdiv, "PseudoVDIV">;
 defm "" : VPatBinarySDNode_VV_VX<urem, "PseudoVREMU">;
 defm "" : VPatBinarySDNode_VV_VX<srem, "PseudoVREM">;
 
diff --git a/llvm/test/CodeGen/RISCV/rvv/vdiv-sdnode-rv32.ll b/llvm/test/CodeGen/RISCV/rvv/vdiv-sdnode-rv32.ll
index 239151274c4e..bbfc09d1c276 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vdiv-sdnode-rv32.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vdiv-sdnode-rv32.ll
@@ -5,7 +5,7 @@ define <vscale x 1 x i8> @vdiv_vv_nxv1i8(<vscale x 1 x i8> %va, <vscale x 1 x i8
 ; CHECK-LABEL: vdiv_vv_nxv1i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e8,mf8,ta,mu
-; CHECK-NEXT:    vdivu.vv v8, v8, v9
+; CHECK-NEXT:    vdiv.vv v8, v8, v9
 ; CHECK-NEXT:    ret
   %vc = sdiv <vscale x 1 x i8> %va, %vb
   ret <vscale x 1 x i8> %vc
@@ -15,7 +15,7 @@ define <vscale x 1 x i8> @vdiv_vx_nxv1i8(<vscale x 1 x i8> %va, i8 signext %b) {
 ; CHECK-LABEL: vdiv_vx_nxv1i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e8,mf8,ta,mu
-; CHECK-NEXT:    vdivu.vx v8, v8, a0
+; CHECK-NEXT:    vdiv.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 1 x i8> undef, i8 %b, i32 0
   %splat = shufflevector <vscale x 1 x i8> %head, <vscale x 1 x i8> undef, <vscale x 1 x i32> zeroinitializer
@@ -70,7 +70,7 @@ define <vscale x 2 x i8> @vdiv_vv_nxv2i8(<vscale x 2 x i8> %va, <vscale x 2 x i8
 ; CHECK-LABEL: vdiv_vv_nxv2i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e8,mf4,ta,mu
-; CHECK-NEXT:    vdivu.vv v8, v8, v9
+; CHECK-NEXT:    vdiv.vv v8, v8, v9
 ; CHECK-NEXT:    ret
   %vc = sdiv <vscale x 2 x i8> %va, %vb
   ret <vscale x 2 x i8> %vc
@@ -80,7 +80,7 @@ define <vscale x 2 x i8> @vdiv_vx_nxv2i8(<vscale x 2 x i8> %va, i8 signext %b) {
 ; CHECK-LABEL: vdiv_vx_nxv2i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e8,mf4,ta,mu
-; CHECK-NEXT:    vdivu.vx v8, v8, a0
+; CHECK-NEXT:    vdiv.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 2 x i8> undef, i8 %b, i32 0
   %splat = shufflevector <vscale x 2 x i8> %head, <vscale x 2 x i8> undef, <vscale x 2 x i32> zeroinitializer
@@ -111,7 +111,7 @@ define <vscale x 4 x i8> @vdiv_vv_nxv4i8(<vscale x 4 x i8> %va, <vscale x 4 x i8
 ; CHECK-LABEL: vdiv_vv_nxv4i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e8,mf2,ta,mu
-; CHECK-NEXT:    vdivu.vv v8, v8, v9
+; CHECK-NEXT:    vdiv.vv v8, v8, v9
 ; CHECK-NEXT:    ret
   %vc = sdiv <vscale x 4 x i8> %va, %vb
   ret <vscale x 4 x i8> %vc
@@ -121,7 +121,7 @@ define <vscale x 4 x i8> @vdiv_vx_nxv4i8(<vscale x 4 x i8> %va, i8 signext %b) {
 ; CHECK-LABEL: vdiv_vx_nxv4i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e8,mf2,ta,mu
-; CHECK-NEXT:    vdivu.vx v8, v8, a0
+; CHECK-NEXT:    vdiv.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 4 x i8> undef, i8 %b, i32 0
   %splat = shufflevector <vscale x 4 x i8> %head, <vscale x 4 x i8> undef, <vscale x 4 x i32> zeroinitializer
@@ -152,7 +152,7 @@ define <vscale x 8 x i8> @vdiv_vv_nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i8
 ; CHECK-LABEL: vdiv_vv_nxv8i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e8,m1,ta,mu
-; CHECK-NEXT:    vdivu.vv v8, v8, v9
+; CHECK-NEXT:    vdiv.vv v8, v8, v9
 ; CHECK-NEXT:    ret
   %vc = sdiv <vscale x 8 x i8> %va, %vb
   ret <vscale x 8 x i8> %vc
@@ -162,7 +162,7 @@ define <vscale x 8 x i8> @vdiv_vx_nxv8i8(<vscale x 8 x i8> %va, i8 signext %b) {
 ; CHECK-LABEL: vdiv_vx_nxv8i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e8,m1,ta,mu
-; CHECK-NEXT:    vdivu.vx v8, v8, a0
+; CHECK-NEXT:    vdiv.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 8 x i8> undef, i8 %b, i32 0
   %splat = shufflevector <vscale x 8 x i8> %head, <vscale x 8 x i8> undef, <vscale x 8 x i32> zeroinitializer
@@ -193,7 +193,7 @@ define <vscale x 16 x i8> @vdiv_vv_nxv16i8(<vscale x 16 x i8> %va, <vscale x 16
 ; CHECK-LABEL: vdiv_vv_nxv16i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e8,m2,ta,mu
-; CHECK-NEXT:    vdivu.vv v8, v8, v10
+; CHECK-NEXT:    vdiv.vv v8, v8, v10
 ; CHECK-NEXT:    ret
   %vc = sdiv <vscale x 16 x i8> %va, %vb
   ret <vscale x 16 x i8> %vc
@@ -203,7 +203,7 @@ define <vscale x 16 x i8> @vdiv_vx_nxv16i8(<vscale x 16 x i8> %va, i8 signext %b
 ; CHECK-LABEL: vdiv_vx_nxv16i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e8,m2,ta,mu
-; CHECK-NEXT:    vdivu.vx v8, v8, a0
+; CHECK-NEXT:    vdiv.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 16 x i8> undef, i8 %b, i32 0
   %splat = shufflevector <vscale x 16 x i8> %head, <vscale x 16 x i8> undef, <vscale x 16 x i32> zeroinitializer
@@ -234,7 +234,7 @@ define <vscale x 32 x i8> @vdiv_vv_nxv32i8(<vscale x 32 x i8> %va, <vscale x 32
 ; CHECK-LABEL: vdiv_vv_nxv32i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e8,m4,ta,mu
-; CHECK-NEXT:    vdivu.vv v8, v8, v12
+; CHECK-NEXT:    vdiv.vv v8, v8, v12
 ; CHECK-NEXT:    ret
   %vc = sdiv <vscale x 32 x i8> %va, %vb
   ret <vscale x 32 x i8> %vc
@@ -244,7 +244,7 @@ define <vscale x 32 x i8> @vdiv_vx_nxv32i8(<vscale x 32 x i8> %va, i8 signext %b
 ; CHECK-LABEL: vdiv_vx_nxv32i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e8,m4,ta,mu
-; CHECK-NEXT:    vdivu.vx v8, v8, a0
+; CHECK-NEXT:    vdiv.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 32 x i8> undef, i8 %b, i32 0
   %splat = shufflevector <vscale x 32 x i8> %head, <vscale x 32 x i8> undef, <vscale x 32 x i32> zeroinitializer
@@ -275,7 +275,7 @@ define <vscale x 64 x i8> @vdiv_vv_nxv64i8(<vscale x 64 x i8> %va, <vscale x 64
 ; CHECK-LABEL: vdiv_vv_nxv64i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e8,m8,ta,mu
-; CHECK-NEXT:    vdivu.vv v8, v8, v16
+; CHECK-NEXT:    vdiv.vv v8, v8, v16
 ; CHECK-NEXT:    ret
   %vc = sdiv <vscale x 64 x i8> %va, %vb
   ret <vscale x 64 x i8> %vc
@@ -285,7 +285,7 @@ define <vscale x 64 x i8> @vdiv_vx_nxv64i8(<vscale x 64 x i8> %va, i8 signext %b
 ; CHECK-LABEL: vdiv_vx_nxv64i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e8,m8,ta,mu
-; CHECK-NEXT:    vdivu.vx v8, v8, a0
+; CHECK-NEXT:    vdiv.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 64 x i8> undef, i8 %b, i32 0
   %splat = shufflevector <vscale x 64 x i8> %head, <vscale x 64 x i8> undef, <vscale x 64 x i32> zeroinitializer
@@ -316,7 +316,7 @@ define <vscale x 1 x i16> @vdiv_vv_nxv1i16(<vscale x 1 x i16> %va, <vscale x 1 x
 ; CHECK-LABEL: vdiv_vv_nxv1i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16,mf4,ta,mu
-; CHECK-NEXT:    vdivu.vv v8, v8, v9
+; CHECK-NEXT:    vdiv.vv v8, v8, v9
 ; CHECK-NEXT:    ret
   %vc = sdiv <vscale x 1 x i16> %va, %vb
   ret <vscale x 1 x i16> %vc
@@ -326,7 +326,7 @@ define <vscale x 1 x i16> @vdiv_vx_nxv1i16(<vscale x 1 x i16> %va, i16 signext %
 ; CHECK-LABEL: vdiv_vx_nxv1i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e16,mf4,ta,mu
-; CHECK-NEXT:    vdivu.vx v8, v8, a0
+; CHECK-NEXT:    vdiv.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 1 x i16> undef, i16 %b, i32 0
   %splat = shufflevector <vscale x 1 x i16> %head, <vscale x 1 x i16> undef, <vscale x 1 x i32> zeroinitializer
@@ -357,7 +357,7 @@ define <vscale x 2 x i16> @vdiv_vv_nxv2i16(<vscale x 2 x i16> %va, <vscale x 2 x
 ; CHECK-LABEL: vdiv_vv_nxv2i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16,mf2,ta,mu
-; CHECK-NEXT:    vdivu.vv v8, v8, v9
+; CHECK-NEXT:    vdiv.vv v8, v8, v9
 ; CHECK-NEXT:    ret
   %vc = sdiv <vscale x 2 x i16> %va, %vb
   ret <vscale x 2 x i16> %vc
@@ -367,7 +367,7 @@ define <vscale x 2 x i16> @vdiv_vx_nxv2i16(<vscale x 2 x i16> %va, i16 signext %
 ; CHECK-LABEL: vdiv_vx_nxv2i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e16,mf2,ta,mu
-; CHECK-NEXT:    vdivu.vx v8, v8, a0
+; CHECK-NEXT:    vdiv.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 2 x i16> undef, i16 %b, i32 0
   %splat = shufflevector <vscale x 2 x i16> %head, <vscale x 2 x i16> undef, <vscale x 2 x i32> zeroinitializer
@@ -398,7 +398,7 @@ define <vscale x 4 x i16> @vdiv_vv_nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x
 ; CHECK-LABEL: vdiv_vv_nxv4i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16,m1,ta,mu
-; CHECK-NEXT:    vdivu.vv v8, v8, v9
+; CHECK-NEXT:    vdiv.vv v8, v8, v9
 ; CHECK-NEXT:    ret
   %vc = sdiv <vscale x 4 x i16> %va, %vb
   ret <vscale x 4 x i16> %vc
@@ -408,7 +408,7 @@ define <vscale x 4 x i16> @vdiv_vx_nxv4i16(<vscale x 4 x i16> %va, i16 signext %
 ; CHECK-LABEL: vdiv_vx_nxv4i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e16,m1,ta,mu
-; CHECK-NEXT:    vdivu.vx v8, v8, a0
+; CHECK-NEXT:    vdiv.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 4 x i16> undef, i16 %b, i32 0
   %splat = shufflevector <vscale x 4 x i16> %head, <vscale x 4 x i16> undef, <vscale x 4 x i32> zeroinitializer
@@ -439,7 +439,7 @@ define <vscale x 8 x i16> @vdiv_vv_nxv8i16(<vscale x 8 x i16> %va, <vscale x 8 x
 ; CHECK-LABEL: vdiv_vv_nxv8i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16,m2,ta,mu
-; CHECK-NEXT:    vdivu.vv v8, v8, v10
+; CHECK-NEXT:    vdiv.vv v8, v8, v10
 ; CHECK-NEXT:    ret
   %vc = sdiv <vscale x 8 x i16> %va, %vb
   ret <vscale x 8 x i16> %vc
@@ -449,7 +449,7 @@ define <vscale x 8 x i16> @vdiv_vx_nxv8i16(<vscale x 8 x i16> %va, i16 signext %
 ; CHECK-LABEL: vdiv_vx_nxv8i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e16,m2,ta,mu
-; CHECK-NEXT:    vdivu.vx v8, v8, a0
+; CHECK-NEXT:    vdiv.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 8 x i16> undef, i16 %b, i32 0
   %splat = shufflevector <vscale x 8 x i16> %head, <vscale x 8 x i16> undef, <vscale x 8 x i32> zeroinitializer
@@ -480,7 +480,7 @@ define <vscale x 16 x i16> @vdiv_vv_nxv16i16(<vscale x 16 x i16> %va, <vscale x
 ; CHECK-LABEL: vdiv_vv_nxv16i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16,m4,ta,mu
-; CHECK-NEXT:    vdivu.vv v8, v8, v12
+; CHECK-NEXT:    vdiv.vv v8, v8, v12
 ; CHECK-NEXT:    ret
   %vc = sdiv <vscale x 16 x i16> %va, %vb
   ret <vscale x 16 x i16> %vc
@@ -490,7 +490,7 @@ define <vscale x 16 x i16> @vdiv_vx_nxv16i16(<vscale x 16 x i16> %va, i16 signex
 ; CHECK-LABEL: vdiv_vx_nxv16i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e16,m4,ta,mu
-; CHECK-NEXT:    vdivu.vx v8, v8, a0
+; CHECK-NEXT:    vdiv.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 16 x i16> undef, i16 %b, i32 0
   %splat = shufflevector <vscale x 16 x i16> %head, <vscale x 16 x i16> undef, <vscale x 16 x i32> zeroinitializer
@@ -521,7 +521,7 @@ define <vscale x 32 x i16> @vdiv_vv_nxv32i16(<vscale x 32 x i16> %va, <vscale x
 ; CHECK-LABEL: vdiv_vv_nxv32i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16,m8,ta,mu
-; CHECK-NEXT:    vdivu.vv v8, v8, v16
+; CHECK-NEXT:    vdiv.vv v8, v8, v16
 ; CHECK-NEXT:    ret
   %vc = sdiv <vscale x 32 x i16> %va, %vb
   ret <vscale x 32 x i16> %vc
@@ -531,7 +531,7 @@ define <vscale x 32 x i16> @vdiv_vx_nxv32i16(<vscale x 32 x i16> %va, i16 signex
 ; CHECK-LABEL: vdiv_vx_nxv32i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e16,m8,ta,mu
-; CHECK-NEXT:    vdivu.vx v8, v8, a0
+; CHECK-NEXT:    vdiv.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 32 x i16> undef, i16 %b, i32 0
   %splat = shufflevector <vscale x 32 x i16> %head, <vscale x 32 x i16> undef, <vscale x 32 x i32> zeroinitializer
@@ -562,7 +562,7 @@ define <vscale x 1 x i32> @vdiv_vv_nxv1i32(<vscale x 1 x i32> %va, <vscale x 1 x
 ; CHECK-LABEL: vdiv_vv_nxv1i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32,mf2,ta,mu
-; CHECK-NEXT:    vdivu.vv v8, v8, v9
+; CHECK-NEXT:    vdiv.vv v8, v8, v9
 ; CHECK-NEXT:    ret
   %vc = sdiv <vscale x 1 x i32> %va, %vb
   ret <vscale x 1 x i32> %vc
@@ -572,7 +572,7 @@ define <vscale x 1 x i32> @vdiv_vx_nxv1i32(<vscale x 1 x i32> %va, i32 %b) {
 ; CHECK-LABEL: vdiv_vx_nxv1i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e32,mf2,ta,mu
-; CHECK-NEXT:    vdivu.vx v8, v8, a0
+; CHECK-NEXT:    vdiv.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 1 x i32> undef, i32 %b, i32 0
   %splat = shufflevector <vscale x 1 x i32> %head, <vscale x 1 x i32> undef, <vscale x 1 x i32> zeroinitializer
@@ -603,7 +603,7 @@ define <vscale x 2 x i32> @vdiv_vv_nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x
 ; CHECK-LABEL: vdiv_vv_nxv2i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32,m1,ta,mu
-; CHECK-NEXT:    vdivu.vv v8, v8, v9
+; CHECK-NEXT:    vdiv.vv v8, v8, v9
 ; CHECK-NEXT:    ret
   %vc = sdiv <vscale x 2 x i32> %va, %vb
   ret <vscale x 2 x i32> %vc
@@ -613,7 +613,7 @@ define <vscale x 2 x i32> @vdiv_vx_nxv2i32(<vscale x 2 x i32> %va, i32 %b) {
 ; CHECK-LABEL: vdiv_vx_nxv2i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e32,m1,ta,mu
-; CHECK-NEXT:    vdivu.vx v8, v8, a0
+; CHECK-NEXT:    vdiv.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 2 x i32> undef, i32 %b, i32 0
   %splat = shufflevector <vscale x 2 x i32> %head, <vscale x 2 x i32> undef, <vscale x 2 x i32> zeroinitializer
@@ -644,7 +644,7 @@ define <vscale x 4 x i32> @vdiv_vv_nxv4i32(<vscale x 4 x i32> %va, <vscale x 4 x
 ; CHECK-LABEL: vdiv_vv_nxv4i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32,m2,ta,mu
-; CHECK-NEXT:    vdivu.vv v8, v8, v10
+; CHECK-NEXT:    vdiv.vv v8, v8, v10
 ; CHECK-NEXT:    ret
   %vc = sdiv <vscale x 4 x i32> %va, %vb
   ret <vscale x 4 x i32> %vc
@@ -654,7 +654,7 @@ define <vscale x 4 x i32> @vdiv_vx_nxv4i32(<vscale x 4 x i32> %va, i32 %b) {
 ; CHECK-LABEL: vdiv_vx_nxv4i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e32,m2,ta,mu
-; CHECK-NEXT:    vdivu.vx v8, v8, a0
+; CHECK-NEXT:    vdiv.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 4 x i32> undef, i32 %b, i32 0
   %splat = shufflevector <vscale x 4 x i32> %head, <vscale x 4 x i32> undef, <vscale x 4 x i32> zeroinitializer
@@ -685,7 +685,7 @@ define <vscale x 8 x i32> @vdiv_vv_nxv8i32(<vscale x 8 x i32> %va, <vscale x 8 x
 ; CHECK-LABEL: vdiv_vv_nxv8i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32,m4,ta,mu
-; CHECK-NEXT:    vdivu.vv v8, v8, v12
+; CHECK-NEXT:    vdiv.vv v8, v8, v12
 ; CHECK-NEXT:    ret
   %vc = sdiv <vscale x 8 x i32> %va, %vb
   ret <vscale x 8 x i32> %vc
@@ -695,7 +695,7 @@ define <vscale x 8 x i32> @vdiv_vx_nxv8i32(<vscale x 8 x i32> %va, i32 %b) {
 ; CHECK-LABEL: vdiv_vx_nxv8i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e32,m4,ta,mu
-; CHECK-NEXT:    vdivu.vx v8, v8, a0
+; CHECK-NEXT:    vdiv.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 8 x i32> undef, i32 %b, i32 0
   %splat = shufflevector <vscale x 8 x i32> %head, <vscale x 8 x i32> undef, <vscale x 8 x i32> zeroinitializer
@@ -726,7 +726,7 @@ define <vscale x 16 x i32> @vdiv_vv_nxv16i32(<vscale x 16 x i32> %va, <vscale x
 ; CHECK-LABEL: vdiv_vv_nxv16i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32,m8,ta,mu
-; CHECK-NEXT:    vdivu.vv v8, v8, v16
+; CHECK-NEXT:    vdiv.vv v8, v8, v16
 ; CHECK-NEXT:    ret
   %vc = sdiv <vscale x 16 x i32> %va, %vb
   ret <vscale x 16 x i32> %vc
@@ -736,7 +736,7 @@ define <vscale x 16 x i32> @vdiv_vx_nxv16i32(<vscale x 16 x i32> %va, i32 %b) {
 ; CHECK-LABEL: vdiv_vx_nxv16i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e32,m8,ta,mu
-; CHECK-NEXT:    vdivu.vx v8, v8, a0
+; CHECK-NEXT:    vdiv.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 16 x i32> undef, i32 %b, i32 0
   %splat = shufflevector <vscale x 16 x i32> %head, <vscale x 16 x i32> undef, <vscale x 16 x i32> zeroinitializer
@@ -767,7 +767,7 @@ define <vscale x 1 x i64> @vdiv_vv_nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x
 ; CHECK-LABEL: vdiv_vv_nxv1i64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e64,m1,ta,mu
-; CHECK-NEXT:    vdivu.vv v8, v8, v9
+; CHECK-NEXT:    vdiv.vv v8, v8, v9
 ; CHECK-NEXT:    ret
   %vc = sdiv <vscale x 1 x i64> %va, %vb
   ret <vscale x 1 x i64> %vc
@@ -784,7 +784,7 @@ define <vscale x 1 x i64> @vdiv_vx_nxv1i64(<vscale x 1 x i64> %va, i64 %b) {
 ; CHECK-NEXT:    vsll.vx v26, v26, a1
 ; CHECK-NEXT:    vsrl.vx v26, v26, a1
 ; CHECK-NEXT:    vor.vv v25, v26, v25
-; CHECK-NEXT:    vdivu.vv v8, v8, v25
+; CHECK-NEXT:    vdiv.vv v8, v8, v25
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 1 x i64> undef, i64 %b, i32 0
   %splat = shufflevector <vscale x 1 x i64> %head, <vscale x 1 x i64> undef, <vscale x 1 x i32> zeroinitializer
@@ -825,7 +825,7 @@ define <vscale x 2 x i64> @vdiv_vv_nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x
 ; CHECK-LABEL: vdiv_vv_nxv2i64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e64,m2,ta,mu
-; CHECK-NEXT:    vdivu.vv v8, v8, v10
+; CHECK-NEXT:    vdiv.vv v8, v8, v10
 ; CHECK-NEXT:    ret
   %vc = sdiv <vscale x 2 x i64> %va, %vb
   ret <vscale x 2 x i64> %vc
@@ -842,7 +842,7 @@ define <vscale x 2 x i64> @vdiv_vx_nxv2i64(<vscale x 2 x i64> %va, i64 %b) {
 ; CHECK-NEXT:    vsll.vx v28, v28, a1
 ; CHECK-NEXT:    vsrl.vx v28, v28, a1
 ; CHECK-NEXT:    vor.vv v26, v28, v26
-; CHECK-NEXT:    vdivu.vv v8, v8, v26
+; CHECK-NEXT:    vdiv.vv v8, v8, v26
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 2 x i64> undef, i64 %b, i32 0
   %splat = shufflevector <vscale x 2 x i64> %head, <vscale x 2 x i64> undef, <vscale x 2 x i32> zeroinitializer
@@ -883,7 +883,7 @@ define <vscale x 4 x i64> @vdiv_vv_nxv4i64(<vscale x 4 x i64> %va, <vscale x 4 x
 ; CHECK-LABEL: vdiv_vv_nxv4i64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e64,m4,ta,mu
-; CHECK-NEXT:    vdivu.vv v8, v8, v12
+; CHECK-NEXT:    vdiv.vv v8, v8, v12
 ; CHECK-NEXT:    ret
   %vc = sdiv <vscale x 4 x i64> %va, %vb
   ret <vscale x 4 x i64> %vc
@@ -900,7 +900,7 @@ define <vscale x 4 x i64> @vdiv_vx_nxv4i64(<vscale x 4 x i64> %va, i64 %b) {
 ; CHECK-NEXT:    vsll.vx v12, v12, a1
 ; CHECK-NEXT:    vsrl.vx v12, v12, a1
 ; CHECK-NEXT:    vor.vv v28, v12, v28
-; CHECK-NEXT:    vdivu.vv v8, v8, v28
+; CHECK-NEXT:    vdiv.vv v8, v8, v28
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 4 x i64> undef, i64 %b, i32 0
   %splat = shufflevector <vscale x 4 x i64> %head, <vscale x 4 x i64> undef, <vscale x 4 x i32> zeroinitializer
@@ -941,7 +941,7 @@ define <vscale x 8 x i64> @vdiv_vv_nxv8i64(<vscale x 8 x i64> %va, <vscale x 8 x
 ; CHECK-LABEL: vdiv_vv_nxv8i64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e64,m8,ta,mu
-; CHECK-NEXT:    vdivu.vv v8, v8, v16
+; CHECK-NEXT:    vdiv.vv v8, v8, v16
 ; CHECK-NEXT:    ret
   %vc = sdiv <vscale x 8 x i64> %va, %vb
   ret <vscale x 8 x i64> %vc
@@ -958,7 +958,7 @@ define <vscale x 8 x i64> @vdiv_vx_nxv8i64(<vscale x 8 x i64> %va, i64 %b) {
 ; CHECK-NEXT:    vsll.vx v24, v24, a1
 ; CHECK-NEXT:    vsrl.vx v24, v24, a1
 ; CHECK-NEXT:    vor.vv v16, v24, v16
-; CHECK-NEXT:    vdivu.vv v8, v8, v16
+; CHECK-NEXT:    vdiv.vv v8, v8, v16
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 8 x i64> undef, i64 %b, i32 0
   %splat = shufflevector <vscale x 8 x i64> %head, <vscale x 8 x i64> undef, <vscale x 8 x i32> zeroinitializer
diff --git a/llvm/test/CodeGen/RISCV/rvv/vdiv-sdnode-rv64.ll b/llvm/test/CodeGen/RISCV/rvv/vdiv-sdnode-rv64.ll
index 991cccf72cdd..b8f331e78b5b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vdiv-sdnode-rv64.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vdiv-sdnode-rv64.ll
@@ -5,7 +5,7 @@ define <vscale x 1 x i8> @vdiv_vv_nxv1i8(<vscale x 1 x i8> %va, <vscale x 1 x i8
 ; CHECK-LABEL: vdiv_vv_nxv1i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e8,mf8,ta,mu
-; CHECK-NEXT:    vdivu.vv v8, v8, v9
+; CHECK-NEXT:    vdiv.vv v8, v8, v9
 ; CHECK-NEXT:    ret
   %vc = sdiv <vscale x 1 x i8> %va, %vb
   ret <vscale x 1 x i8> %vc
@@ -15,7 +15,7 @@ define <vscale x 1 x i8> @vdiv_vx_nxv1i8(<vscale x 1 x i8> %va, i8 signext %b) {
 ; CHECK-LABEL: vdiv_vx_nxv1i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e8,mf8,ta,mu
-; CHECK-NEXT:    vdivu.vx v8, v8, a0
+; CHECK-NEXT:    vdiv.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 1 x i8> undef, i8 %b, i32 0
   %splat = shufflevector <vscale x 1 x i8> %head, <vscale x 1 x i8> undef, <vscale x 1 x i32> zeroinitializer
@@ -46,7 +46,7 @@ define <vscale x 2 x i8> @vdiv_vv_nxv2i8(<vscale x 2 x i8> %va, <vscale x 2 x i8
 ; CHECK-LABEL: vdiv_vv_nxv2i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e8,mf4,ta,mu
-; CHECK-NEXT:    vdivu.vv v8, v8, v9
+; CHECK-NEXT:    vdiv.vv v8, v8, v9
 ; CHECK-NEXT:    ret
   %vc = sdiv <vscale x 2 x i8> %va, %vb
   ret <vscale x 2 x i8> %vc
@@ -56,7 +56,7 @@ define <vscale x 2 x i8> @vdiv_vx_nxv2i8(<vscale x 2 x i8> %va, i8 signext %b) {
 ; CHECK-LABEL: vdiv_vx_nxv2i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e8,mf4,ta,mu
-; CHECK-NEXT:    vdivu.vx v8, v8, a0
+; CHECK-NEXT:    vdiv.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 2 x i8> undef, i8 %b, i32 0
   %splat = shufflevector <vscale x 2 x i8> %head, <vscale x 2 x i8> undef, <vscale x 2 x i32> zeroinitializer
@@ -87,7 +87,7 @@ define <vscale x 4 x i8> @vdiv_vv_nxv4i8(<vscale x 4 x i8> %va, <vscale x 4 x i8
 ; CHECK-LABEL: vdiv_vv_nxv4i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e8,mf2,ta,mu
-; CHECK-NEXT:    vdivu.vv v8, v8, v9
+; CHECK-NEXT:    vdiv.vv v8, v8, v9
 ; CHECK-NEXT:    ret
   %vc = sdiv <vscale x 4 x i8> %va, %vb
   ret <vscale x 4 x i8> %vc
@@ -97,7 +97,7 @@ define <vscale x 4 x i8> @vdiv_vx_nxv4i8(<vscale x 4 x i8> %va, i8 signext %b) {
 ; CHECK-LABEL: vdiv_vx_nxv4i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e8,mf2,ta,mu
-; CHECK-NEXT:    vdivu.vx v8, v8, a0
+; CHECK-NEXT:    vdiv.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 4 x i8> undef, i8 %b, i32 0
   %splat = shufflevector <vscale x 4 x i8> %head, <vscale x 4 x i8> undef, <vscale x 4 x i32> zeroinitializer
@@ -128,7 +128,7 @@ define <vscale x 8 x i8> @vdiv_vv_nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i8
 ; CHECK-LABEL: vdiv_vv_nxv8i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e8,m1,ta,mu
-; CHECK-NEXT:    vdivu.vv v8, v8, v9
+; CHECK-NEXT:    vdiv.vv v8, v8, v9
 ; CHECK-NEXT:    ret
   %vc = sdiv <vscale x 8 x i8> %va, %vb
   ret <vscale x 8 x i8> %vc
@@ -138,7 +138,7 @@ define <vscale x 8 x i8> @vdiv_vx_nxv8i8(<vscale x 8 x i8> %va, i8 signext %b) {
 ; CHECK-LABEL: vdiv_vx_nxv8i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e8,m1,ta,mu
-; CHECK-NEXT:    vdivu.vx v8, v8, a0
+; CHECK-NEXT:    vdiv.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 8 x i8> undef, i8 %b, i32 0
   %splat = shufflevector <vscale x 8 x i8> %head, <vscale x 8 x i8> undef, <vscale x 8 x i32> zeroinitializer
@@ -169,7 +169,7 @@ define <vscale x 16 x i8> @vdiv_vv_nxv16i8(<vscale x 16 x i8> %va, <vscale x 16
 ; CHECK-LABEL: vdiv_vv_nxv16i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e8,m2,ta,mu
-; CHECK-NEXT:    vdivu.vv v8, v8, v10
+; CHECK-NEXT:    vdiv.vv v8, v8, v10
 ; CHECK-NEXT:    ret
   %vc = sdiv <vscale x 16 x i8> %va, %vb
   ret <vscale x 16 x i8> %vc
@@ -179,7 +179,7 @@ define <vscale x 16 x i8> @vdiv_vx_nxv16i8(<vscale x 16 x i8> %va, i8 signext %b
 ; CHECK-LABEL: vdiv_vx_nxv16i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e8,m2,ta,mu
-; CHECK-NEXT:    vdivu.vx v8, v8, a0
+; CHECK-NEXT:    vdiv.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 16 x i8> undef, i8 %b, i32 0
   %splat = shufflevector <vscale x 16 x i8> %head, <vscale x 16 x i8> undef, <vscale x 16 x i32> zeroinitializer
@@ -210,7 +210,7 @@ define <vscale x 32 x i8> @vdiv_vv_nxv32i8(<vscale x 32 x i8> %va, <vscale x 32
 ; CHECK-LABEL: vdiv_vv_nxv32i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e8,m4,ta,mu
-; CHECK-NEXT:    vdivu.vv v8, v8, v12
+; CHECK-NEXT:    vdiv.vv v8, v8, v12
 ; CHECK-NEXT:    ret
   %vc = sdiv <vscale x 32 x i8> %va, %vb
   ret <vscale x 32 x i8> %vc
@@ -220,7 +220,7 @@ define <vscale x 32 x i8> @vdiv_vx_nxv32i8(<vscale x 32 x i8> %va, i8 signext %b
 ; CHECK-LABEL: vdiv_vx_nxv32i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e8,m4,ta,mu
-; CHECK-NEXT:    vdivu.vx v8, v8, a0
+; CHECK-NEXT:    vdiv.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 32 x i8> undef, i8 %b, i32 0
   %splat = shufflevector <vscale x 32 x i8> %head, <vscale x 32 x i8> undef, <vscale x 32 x i32> zeroinitializer
@@ -251,7 +251,7 @@ define <vscale x 64 x i8> @vdiv_vv_nxv64i8(<vscale x 64 x i8> %va, <vscale x 64
 ; CHECK-LABEL: vdiv_vv_nxv64i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e8,m8,ta,mu
-; CHECK-NEXT:    vdivu.vv v8, v8, v16
+; CHECK-NEXT:    vdiv.vv v8, v8, v16
 ; CHECK-NEXT:    ret
   %vc = sdiv <vscale x 64 x i8> %va, %vb
   ret <vscale x 64 x i8> %vc
@@ -261,7 +261,7 @@ define <vscale x 64 x i8> @vdiv_vx_nxv64i8(<vscale x 64 x i8> %va, i8 signext %b
 ; CHECK-LABEL: vdiv_vx_nxv64i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e8,m8,ta,mu
-; CHECK-NEXT:    vdivu.vx v8, v8, a0
+; CHECK-NEXT:    vdiv.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 64 x i8> undef, i8 %b, i32 0
   %splat = shufflevector <vscale x 64 x i8> %head, <vscale x 64 x i8> undef, <vscale x 64 x i32> zeroinitializer
@@ -292,7 +292,7 @@ define <vscale x 1 x i16> @vdiv_vv_nxv1i16(<vscale x 1 x i16> %va, <vscale x 1 x
 ; CHECK-LABEL: vdiv_vv_nxv1i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16,mf4,ta,mu
-; CHECK-NEXT:    vdivu.vv v8, v8, v9
+; CHECK-NEXT:    vdiv.vv v8, v8, v9
 ; CHECK-NEXT:    ret
   %vc = sdiv <vscale x 1 x i16> %va, %vb
   ret <vscale x 1 x i16> %vc
@@ -302,7 +302,7 @@ define <vscale x 1 x i16> @vdiv_vx_nxv1i16(<vscale x 1 x i16> %va, i16 signext %
 ; CHECK-LABEL: vdiv_vx_nxv1i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e16,mf4,ta,mu
-; CHECK-NEXT:    vdivu.vx v8, v8, a0
+; CHECK-NEXT:    vdiv.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 1 x i16> undef, i16 %b, i32 0
   %splat = shufflevector <vscale x 1 x i16> %head, <vscale x 1 x i16> undef, <vscale x 1 x i32> zeroinitializer
@@ -333,7 +333,7 @@ define <vscale x 2 x i16> @vdiv_vv_nxv2i16(<vscale x 2 x i16> %va, <vscale x 2 x
 ; CHECK-LABEL: vdiv_vv_nxv2i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16,mf2,ta,mu
-; CHECK-NEXT:    vdivu.vv v8, v8, v9
+; CHECK-NEXT:    vdiv.vv v8, v8, v9
 ; CHECK-NEXT:    ret
   %vc = sdiv <vscale x 2 x i16> %va, %vb
   ret <vscale x 2 x i16> %vc
@@ -343,7 +343,7 @@ define <vscale x 2 x i16> @vdiv_vx_nxv2i16(<vscale x 2 x i16> %va, i16 signext %
 ; CHECK-LABEL: vdiv_vx_nxv2i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e16,mf2,ta,mu
-; CHECK-NEXT:    vdivu.vx v8, v8, a0
+; CHECK-NEXT:    vdiv.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 2 x i16> undef, i16 %b, i32 0
   %splat = shufflevector <vscale x 2 x i16> %head, <vscale x 2 x i16> undef, <vscale x 2 x i32> zeroinitializer
@@ -374,7 +374,7 @@ define <vscale x 4 x i16> @vdiv_vv_nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x
 ; CHECK-LABEL: vdiv_vv_nxv4i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16,m1,ta,mu
-; CHECK-NEXT:    vdivu.vv v8, v8, v9
+; CHECK-NEXT:    vdiv.vv v8, v8, v9
 ; CHECK-NEXT:    ret
   %vc = sdiv <vscale x 4 x i16> %va, %vb
   ret <vscale x 4 x i16> %vc
@@ -384,7 +384,7 @@ define <vscale x 4 x i16> @vdiv_vx_nxv4i16(<vscale x 4 x i16> %va, i16 signext %
 ; CHECK-LABEL: vdiv_vx_nxv4i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e16,m1,ta,mu
-; CHECK-NEXT:    vdivu.vx v8, v8, a0
+; CHECK-NEXT:    vdiv.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 4 x i16> undef, i16 %b, i32 0
   %splat = shufflevector <vscale x 4 x i16> %head, <vscale x 4 x i16> undef, <vscale x 4 x i32> zeroinitializer
@@ -415,7 +415,7 @@ define <vscale x 8 x i16> @vdiv_vv_nxv8i16(<vscale x 8 x i16> %va, <vscale x 8 x
 ; CHECK-LABEL: vdiv_vv_nxv8i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16,m2,ta,mu
-; CHECK-NEXT:    vdivu.vv v8, v8, v10
+; CHECK-NEXT:    vdiv.vv v8, v8, v10
 ; CHECK-NEXT:    ret
   %vc = sdiv <vscale x 8 x i16> %va, %vb
   ret <vscale x 8 x i16> %vc
@@ -425,7 +425,7 @@ define <vscale x 8 x i16> @vdiv_vx_nxv8i16(<vscale x 8 x i16> %va, i16 signext %
 ; CHECK-LABEL: vdiv_vx_nxv8i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e16,m2,ta,mu
-; CHECK-NEXT:    vdivu.vx v8, v8, a0
+; CHECK-NEXT:    vdiv.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 8 x i16> undef, i16 %b, i32 0
   %splat = shufflevector <vscale x 8 x i16> %head, <vscale x 8 x i16> undef, <vscale x 8 x i32> zeroinitializer
@@ -456,7 +456,7 @@ define <vscale x 16 x i16> @vdiv_vv_nxv16i16(<vscale x 16 x i16> %va, <vscale x
 ; CHECK-LABEL: vdiv_vv_nxv16i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16,m4,ta,mu
-; CHECK-NEXT:    vdivu.vv v8, v8, v12
+; CHECK-NEXT:    vdiv.vv v8, v8, v12
 ; CHECK-NEXT:    ret
   %vc = sdiv <vscale x 16 x i16> %va, %vb
   ret <vscale x 16 x i16> %vc
@@ -466,7 +466,7 @@ define <vscale x 16 x i16> @vdiv_vx_nxv16i16(<vscale x 16 x i16> %va, i16 signex
 ; CHECK-LABEL: vdiv_vx_nxv16i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e16,m4,ta,mu
-; CHECK-NEXT:    vdivu.vx v8, v8, a0
+; CHECK-NEXT:    vdiv.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 16 x i16> undef, i16 %b, i32 0
   %splat = shufflevector <vscale x 16 x i16> %head, <vscale x 16 x i16> undef, <vscale x 16 x i32> zeroinitializer
@@ -497,7 +497,7 @@ define <vscale x 32 x i16> @vdiv_vv_nxv32i16(<vscale x 32 x i16> %va, <vscale x
 ; CHECK-LABEL: vdiv_vv_nxv32i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16,m8,ta,mu
-; CHECK-NEXT:    vdivu.vv v8, v8, v16
+; CHECK-NEXT:    vdiv.vv v8, v8, v16
 ; CHECK-NEXT:    ret
   %vc = sdiv <vscale x 32 x i16> %va, %vb
   ret <vscale x 32 x i16> %vc
@@ -507,7 +507,7 @@ define <vscale x 32 x i16> @vdiv_vx_nxv32i16(<vscale x 32 x i16> %va, i16 signex
 ; CHECK-LABEL: vdiv_vx_nxv32i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e16,m8,ta,mu
-; CHECK-NEXT:    vdivu.vx v8, v8, a0
+; CHECK-NEXT:    vdiv.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 32 x i16> undef, i16 %b, i32 0
   %splat = shufflevector <vscale x 32 x i16> %head, <vscale x 32 x i16> undef, <vscale x 32 x i32> zeroinitializer
@@ -538,7 +538,7 @@ define <vscale x 1 x i32> @vdiv_vv_nxv1i32(<vscale x 1 x i32> %va, <vscale x 1 x
 ; CHECK-LABEL: vdiv_vv_nxv1i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32,mf2,ta,mu
-; CHECK-NEXT:    vdivu.vv v8, v8, v9
+; CHECK-NEXT:    vdiv.vv v8, v8, v9
 ; CHECK-NEXT:    ret
   %vc = sdiv <vscale x 1 x i32> %va, %vb
   ret <vscale x 1 x i32> %vc
@@ -548,7 +548,7 @@ define <vscale x 1 x i32> @vdiv_vx_nxv1i32(<vscale x 1 x i32> %va, i32 signext %
 ; CHECK-LABEL: vdiv_vx_nxv1i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e32,mf2,ta,mu
-; CHECK-NEXT:    vdivu.vx v8, v8, a0
+; CHECK-NEXT:    vdiv.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 1 x i32> undef, i32 %b, i32 0
   %splat = shufflevector <vscale x 1 x i32> %head, <vscale x 1 x i32> undef, <vscale x 1 x i32> zeroinitializer
@@ -580,7 +580,7 @@ define <vscale x 2 x i32> @vdiv_vv_nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x
 ; CHECK-LABEL: vdiv_vv_nxv2i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32,m1,ta,mu
-; CHECK-NEXT:    vdivu.vv v8, v8, v9
+; CHECK-NEXT:    vdiv.vv v8, v8, v9
 ; CHECK-NEXT:    ret
   %vc = sdiv <vscale x 2 x i32> %va, %vb
   ret <vscale x 2 x i32> %vc
@@ -590,7 +590,7 @@ define <vscale x 2 x i32> @vdiv_vx_nxv2i32(<vscale x 2 x i32> %va, i32 signext %
 ; CHECK-LABEL: vdiv_vx_nxv2i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e32,m1,ta,mu
-; CHECK-NEXT:    vdivu.vx v8, v8, a0
+; CHECK-NEXT:    vdiv.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 2 x i32> undef, i32 %b, i32 0
   %splat = shufflevector <vscale x 2 x i32> %head, <vscale x 2 x i32> undef, <vscale x 2 x i32> zeroinitializer
@@ -622,7 +622,7 @@ define <vscale x 4 x i32> @vdiv_vv_nxv4i32(<vscale x 4 x i32> %va, <vscale x 4 x
 ; CHECK-LABEL: vdiv_vv_nxv4i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32,m2,ta,mu
-; CHECK-NEXT:    vdivu.vv v8, v8, v10
+; CHECK-NEXT:    vdiv.vv v8, v8, v10
 ; CHECK-NEXT:    ret
   %vc = sdiv <vscale x 4 x i32> %va, %vb
   ret <vscale x 4 x i32> %vc
@@ -632,7 +632,7 @@ define <vscale x 4 x i32> @vdiv_vx_nxv4i32(<vscale x 4 x i32> %va, i32 signext %
 ; CHECK-LABEL: vdiv_vx_nxv4i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e32,m2,ta,mu
-; CHECK-NEXT:    vdivu.vx v8, v8, a0
+; CHECK-NEXT:    vdiv.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 4 x i32> undef, i32 %b, i32 0
   %splat = shufflevector <vscale x 4 x i32> %head, <vscale x 4 x i32> undef, <vscale x 4 x i32> zeroinitializer
@@ -664,7 +664,7 @@ define <vscale x 8 x i32> @vdiv_vv_nxv8i32(<vscale x 8 x i32> %va, <vscale x 8 x
 ; CHECK-LABEL: vdiv_vv_nxv8i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32,m4,ta,mu
-; CHECK-NEXT:    vdivu.vv v8, v8, v12
+; CHECK-NEXT:    vdiv.vv v8, v8, v12
 ; CHECK-NEXT:    ret
   %vc = sdiv <vscale x 8 x i32> %va, %vb
   ret <vscale x 8 x i32> %vc
@@ -674,7 +674,7 @@ define <vscale x 8 x i32> @vdiv_vx_nxv8i32(<vscale x 8 x i32> %va, i32 signext %
 ; CHECK-LABEL: vdiv_vx_nxv8i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e32,m4,ta,mu
-; CHECK-NEXT:    vdivu.vx v8, v8, a0
+; CHECK-NEXT:    vdiv.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 8 x i32> undef, i32 %b, i32 0
   %splat = shufflevector <vscale x 8 x i32> %head, <vscale x 8 x i32> undef, <vscale x 8 x i32> zeroinitializer
@@ -706,7 +706,7 @@ define <vscale x 16 x i32> @vdiv_vv_nxv16i32(<vscale x 16 x i32> %va, <vscale x
 ; CHECK-LABEL: vdiv_vv_nxv16i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32,m8,ta,mu
-; CHECK-NEXT:    vdivu.vv v8, v8, v16
+; CHECK-NEXT:    vdiv.vv v8, v8, v16
 ; CHECK-NEXT:    ret
   %vc = sdiv <vscale x 16 x i32> %va, %vb
   ret <vscale x 16 x i32> %vc
@@ -716,7 +716,7 @@ define <vscale x 16 x i32> @vdiv_vx_nxv16i32(<vscale x 16 x i32> %va, i32 signex
 ; CHECK-LABEL: vdiv_vx_nxv16i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e32,m8,ta,mu
-; CHECK-NEXT:    vdivu.vx v8, v8, a0
+; CHECK-NEXT:    vdiv.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 16 x i32> undef, i32 %b, i32 0
   %splat = shufflevector <vscale x 16 x i32> %head, <vscale x 16 x i32> undef, <vscale x 16 x i32> zeroinitializer
@@ -748,7 +748,7 @@ define <vscale x 1 x i64> @vdiv_vv_nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x
 ; CHECK-LABEL: vdiv_vv_nxv1i64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e64,m1,ta,mu
-; CHECK-NEXT:    vdivu.vv v8, v8, v9
+; CHECK-NEXT:    vdiv.vv v8, v8, v9
 ; CHECK-NEXT:    ret
   %vc = sdiv <vscale x 1 x i64> %va, %vb
   ret <vscale x 1 x i64> %vc
@@ -758,7 +758,7 @@ define <vscale x 1 x i64> @vdiv_vx_nxv1i64(<vscale x 1 x i64> %va, i64 %b) {
 ; CHECK-LABEL: vdiv_vx_nxv1i64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e64,m1,ta,mu
-; CHECK-NEXT:    vdivu.vx v8, v8, a0
+; CHECK-NEXT:    vdiv.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 1 x i64> undef, i64 %b, i32 0
   %splat = shufflevector <vscale x 1 x i64> %head, <vscale x 1 x i64> undef, <vscale x 1 x i32> zeroinitializer
@@ -796,7 +796,7 @@ define <vscale x 2 x i64> @vdiv_vv_nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x
 ; CHECK-LABEL: vdiv_vv_nxv2i64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e64,m2,ta,mu
-; CHECK-NEXT:    vdivu.vv v8, v8, v10
+; CHECK-NEXT:    vdiv.vv v8, v8, v10
 ; CHECK-NEXT:    ret
   %vc = sdiv <vscale x 2 x i64> %va, %vb
   ret <vscale x 2 x i64> %vc
@@ -806,7 +806,7 @@ define <vscale x 2 x i64> @vdiv_vx_nxv2i64(<vscale x 2 x i64> %va, i64 %b) {
 ; CHECK-LABEL: vdiv_vx_nxv2i64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e64,m2,ta,mu
-; CHECK-NEXT:    vdivu.vx v8, v8, a0
+; CHECK-NEXT:    vdiv.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 2 x i64> undef, i64 %b, i32 0
   %splat = shufflevector <vscale x 2 x i64> %head, <vscale x 2 x i64> undef, <vscale x 2 x i32> zeroinitializer
@@ -844,7 +844,7 @@ define <vscale x 4 x i64> @vdiv_vv_nxv4i64(<vscale x 4 x i64> %va, <vscale x 4 x
 ; CHECK-LABEL: vdiv_vv_nxv4i64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e64,m4,ta,mu
-; CHECK-NEXT:    vdivu.vv v8, v8, v12
+; CHECK-NEXT:    vdiv.vv v8, v8, v12
 ; CHECK-NEXT:    ret
   %vc = sdiv <vscale x 4 x i64> %va, %vb
   ret <vscale x 4 x i64> %vc
@@ -854,7 +854,7 @@ define <vscale x 4 x i64> @vdiv_vx_nxv4i64(<vscale x 4 x i64> %va, i64 %b) {
 ; CHECK-LABEL: vdiv_vx_nxv4i64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e64,m4,ta,mu
-; CHECK-NEXT:    vdivu.vx v8, v8, a0
+; CHECK-NEXT:    vdiv.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 4 x i64> undef, i64 %b, i32 0
   %splat = shufflevector <vscale x 4 x i64> %head, <vscale x 4 x i64> undef, <vscale x 4 x i32> zeroinitializer
@@ -892,7 +892,7 @@ define <vscale x 8 x i64> @vdiv_vv_nxv8i64(<vscale x 8 x i64> %va, <vscale x 8 x
 ; CHECK-LABEL: vdiv_vv_nxv8i64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e64,m8,ta,mu
-; CHECK-NEXT:    vdivu.vv v8, v8, v16
+; CHECK-NEXT:    vdiv.vv v8, v8, v16
 ; CHECK-NEXT:    ret
   %vc = sdiv <vscale x 8 x i64> %va, %vb
   ret <vscale x 8 x i64> %vc
@@ -902,7 +902,7 @@ define <vscale x 8 x i64> @vdiv_vx_nxv8i64(<vscale x 8 x i64> %va, i64 %b) {
 ; CHECK-LABEL: vdiv_vx_nxv8i64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e64,m8,ta,mu
-; CHECK-NEXT:    vdivu.vx v8, v8, a0
+; CHECK-NEXT:    vdiv.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 8 x i64> undef, i64 %b, i32 0
   %splat = shufflevector <vscale x 8 x i64> %head, <vscale x 8 x i64> undef, <vscale x 8 x i32> zeroinitializer
diff --git a/llvm/test/CodeGen/RISCV/rvv/vdivu-sdnode-rv32.ll b/llvm/test/CodeGen/RISCV/rvv/vdivu-sdnode-rv32.ll
index 27b27cd64bae..383d3f380fe8 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vdivu-sdnode-rv32.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vdivu-sdnode-rv32.ll
@@ -5,7 +5,7 @@ define <vscale x 1 x i8> @vdivu_vv_nxv1i8(<vscale x 1 x i8> %va, <vscale x 1 x i
 ; CHECK-LABEL: vdivu_vv_nxv1i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e8,mf8,ta,mu
-; CHECK-NEXT:    vdiv.vv v8, v8, v9
+; CHECK-NEXT:    vdivu.vv v8, v8, v9
 ; CHECK-NEXT:    ret
   %vc = udiv <vscale x 1 x i8> %va, %vb
   ret <vscale x 1 x i8> %vc
@@ -15,7 +15,7 @@ define <vscale x 1 x i8> @vdivu_vx_nxv1i8(<vscale x 1 x i8> %va, i8 signext %b)
 ; CHECK-LABEL: vdivu_vx_nxv1i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e8,mf8,ta,mu
-; CHECK-NEXT:    vdiv.vx v8, v8, a0
+; CHECK-NEXT:    vdivu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 1 x i8> undef, i8 %b, i32 0
   %splat = shufflevector <vscale x 1 x i8> %head, <vscale x 1 x i8> undef, <vscale x 1 x i32> zeroinitializer
@@ -68,7 +68,7 @@ define <vscale x 2 x i8> @vdivu_vv_nxv2i8(<vscale x 2 x i8> %va, <vscale x 2 x i
 ; CHECK-LABEL: vdivu_vv_nxv2i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e8,mf4,ta,mu
-; CHECK-NEXT:    vdiv.vv v8, v8, v9
+; CHECK-NEXT:    vdivu.vv v8, v8, v9
 ; CHECK-NEXT:    ret
   %vc = udiv <vscale x 2 x i8> %va, %vb
   ret <vscale x 2 x i8> %vc
@@ -78,7 +78,7 @@ define <vscale x 2 x i8> @vdivu_vx_nxv2i8(<vscale x 2 x i8> %va, i8 signext %b)
 ; CHECK-LABEL: vdivu_vx_nxv2i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e8,mf4,ta,mu
-; CHECK-NEXT:    vdiv.vx v8, v8, a0
+; CHECK-NEXT:    vdivu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 2 x i8> undef, i8 %b, i32 0
   %splat = shufflevector <vscale x 2 x i8> %head, <vscale x 2 x i8> undef, <vscale x 2 x i32> zeroinitializer
@@ -107,7 +107,7 @@ define <vscale x 4 x i8> @vdivu_vv_nxv4i8(<vscale x 4 x i8> %va, <vscale x 4 x i
 ; CHECK-LABEL: vdivu_vv_nxv4i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e8,mf2,ta,mu
-; CHECK-NEXT:    vdiv.vv v8, v8, v9
+; CHECK-NEXT:    vdivu.vv v8, v8, v9
 ; CHECK-NEXT:    ret
   %vc = udiv <vscale x 4 x i8> %va, %vb
   ret <vscale x 4 x i8> %vc
@@ -117,7 +117,7 @@ define <vscale x 4 x i8> @vdivu_vx_nxv4i8(<vscale x 4 x i8> %va, i8 signext %b)
 ; CHECK-LABEL: vdivu_vx_nxv4i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e8,mf2,ta,mu
-; CHECK-NEXT:    vdiv.vx v8, v8, a0
+; CHECK-NEXT:    vdivu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 4 x i8> undef, i8 %b, i32 0
   %splat = shufflevector <vscale x 4 x i8> %head, <vscale x 4 x i8> undef, <vscale x 4 x i32> zeroinitializer
@@ -146,7 +146,7 @@ define <vscale x 8 x i8> @vdivu_vv_nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i
 ; CHECK-LABEL: vdivu_vv_nxv8i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e8,m1,ta,mu
-; CHECK-NEXT:    vdiv.vv v8, v8, v9
+; CHECK-NEXT:    vdivu.vv v8, v8, v9
 ; CHECK-NEXT:    ret
   %vc = udiv <vscale x 8 x i8> %va, %vb
   ret <vscale x 8 x i8> %vc
@@ -156,7 +156,7 @@ define <vscale x 8 x i8> @vdivu_vx_nxv8i8(<vscale x 8 x i8> %va, i8 signext %b)
 ; CHECK-LABEL: vdivu_vx_nxv8i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e8,m1,ta,mu
-; CHECK-NEXT:    vdiv.vx v8, v8, a0
+; CHECK-NEXT:    vdivu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 8 x i8> undef, i8 %b, i32 0
   %splat = shufflevector <vscale x 8 x i8> %head, <vscale x 8 x i8> undef, <vscale x 8 x i32> zeroinitializer
@@ -185,7 +185,7 @@ define <vscale x 16 x i8> @vdivu_vv_nxv16i8(<vscale x 16 x i8> %va, <vscale x 16
 ; CHECK-LABEL: vdivu_vv_nxv16i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e8,m2,ta,mu
-; CHECK-NEXT:    vdiv.vv v8, v8, v10
+; CHECK-NEXT:    vdivu.vv v8, v8, v10
 ; CHECK-NEXT:    ret
   %vc = udiv <vscale x 16 x i8> %va, %vb
   ret <vscale x 16 x i8> %vc
@@ -195,7 +195,7 @@ define <vscale x 16 x i8> @vdivu_vx_nxv16i8(<vscale x 16 x i8> %va, i8 signext %
 ; CHECK-LABEL: vdivu_vx_nxv16i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e8,m2,ta,mu
-; CHECK-NEXT:    vdiv.vx v8, v8, a0
+; CHECK-NEXT:    vdivu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 16 x i8> undef, i8 %b, i32 0
   %splat = shufflevector <vscale x 16 x i8> %head, <vscale x 16 x i8> undef, <vscale x 16 x i32> zeroinitializer
@@ -224,7 +224,7 @@ define <vscale x 32 x i8> @vdivu_vv_nxv32i8(<vscale x 32 x i8> %va, <vscale x 32
 ; CHECK-LABEL: vdivu_vv_nxv32i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e8,m4,ta,mu
-; CHECK-NEXT:    vdiv.vv v8, v8, v12
+; CHECK-NEXT:    vdivu.vv v8, v8, v12
 ; CHECK-NEXT:    ret
   %vc = udiv <vscale x 32 x i8> %va, %vb
   ret <vscale x 32 x i8> %vc
@@ -234,7 +234,7 @@ define <vscale x 32 x i8> @vdivu_vx_nxv32i8(<vscale x 32 x i8> %va, i8 signext %
 ; CHECK-LABEL: vdivu_vx_nxv32i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e8,m4,ta,mu
-; CHECK-NEXT:    vdiv.vx v8, v8, a0
+; CHECK-NEXT:    vdivu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 32 x i8> undef, i8 %b, i32 0
   %splat = shufflevector <vscale x 32 x i8> %head, <vscale x 32 x i8> undef, <vscale x 32 x i32> zeroinitializer
@@ -263,7 +263,7 @@ define <vscale x 64 x i8> @vdivu_vv_nxv64i8(<vscale x 64 x i8> %va, <vscale x 64
 ; CHECK-LABEL: vdivu_vv_nxv64i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e8,m8,ta,mu
-; CHECK-NEXT:    vdiv.vv v8, v8, v16
+; CHECK-NEXT:    vdivu.vv v8, v8, v16
 ; CHECK-NEXT:    ret
   %vc = udiv <vscale x 64 x i8> %va, %vb
   ret <vscale x 64 x i8> %vc
@@ -273,7 +273,7 @@ define <vscale x 64 x i8> @vdivu_vx_nxv64i8(<vscale x 64 x i8> %va, i8 signext %
 ; CHECK-LABEL: vdivu_vx_nxv64i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e8,m8,ta,mu
-; CHECK-NEXT:    vdiv.vx v8, v8, a0
+; CHECK-NEXT:    vdivu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 64 x i8> undef, i8 %b, i32 0
   %splat = shufflevector <vscale x 64 x i8> %head, <vscale x 64 x i8> undef, <vscale x 64 x i32> zeroinitializer
@@ -302,7 +302,7 @@ define <vscale x 1 x i16> @vdivu_vv_nxv1i16(<vscale x 1 x i16> %va, <vscale x 1
 ; CHECK-LABEL: vdivu_vv_nxv1i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16,mf4,ta,mu
-; CHECK-NEXT:    vdiv.vv v8, v8, v9
+; CHECK-NEXT:    vdivu.vv v8, v8, v9
 ; CHECK-NEXT:    ret
   %vc = udiv <vscale x 1 x i16> %va, %vb
   ret <vscale x 1 x i16> %vc
@@ -312,7 +312,7 @@ define <vscale x 1 x i16> @vdivu_vx_nxv1i16(<vscale x 1 x i16> %va, i16 signext
 ; CHECK-LABEL: vdivu_vx_nxv1i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e16,mf4,ta,mu
-; CHECK-NEXT:    vdiv.vx v8, v8, a0
+; CHECK-NEXT:    vdivu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 1 x i16> undef, i16 %b, i32 0
   %splat = shufflevector <vscale x 1 x i16> %head, <vscale x 1 x i16> undef, <vscale x 1 x i32> zeroinitializer
@@ -342,7 +342,7 @@ define <vscale x 2 x i16> @vdivu_vv_nxv2i16(<vscale x 2 x i16> %va, <vscale x 2
 ; CHECK-LABEL: vdivu_vv_nxv2i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16,mf2,ta,mu
-; CHECK-NEXT:    vdiv.vv v8, v8, v9
+; CHECK-NEXT:    vdivu.vv v8, v8, v9
 ; CHECK-NEXT:    ret
   %vc = udiv <vscale x 2 x i16> %va, %vb
   ret <vscale x 2 x i16> %vc
@@ -352,7 +352,7 @@ define <vscale x 2 x i16> @vdivu_vx_nxv2i16(<vscale x 2 x i16> %va, i16 signext
 ; CHECK-LABEL: vdivu_vx_nxv2i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e16,mf2,ta,mu
-; CHECK-NEXT:    vdiv.vx v8, v8, a0
+; CHECK-NEXT:    vdivu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 2 x i16> undef, i16 %b, i32 0
   %splat = shufflevector <vscale x 2 x i16> %head, <vscale x 2 x i16> undef, <vscale x 2 x i32> zeroinitializer
@@ -382,7 +382,7 @@ define <vscale x 4 x i16> @vdivu_vv_nxv4i16(<vscale x 4 x i16> %va, <vscale x 4
 ; CHECK-LABEL: vdivu_vv_nxv4i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16,m1,ta,mu
-; CHECK-NEXT:    vdiv.vv v8, v8, v9
+; CHECK-NEXT:    vdivu.vv v8, v8, v9
 ; CHECK-NEXT:    ret
   %vc = udiv <vscale x 4 x i16> %va, %vb
   ret <vscale x 4 x i16> %vc
@@ -392,7 +392,7 @@ define <vscale x 4 x i16> @vdivu_vx_nxv4i16(<vscale x 4 x i16> %va, i16 signext
 ; CHECK-LABEL: vdivu_vx_nxv4i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e16,m1,ta,mu
-; CHECK-NEXT:    vdiv.vx v8, v8, a0
+; CHECK-NEXT:    vdivu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 4 x i16> undef, i16 %b, i32 0
   %splat = shufflevector <vscale x 4 x i16> %head, <vscale x 4 x i16> undef, <vscale x 4 x i32> zeroinitializer
@@ -422,7 +422,7 @@ define <vscale x 8 x i16> @vdivu_vv_nxv8i16(<vscale x 8 x i16> %va, <vscale x 8
 ; CHECK-LABEL: vdivu_vv_nxv8i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16,m2,ta,mu
-; CHECK-NEXT:    vdiv.vv v8, v8, v10
+; CHECK-NEXT:    vdivu.vv v8, v8, v10
 ; CHECK-NEXT:    ret
   %vc = udiv <vscale x 8 x i16> %va, %vb
   ret <vscale x 8 x i16> %vc
@@ -432,7 +432,7 @@ define <vscale x 8 x i16> @vdivu_vx_nxv8i16(<vscale x 8 x i16> %va, i16 signext
 ; CHECK-LABEL: vdivu_vx_nxv8i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e16,m2,ta,mu
-; CHECK-NEXT:    vdiv.vx v8, v8, a0
+; CHECK-NEXT:    vdivu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 8 x i16> undef, i16 %b, i32 0
   %splat = shufflevector <vscale x 8 x i16> %head, <vscale x 8 x i16> undef, <vscale x 8 x i32> zeroinitializer
@@ -462,7 +462,7 @@ define <vscale x 16 x i16> @vdivu_vv_nxv16i16(<vscale x 16 x i16> %va, <vscale x
 ; CHECK-LABEL: vdivu_vv_nxv16i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16,m4,ta,mu
-; CHECK-NEXT:    vdiv.vv v8, v8, v12
+; CHECK-NEXT:    vdivu.vv v8, v8, v12
 ; CHECK-NEXT:    ret
   %vc = udiv <vscale x 16 x i16> %va, %vb
   ret <vscale x 16 x i16> %vc
@@ -472,7 +472,7 @@ define <vscale x 16 x i16> @vdivu_vx_nxv16i16(<vscale x 16 x i16> %va, i16 signe
 ; CHECK-LABEL: vdivu_vx_nxv16i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e16,m4,ta,mu
-; CHECK-NEXT:    vdiv.vx v8, v8, a0
+; CHECK-NEXT:    vdivu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 16 x i16> undef, i16 %b, i32 0
   %splat = shufflevector <vscale x 16 x i16> %head, <vscale x 16 x i16> undef, <vscale x 16 x i32> zeroinitializer
@@ -502,7 +502,7 @@ define <vscale x 32 x i16> @vdivu_vv_nxv32i16(<vscale x 32 x i16> %va, <vscale x
 ; CHECK-LABEL: vdivu_vv_nxv32i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16,m8,ta,mu
-; CHECK-NEXT:    vdiv.vv v8, v8, v16
+; CHECK-NEXT:    vdivu.vv v8, v8, v16
 ; CHECK-NEXT:    ret
   %vc = udiv <vscale x 32 x i16> %va, %vb
   ret <vscale x 32 x i16> %vc
@@ -512,7 +512,7 @@ define <vscale x 32 x i16> @vdivu_vx_nxv32i16(<vscale x 32 x i16> %va, i16 signe
 ; CHECK-LABEL: vdivu_vx_nxv32i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e16,m8,ta,mu
-; CHECK-NEXT:    vdiv.vx v8, v8, a0
+; CHECK-NEXT:    vdivu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 32 x i16> undef, i16 %b, i32 0
   %splat = shufflevector <vscale x 32 x i16> %head, <vscale x 32 x i16> undef, <vscale x 32 x i32> zeroinitializer
@@ -542,7 +542,7 @@ define <vscale x 1 x i32> @vdivu_vv_nxv1i32(<vscale x 1 x i32> %va, <vscale x 1
 ; CHECK-LABEL: vdivu_vv_nxv1i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32,mf2,ta,mu
-; CHECK-NEXT:    vdiv.vv v8, v8, v9
+; CHECK-NEXT:    vdivu.vv v8, v8, v9
 ; CHECK-NEXT:    ret
   %vc = udiv <vscale x 1 x i32> %va, %vb
   ret <vscale x 1 x i32> %vc
@@ -552,7 +552,7 @@ define <vscale x 1 x i32> @vdivu_vx_nxv1i32(<vscale x 1 x i32> %va, i32 %b) {
 ; CHECK-LABEL: vdivu_vx_nxv1i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e32,mf2,ta,mu
-; CHECK-NEXT:    vdiv.vx v8, v8, a0
+; CHECK-NEXT:    vdivu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 1 x i32> undef, i32 %b, i32 0
   %splat = shufflevector <vscale x 1 x i32> %head, <vscale x 1 x i32> undef, <vscale x 1 x i32> zeroinitializer
@@ -582,7 +582,7 @@ define <vscale x 2 x i32> @vdivu_vv_nxv2i32(<vscale x 2 x i32> %va, <vscale x 2
 ; CHECK-LABEL: vdivu_vv_nxv2i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32,m1,ta,mu
-; CHECK-NEXT:    vdiv.vv v8, v8, v9
+; CHECK-NEXT:    vdivu.vv v8, v8, v9
 ; CHECK-NEXT:    ret
   %vc = udiv <vscale x 2 x i32> %va, %vb
   ret <vscale x 2 x i32> %vc
@@ -592,7 +592,7 @@ define <vscale x 2 x i32> @vdivu_vx_nxv2i32(<vscale x 2 x i32> %va, i32 %b) {
 ; CHECK-LABEL: vdivu_vx_nxv2i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e32,m1,ta,mu
-; CHECK-NEXT:    vdiv.vx v8, v8, a0
+; CHECK-NEXT:    vdivu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 2 x i32> undef, i32 %b, i32 0
   %splat = shufflevector <vscale x 2 x i32> %head, <vscale x 2 x i32> undef, <vscale x 2 x i32> zeroinitializer
@@ -622,7 +622,7 @@ define <vscale x 4 x i32> @vdivu_vv_nxv4i32(<vscale x 4 x i32> %va, <vscale x 4
 ; CHECK-LABEL: vdivu_vv_nxv4i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32,m2,ta,mu
-; CHECK-NEXT:    vdiv.vv v8, v8, v10
+; CHECK-NEXT:    vdivu.vv v8, v8, v10
 ; CHECK-NEXT:    ret
   %vc = udiv <vscale x 4 x i32> %va, %vb
   ret <vscale x 4 x i32> %vc
@@ -632,7 +632,7 @@ define <vscale x 4 x i32> @vdivu_vx_nxv4i32(<vscale x 4 x i32> %va, i32 %b) {
 ; CHECK-LABEL: vdivu_vx_nxv4i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e32,m2,ta,mu
-; CHECK-NEXT:    vdiv.vx v8, v8, a0
+; CHECK-NEXT:    vdivu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 4 x i32> undef, i32 %b, i32 0
   %splat = shufflevector <vscale x 4 x i32> %head, <vscale x 4 x i32> undef, <vscale x 4 x i32> zeroinitializer
@@ -662,7 +662,7 @@ define <vscale x 8 x i32> @vdivu_vv_nxv8i32(<vscale x 8 x i32> %va, <vscale x 8
 ; CHECK-LABEL: vdivu_vv_nxv8i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32,m4,ta,mu
-; CHECK-NEXT:    vdiv.vv v8, v8, v12
+; CHECK-NEXT:    vdivu.vv v8, v8, v12
 ; CHECK-NEXT:    ret
   %vc = udiv <vscale x 8 x i32> %va, %vb
   ret <vscale x 8 x i32> %vc
@@ -672,7 +672,7 @@ define <vscale x 8 x i32> @vdivu_vx_nxv8i32(<vscale x 8 x i32> %va, i32 %b) {
 ; CHECK-LABEL: vdivu_vx_nxv8i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e32,m4,ta,mu
-; CHECK-NEXT:    vdiv.vx v8, v8, a0
+; CHECK-NEXT:    vdivu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 8 x i32> undef, i32 %b, i32 0
   %splat = shufflevector <vscale x 8 x i32> %head, <vscale x 8 x i32> undef, <vscale x 8 x i32> zeroinitializer
@@ -702,7 +702,7 @@ define <vscale x 16 x i32> @vdivu_vv_nxv16i32(<vscale x 16 x i32> %va, <vscale x
 ; CHECK-LABEL: vdivu_vv_nxv16i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32,m8,ta,mu
-; CHECK-NEXT:    vdiv.vv v8, v8, v16
+; CHECK-NEXT:    vdivu.vv v8, v8, v16
 ; CHECK-NEXT:    ret
   %vc = udiv <vscale x 16 x i32> %va, %vb
   ret <vscale x 16 x i32> %vc
@@ -712,7 +712,7 @@ define <vscale x 16 x i32> @vdivu_vx_nxv16i32(<vscale x 16 x i32> %va, i32 %b) {
 ; CHECK-LABEL: vdivu_vx_nxv16i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e32,m8,ta,mu
-; CHECK-NEXT:    vdiv.vx v8, v8, a0
+; CHECK-NEXT:    vdivu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 16 x i32> undef, i32 %b, i32 0
   %splat = shufflevector <vscale x 16 x i32> %head, <vscale x 16 x i32> undef, <vscale x 16 x i32> zeroinitializer
@@ -742,7 +742,7 @@ define <vscale x 1 x i64> @vdivu_vv_nxv1i64(<vscale x 1 x i64> %va, <vscale x 1
 ; CHECK-LABEL: vdivu_vv_nxv1i64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e64,m1,ta,mu
-; CHECK-NEXT:    vdiv.vv v8, v8, v9
+; CHECK-NEXT:    vdivu.vv v8, v8, v9
 ; CHECK-NEXT:    ret
   %vc = udiv <vscale x 1 x i64> %va, %vb
   ret <vscale x 1 x i64> %vc
@@ -759,7 +759,7 @@ define <vscale x 1 x i64> @vdivu_vx_nxv1i64(<vscale x 1 x i64> %va, i64 %b) {
 ; CHECK-NEXT:    vsll.vx v26, v26, a1
 ; CHECK-NEXT:    vsrl.vx v26, v26, a1
 ; CHECK-NEXT:    vor.vv v25, v26, v25
-; CHECK-NEXT:    vdiv.vv v8, v8, v25
+; CHECK-NEXT:    vdivu.vv v8, v8, v25
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 1 x i64> undef, i64 %b, i32 0
   %splat = shufflevector <vscale x 1 x i64> %head, <vscale x 1 x i64> undef, <vscale x 1 x i32> zeroinitializer
@@ -796,7 +796,7 @@ define <vscale x 2 x i64> @vdivu_vv_nxv2i64(<vscale x 2 x i64> %va, <vscale x 2
 ; CHECK-LABEL: vdivu_vv_nxv2i64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e64,m2,ta,mu
-; CHECK-NEXT:    vdiv.vv v8, v8, v10
+; CHECK-NEXT:    vdivu.vv v8, v8, v10
 ; CHECK-NEXT:    ret
   %vc = udiv <vscale x 2 x i64> %va, %vb
   ret <vscale x 2 x i64> %vc
@@ -813,7 +813,7 @@ define <vscale x 2 x i64> @vdivu_vx_nxv2i64(<vscale x 2 x i64> %va, i64 %b) {
 ; CHECK-NEXT:    vsll.vx v28, v28, a1
 ; CHECK-NEXT:    vsrl.vx v28, v28, a1
 ; CHECK-NEXT:    vor.vv v26, v28, v26
-; CHECK-NEXT:    vdiv.vv v8, v8, v26
+; CHECK-NEXT:    vdivu.vv v8, v8, v26
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 2 x i64> undef, i64 %b, i32 0
   %splat = shufflevector <vscale x 2 x i64> %head, <vscale x 2 x i64> undef, <vscale x 2 x i32> zeroinitializer
@@ -850,7 +850,7 @@ define <vscale x 4 x i64> @vdivu_vv_nxv4i64(<vscale x 4 x i64> %va, <vscale x 4
 ; CHECK-LABEL: vdivu_vv_nxv4i64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e64,m4,ta,mu
-; CHECK-NEXT:    vdiv.vv v8, v8, v12
+; CHECK-NEXT:    vdivu.vv v8, v8, v12
 ; CHECK-NEXT:    ret
   %vc = udiv <vscale x 4 x i64> %va, %vb
   ret <vscale x 4 x i64> %vc
@@ -867,7 +867,7 @@ define <vscale x 4 x i64> @vdivu_vx_nxv4i64(<vscale x 4 x i64> %va, i64 %b) {
 ; CHECK-NEXT:    vsll.vx v12, v12, a1
 ; CHECK-NEXT:    vsrl.vx v12, v12, a1
 ; CHECK-NEXT:    vor.vv v28, v12, v28
-; CHECK-NEXT:    vdiv.vv v8, v8, v28
+; CHECK-NEXT:    vdivu.vv v8, v8, v28
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 4 x i64> undef, i64 %b, i32 0
   %splat = shufflevector <vscale x 4 x i64> %head, <vscale x 4 x i64> undef, <vscale x 4 x i32> zeroinitializer
@@ -904,7 +904,7 @@ define <vscale x 8 x i64> @vdivu_vv_nxv8i64(<vscale x 8 x i64> %va, <vscale x 8
 ; CHECK-LABEL: vdivu_vv_nxv8i64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e64,m8,ta,mu
-; CHECK-NEXT:    vdiv.vv v8, v8, v16
+; CHECK-NEXT:    vdivu.vv v8, v8, v16
 ; CHECK-NEXT:    ret
   %vc = udiv <vscale x 8 x i64> %va, %vb
   ret <vscale x 8 x i64> %vc
@@ -921,7 +921,7 @@ define <vscale x 8 x i64> @vdivu_vx_nxv8i64(<vscale x 8 x i64> %va, i64 %b) {
 ; CHECK-NEXT:    vsll.vx v24, v24, a1
 ; CHECK-NEXT:    vsrl.vx v24, v24, a1
 ; CHECK-NEXT:    vor.vv v16, v24, v16
-; CHECK-NEXT:    vdiv.vv v8, v8, v16
+; CHECK-NEXT:    vdivu.vv v8, v8, v16
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 8 x i64> undef, i64 %b, i32 0
   %splat = shufflevector <vscale x 8 x i64> %head, <vscale x 8 x i64> undef, <vscale x 8 x i32> zeroinitializer
diff --git a/llvm/test/CodeGen/RISCV/rvv/vdivu-sdnode-rv64.ll b/llvm/test/CodeGen/RISCV/rvv/vdivu-sdnode-rv64.ll
index 70cd4fba1eb7..bc72099d75eb 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vdivu-sdnode-rv64.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vdivu-sdnode-rv64.ll
@@ -5,7 +5,7 @@ define <vscale x 1 x i8> @vdivu_vv_nxv1i8(<vscale x 1 x i8> %va, <vscale x 1 x i
 ; CHECK-LABEL: vdivu_vv_nxv1i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e8,mf8,ta,mu
-; CHECK-NEXT:    vdiv.vv v8, v8, v9
+; CHECK-NEXT:    vdivu.vv v8, v8, v9
 ; CHECK-NEXT:    ret
   %vc = udiv <vscale x 1 x i8> %va, %vb
   ret <vscale x 1 x i8> %vc
@@ -15,7 +15,7 @@ define <vscale x 1 x i8> @vdivu_vx_nxv1i8(<vscale x 1 x i8> %va, i8 signext %b)
 ; CHECK-LABEL: vdivu_vx_nxv1i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e8,mf8,ta,mu
-; CHECK-NEXT:    vdiv.vx v8, v8, a0
+; CHECK-NEXT:    vdivu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 1 x i8> undef, i8 %b, i32 0
   %splat = shufflevector <vscale x 1 x i8> %head, <vscale x 1 x i8> undef, <vscale x 1 x i32> zeroinitializer
@@ -44,7 +44,7 @@ define <vscale x 2 x i8> @vdivu_vv_nxv2i8(<vscale x 2 x i8> %va, <vscale x 2 x i
 ; CHECK-LABEL: vdivu_vv_nxv2i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e8,mf4,ta,mu
-; CHECK-NEXT:    vdiv.vv v8, v8, v9
+; CHECK-NEXT:    vdivu.vv v8, v8, v9
 ; CHECK-NEXT:    ret
   %vc = udiv <vscale x 2 x i8> %va, %vb
   ret <vscale x 2 x i8> %vc
@@ -54,7 +54,7 @@ define <vscale x 2 x i8> @vdivu_vx_nxv2i8(<vscale x 2 x i8> %va, i8 signext %b)
 ; CHECK-LABEL: vdivu_vx_nxv2i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e8,mf4,ta,mu
-; CHECK-NEXT:    vdiv.vx v8, v8, a0
+; CHECK-NEXT:    vdivu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 2 x i8> undef, i8 %b, i32 0
   %splat = shufflevector <vscale x 2 x i8> %head, <vscale x 2 x i8> undef, <vscale x 2 x i32> zeroinitializer
@@ -83,7 +83,7 @@ define <vscale x 4 x i8> @vdivu_vv_nxv4i8(<vscale x 4 x i8> %va, <vscale x 4 x i
 ; CHECK-LABEL: vdivu_vv_nxv4i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e8,mf2,ta,mu
-; CHECK-NEXT:    vdiv.vv v8, v8, v9
+; CHECK-NEXT:    vdivu.vv v8, v8, v9
 ; CHECK-NEXT:    ret
   %vc = udiv <vscale x 4 x i8> %va, %vb
   ret <vscale x 4 x i8> %vc
@@ -93,7 +93,7 @@ define <vscale x 4 x i8> @vdivu_vx_nxv4i8(<vscale x 4 x i8> %va, i8 signext %b)
 ; CHECK-LABEL: vdivu_vx_nxv4i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e8,mf2,ta,mu
-; CHECK-NEXT:    vdiv.vx v8, v8, a0
+; CHECK-NEXT:    vdivu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 4 x i8> undef, i8 %b, i32 0
   %splat = shufflevector <vscale x 4 x i8> %head, <vscale x 4 x i8> undef, <vscale x 4 x i32> zeroinitializer
@@ -122,7 +122,7 @@ define <vscale x 8 x i8> @vdivu_vv_nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i
 ; CHECK-LABEL: vdivu_vv_nxv8i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e8,m1,ta,mu
-; CHECK-NEXT:    vdiv.vv v8, v8, v9
+; CHECK-NEXT:    vdivu.vv v8, v8, v9
 ; CHECK-NEXT:    ret
   %vc = udiv <vscale x 8 x i8> %va, %vb
   ret <vscale x 8 x i8> %vc
@@ -132,7 +132,7 @@ define <vscale x 8 x i8> @vdivu_vx_nxv8i8(<vscale x 8 x i8> %va, i8 signext %b)
 ; CHECK-LABEL: vdivu_vx_nxv8i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e8,m1,ta,mu
-; CHECK-NEXT:    vdiv.vx v8, v8, a0
+; CHECK-NEXT:    vdivu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 8 x i8> undef, i8 %b, i32 0
   %splat = shufflevector <vscale x 8 x i8> %head, <vscale x 8 x i8> undef, <vscale x 8 x i32> zeroinitializer
@@ -161,7 +161,7 @@ define <vscale x 16 x i8> @vdivu_vv_nxv16i8(<vscale x 16 x i8> %va, <vscale x 16
 ; CHECK-LABEL: vdivu_vv_nxv16i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e8,m2,ta,mu
-; CHECK-NEXT:    vdiv.vv v8, v8, v10
+; CHECK-NEXT:    vdivu.vv v8, v8, v10
 ; CHECK-NEXT:    ret
   %vc = udiv <vscale x 16 x i8> %va, %vb
   ret <vscale x 16 x i8> %vc
@@ -171,7 +171,7 @@ define <vscale x 16 x i8> @vdivu_vx_nxv16i8(<vscale x 16 x i8> %va, i8 signext %
 ; CHECK-LABEL: vdivu_vx_nxv16i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e8,m2,ta,mu
-; CHECK-NEXT:    vdiv.vx v8, v8, a0
+; CHECK-NEXT:    vdivu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 16 x i8> undef, i8 %b, i32 0
   %splat = shufflevector <vscale x 16 x i8> %head, <vscale x 16 x i8> undef, <vscale x 16 x i32> zeroinitializer
@@ -200,7 +200,7 @@ define <vscale x 32 x i8> @vdivu_vv_nxv32i8(<vscale x 32 x i8> %va, <vscale x 32
 ; CHECK-LABEL: vdivu_vv_nxv32i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e8,m4,ta,mu
-; CHECK-NEXT:    vdiv.vv v8, v8, v12
+; CHECK-NEXT:    vdivu.vv v8, v8, v12
 ; CHECK-NEXT:    ret
   %vc = udiv <vscale x 32 x i8> %va, %vb
   ret <vscale x 32 x i8> %vc
@@ -210,7 +210,7 @@ define <vscale x 32 x i8> @vdivu_vx_nxv32i8(<vscale x 32 x i8> %va, i8 signext %
 ; CHECK-LABEL: vdivu_vx_nxv32i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e8,m4,ta,mu
-; CHECK-NEXT:    vdiv.vx v8, v8, a0
+; CHECK-NEXT:    vdivu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 32 x i8> undef, i8 %b, i32 0
   %splat = shufflevector <vscale x 32 x i8> %head, <vscale x 32 x i8> undef, <vscale x 32 x i32> zeroinitializer
@@ -239,7 +239,7 @@ define <vscale x 64 x i8> @vdivu_vv_nxv64i8(<vscale x 64 x i8> %va, <vscale x 64
 ; CHECK-LABEL: vdivu_vv_nxv64i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e8,m8,ta,mu
-; CHECK-NEXT:    vdiv.vv v8, v8, v16
+; CHECK-NEXT:    vdivu.vv v8, v8, v16
 ; CHECK-NEXT:    ret
   %vc = udiv <vscale x 64 x i8> %va, %vb
   ret <vscale x 64 x i8> %vc
@@ -249,7 +249,7 @@ define <vscale x 64 x i8> @vdivu_vx_nxv64i8(<vscale x 64 x i8> %va, i8 signext %
 ; CHECK-LABEL: vdivu_vx_nxv64i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e8,m8,ta,mu
-; CHECK-NEXT:    vdiv.vx v8, v8, a0
+; CHECK-NEXT:    vdivu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 64 x i8> undef, i8 %b, i32 0
   %splat = shufflevector <vscale x 64 x i8> %head, <vscale x 64 x i8> undef, <vscale x 64 x i32> zeroinitializer
@@ -278,7 +278,7 @@ define <vscale x 1 x i16> @vdivu_vv_nxv1i16(<vscale x 1 x i16> %va, <vscale x 1
 ; CHECK-LABEL: vdivu_vv_nxv1i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16,mf4,ta,mu
-; CHECK-NEXT:    vdiv.vv v8, v8, v9
+; CHECK-NEXT:    vdivu.vv v8, v8, v9
 ; CHECK-NEXT:    ret
   %vc = udiv <vscale x 1 x i16> %va, %vb
   ret <vscale x 1 x i16> %vc
@@ -288,7 +288,7 @@ define <vscale x 1 x i16> @vdivu_vx_nxv1i16(<vscale x 1 x i16> %va, i16 signext
 ; CHECK-LABEL: vdivu_vx_nxv1i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e16,mf4,ta,mu
-; CHECK-NEXT:    vdiv.vx v8, v8, a0
+; CHECK-NEXT:    vdivu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 1 x i16> undef, i16 %b, i32 0
   %splat = shufflevector <vscale x 1 x i16> %head, <vscale x 1 x i16> undef, <vscale x 1 x i32> zeroinitializer
@@ -318,7 +318,7 @@ define <vscale x 2 x i16> @vdivu_vv_nxv2i16(<vscale x 2 x i16> %va, <vscale x 2
 ; CHECK-LABEL: vdivu_vv_nxv2i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16,mf2,ta,mu
-; CHECK-NEXT:    vdiv.vv v8, v8, v9
+; CHECK-NEXT:    vdivu.vv v8, v8, v9
 ; CHECK-NEXT:    ret
   %vc = udiv <vscale x 2 x i16> %va, %vb
   ret <vscale x 2 x i16> %vc
@@ -328,7 +328,7 @@ define <vscale x 2 x i16> @vdivu_vx_nxv2i16(<vscale x 2 x i16> %va, i16 signext
 ; CHECK-LABEL: vdivu_vx_nxv2i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e16,mf2,ta,mu
-; CHECK-NEXT:    vdiv.vx v8, v8, a0
+; CHECK-NEXT:    vdivu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 2 x i16> undef, i16 %b, i32 0
   %splat = shufflevector <vscale x 2 x i16> %head, <vscale x 2 x i16> undef, <vscale x 2 x i32> zeroinitializer
@@ -358,7 +358,7 @@ define <vscale x 4 x i16> @vdivu_vv_nxv4i16(<vscale x 4 x i16> %va, <vscale x 4
 ; CHECK-LABEL: vdivu_vv_nxv4i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16,m1,ta,mu
-; CHECK-NEXT:    vdiv.vv v8, v8, v9
+; CHECK-NEXT:    vdivu.vv v8, v8, v9
 ; CHECK-NEXT:    ret
   %vc = udiv <vscale x 4 x i16> %va, %vb
   ret <vscale x 4 x i16> %vc
@@ -368,7 +368,7 @@ define <vscale x 4 x i16> @vdivu_vx_nxv4i16(<vscale x 4 x i16> %va, i16 signext
 ; CHECK-LABEL: vdivu_vx_nxv4i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e16,m1,ta,mu
-; CHECK-NEXT:    vdiv.vx v8, v8, a0
+; CHECK-NEXT:    vdivu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 4 x i16> undef, i16 %b, i32 0
   %splat = shufflevector <vscale x 4 x i16> %head, <vscale x 4 x i16> undef, <vscale x 4 x i32> zeroinitializer
@@ -398,7 +398,7 @@ define <vscale x 8 x i16> @vdivu_vv_nxv8i16(<vscale x 8 x i16> %va, <vscale x 8
 ; CHECK-LABEL: vdivu_vv_nxv8i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16,m2,ta,mu
-; CHECK-NEXT:    vdiv.vv v8, v8, v10
+; CHECK-NEXT:    vdivu.vv v8, v8, v10
 ; CHECK-NEXT:    ret
   %vc = udiv <vscale x 8 x i16> %va, %vb
   ret <vscale x 8 x i16> %vc
@@ -408,7 +408,7 @@ define <vscale x 8 x i16> @vdivu_vx_nxv8i16(<vscale x 8 x i16> %va, i16 signext
 ; CHECK-LABEL: vdivu_vx_nxv8i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e16,m2,ta,mu
-; CHECK-NEXT:    vdiv.vx v8, v8, a0
+; CHECK-NEXT:    vdivu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 8 x i16> undef, i16 %b, i32 0
   %splat = shufflevector <vscale x 8 x i16> %head, <vscale x 8 x i16> undef, <vscale x 8 x i32> zeroinitializer
@@ -438,7 +438,7 @@ define <vscale x 16 x i16> @vdivu_vv_nxv16i16(<vscale x 16 x i16> %va, <vscale x
 ; CHECK-LABEL: vdivu_vv_nxv16i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16,m4,ta,mu
-; CHECK-NEXT:    vdiv.vv v8, v8, v12
+; CHECK-NEXT:    vdivu.vv v8, v8, v12
 ; CHECK-NEXT:    ret
   %vc = udiv <vscale x 16 x i16> %va, %vb
   ret <vscale x 16 x i16> %vc
@@ -448,7 +448,7 @@ define <vscale x 16 x i16> @vdivu_vx_nxv16i16(<vscale x 16 x i16> %va, i16 signe
 ; CHECK-LABEL: vdivu_vx_nxv16i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e16,m4,ta,mu
-; CHECK-NEXT:    vdiv.vx v8, v8, a0
+; CHECK-NEXT:    vdivu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 16 x i16> undef, i16 %b, i32 0
   %splat = shufflevector <vscale x 16 x i16> %head, <vscale x 16 x i16> undef, <vscale x 16 x i32> zeroinitializer
@@ -478,7 +478,7 @@ define <vscale x 32 x i16> @vdivu_vv_nxv32i16(<vscale x 32 x i16> %va, <vscale x
 ; CHECK-LABEL: vdivu_vv_nxv32i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16,m8,ta,mu
-; CHECK-NEXT:    vdiv.vv v8, v8, v16
+; CHECK-NEXT:    vdivu.vv v8, v8, v16
 ; CHECK-NEXT:    ret
   %vc = udiv <vscale x 32 x i16> %va, %vb
   ret <vscale x 32 x i16> %vc
@@ -488,7 +488,7 @@ define <vscale x 32 x i16> @vdivu_vx_nxv32i16(<vscale x 32 x i16> %va, i16 signe
 ; CHECK-LABEL: vdivu_vx_nxv32i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e16,m8,ta,mu
-; CHECK-NEXT:    vdiv.vx v8, v8, a0
+; CHECK-NEXT:    vdivu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 32 x i16> undef, i16 %b, i32 0
   %splat = shufflevector <vscale x 32 x i16> %head, <vscale x 32 x i16> undef, <vscale x 32 x i32> zeroinitializer
@@ -518,7 +518,7 @@ define <vscale x 1 x i32> @vdivu_vv_nxv1i32(<vscale x 1 x i32> %va, <vscale x 1
 ; CHECK-LABEL: vdivu_vv_nxv1i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32,mf2,ta,mu
-; CHECK-NEXT:    vdiv.vv v8, v8, v9
+; CHECK-NEXT:    vdivu.vv v8, v8, v9
 ; CHECK-NEXT:    ret
   %vc = udiv <vscale x 1 x i32> %va, %vb
   ret <vscale x 1 x i32> %vc
@@ -528,7 +528,7 @@ define <vscale x 1 x i32> @vdivu_vx_nxv1i32(<vscale x 1 x i32> %va, i32 signext
 ; CHECK-LABEL: vdivu_vx_nxv1i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e32,mf2,ta,mu
-; CHECK-NEXT:    vdiv.vx v8, v8, a0
+; CHECK-NEXT:    vdivu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 1 x i32> undef, i32 %b, i32 0
   %splat = shufflevector <vscale x 1 x i32> %head, <vscale x 1 x i32> undef, <vscale x 1 x i32> zeroinitializer
@@ -558,7 +558,7 @@ define <vscale x 2 x i32> @vdivu_vv_nxv2i32(<vscale x 2 x i32> %va, <vscale x 2
 ; CHECK-LABEL: vdivu_vv_nxv2i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32,m1,ta,mu
-; CHECK-NEXT:    vdiv.vv v8, v8, v9
+; CHECK-NEXT:    vdivu.vv v8, v8, v9
 ; CHECK-NEXT:    ret
   %vc = udiv <vscale x 2 x i32> %va, %vb
   ret <vscale x 2 x i32> %vc
@@ -568,7 +568,7 @@ define <vscale x 2 x i32> @vdivu_vx_nxv2i32(<vscale x 2 x i32> %va, i32 signext
 ; CHECK-LABEL: vdivu_vx_nxv2i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e32,m1,ta,mu
-; CHECK-NEXT:    vdiv.vx v8, v8, a0
+; CHECK-NEXT:    vdivu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 2 x i32> undef, i32 %b, i32 0
   %splat = shufflevector <vscale x 2 x i32> %head, <vscale x 2 x i32> undef, <vscale x 2 x i32> zeroinitializer
@@ -598,7 +598,7 @@ define <vscale x 4 x i32> @vdivu_vv_nxv4i32(<vscale x 4 x i32> %va, <vscale x 4
 ; CHECK-LABEL: vdivu_vv_nxv4i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32,m2,ta,mu
-; CHECK-NEXT:    vdiv.vv v8, v8, v10
+; CHECK-NEXT:    vdivu.vv v8, v8, v10
 ; CHECK-NEXT:    ret
   %vc = udiv <vscale x 4 x i32> %va, %vb
   ret <vscale x 4 x i32> %vc
@@ -608,7 +608,7 @@ define <vscale x 4 x i32> @vdivu_vx_nxv4i32(<vscale x 4 x i32> %va, i32 signext
 ; CHECK-LABEL: vdivu_vx_nxv4i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e32,m2,ta,mu
-; CHECK-NEXT:    vdiv.vx v8, v8, a0
+; CHECK-NEXT:    vdivu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 4 x i32> undef, i32 %b, i32 0
   %splat = shufflevector <vscale x 4 x i32> %head, <vscale x 4 x i32> undef, <vscale x 4 x i32> zeroinitializer
@@ -638,7 +638,7 @@ define <vscale x 8 x i32> @vdivu_vv_nxv8i32(<vscale x 8 x i32> %va, <vscale x 8
 ; CHECK-LABEL: vdivu_vv_nxv8i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32,m4,ta,mu
-; CHECK-NEXT:    vdiv.vv v8, v8, v12
+; CHECK-NEXT:    vdivu.vv v8, v8, v12
 ; CHECK-NEXT:    ret
   %vc = udiv <vscale x 8 x i32> %va, %vb
   ret <vscale x 8 x i32> %vc
@@ -648,7 +648,7 @@ define <vscale x 8 x i32> @vdivu_vx_nxv8i32(<vscale x 8 x i32> %va, i32 signext
 ; CHECK-LABEL: vdivu_vx_nxv8i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e32,m4,ta,mu
-; CHECK-NEXT:    vdiv.vx v8, v8, a0
+; CHECK-NEXT:    vdivu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 8 x i32> undef, i32 %b, i32 0
   %splat = shufflevector <vscale x 8 x i32> %head, <vscale x 8 x i32> undef, <vscale x 8 x i32> zeroinitializer
@@ -678,7 +678,7 @@ define <vscale x 16 x i32> @vdivu_vv_nxv16i32(<vscale x 16 x i32> %va, <vscale x
 ; CHECK-LABEL: vdivu_vv_nxv16i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32,m8,ta,mu
-; CHECK-NEXT:    vdiv.vv v8, v8, v16
+; CHECK-NEXT:    vdivu.vv v8, v8, v16
 ; CHECK-NEXT:    ret
   %vc = udiv <vscale x 16 x i32> %va, %vb
   ret <vscale x 16 x i32> %vc
@@ -688,7 +688,7 @@ define <vscale x 16 x i32> @vdivu_vx_nxv16i32(<vscale x 16 x i32> %va, i32 signe
 ; CHECK-LABEL: vdivu_vx_nxv16i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e32,m8,ta,mu
-; CHECK-NEXT:    vdiv.vx v8, v8, a0
+; CHECK-NEXT:    vdivu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 16 x i32> undef, i32 %b, i32 0
   %splat = shufflevector <vscale x 16 x i32> %head, <vscale x 16 x i32> undef, <vscale x 16 x i32> zeroinitializer
@@ -718,7 +718,7 @@ define <vscale x 1 x i64> @vdivu_vv_nxv1i64(<vscale x 1 x i64> %va, <vscale x 1
 ; CHECK-LABEL: vdivu_vv_nxv1i64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e64,m1,ta,mu
-; CHECK-NEXT:    vdiv.vv v8, v8, v9
+; CHECK-NEXT:    vdivu.vv v8, v8, v9
 ; CHECK-NEXT:    ret
   %vc = udiv <vscale x 1 x i64> %va, %vb
   ret <vscale x 1 x i64> %vc
@@ -728,7 +728,7 @@ define <vscale x 1 x i64> @vdivu_vx_nxv1i64(<vscale x 1 x i64> %va, i64 %b) {
 ; CHECK-LABEL: vdivu_vx_nxv1i64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e64,m1,ta,mu
-; CHECK-NEXT:    vdiv.vx v8, v8, a0
+; CHECK-NEXT:    vdivu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 1 x i64> undef, i64 %b, i32 0
   %splat = shufflevector <vscale x 1 x i64> %head, <vscale x 1 x i64> undef, <vscale x 1 x i32> zeroinitializer
@@ -760,7 +760,7 @@ define <vscale x 2 x i64> @vdivu_vv_nxv2i64(<vscale x 2 x i64> %va, <vscale x 2
 ; CHECK-LABEL: vdivu_vv_nxv2i64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e64,m2,ta,mu
-; CHECK-NEXT:    vdiv.vv v8, v8, v10
+; CHECK-NEXT:    vdivu.vv v8, v8, v10
 ; CHECK-NEXT:    ret
   %vc = udiv <vscale x 2 x i64> %va, %vb
   ret <vscale x 2 x i64> %vc
@@ -770,7 +770,7 @@ define <vscale x 2 x i64> @vdivu_vx_nxv2i64(<vscale x 2 x i64> %va, i64 %b) {
 ; CHECK-LABEL: vdivu_vx_nxv2i64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e64,m2,ta,mu
-; CHECK-NEXT:    vdiv.vx v8, v8, a0
+; CHECK-NEXT:    vdivu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 2 x i64> undef, i64 %b, i32 0
   %splat = shufflevector <vscale x 2 x i64> %head, <vscale x 2 x i64> undef, <vscale x 2 x i32> zeroinitializer
@@ -802,7 +802,7 @@ define <vscale x 4 x i64> @vdivu_vv_nxv4i64(<vscale x 4 x i64> %va, <vscale x 4
 ; CHECK-LABEL: vdivu_vv_nxv4i64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e64,m4,ta,mu
-; CHECK-NEXT:    vdiv.vv v8, v8, v12
+; CHECK-NEXT:    vdivu.vv v8, v8, v12
 ; CHECK-NEXT:    ret
   %vc = udiv <vscale x 4 x i64> %va, %vb
   ret <vscale x 4 x i64> %vc
@@ -812,7 +812,7 @@ define <vscale x 4 x i64> @vdivu_vx_nxv4i64(<vscale x 4 x i64> %va, i64 %b) {
 ; CHECK-LABEL: vdivu_vx_nxv4i64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e64,m4,ta,mu
-; CHECK-NEXT:    vdiv.vx v8, v8, a0
+; CHECK-NEXT:    vdivu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 4 x i64> undef, i64 %b, i32 0
   %splat = shufflevector <vscale x 4 x i64> %head, <vscale x 4 x i64> undef, <vscale x 4 x i32> zeroinitializer
@@ -844,7 +844,7 @@ define <vscale x 8 x i64> @vdivu_vv_nxv8i64(<vscale x 8 x i64> %va, <vscale x 8
 ; CHECK-LABEL: vdivu_vv_nxv8i64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e64,m8,ta,mu
-; CHECK-NEXT:    vdiv.vv v8, v8, v16
+; CHECK-NEXT:    vdivu.vv v8, v8, v16
 ; CHECK-NEXT:    ret
   %vc = udiv <vscale x 8 x i64> %va, %vb
   ret <vscale x 8 x i64> %vc
@@ -854,7 +854,7 @@ define <vscale x 8 x i64> @vdivu_vx_nxv8i64(<vscale x 8 x i64> %va, i64 %b) {
 ; CHECK-LABEL: vdivu_vx_nxv8i64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e64,m8,ta,mu
-; CHECK-NEXT:    vdiv.vx v8, v8, a0
+; CHECK-NEXT:    vdivu.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 8 x i64> undef, i64 %b, i32 0
   %splat = shufflevector <vscale x 8 x i64> %head, <vscale x 8 x i64> undef, <vscale x 8 x i32> zeroinitializer

From 2cf21fd6a5b4a6f0f0da55717a787fc38202cca8 Mon Sep 17 00:00:00 2001
From: Joachim Meyer <joachim@joameyer.de>
Date: Thu, 17 Dec 2020 23:58:13 +0100
Subject: [PATCH 083/318] [Support] Indent multi-line descr of enum cli
 options.

As noted in https://reviews.llvm.org/D93459, the formatting of
multi-line descriptions of clEnumValN and the likes is unfavorable.
Thus this patch adds support for correctly indenting these.

Reviewed By: serge-sans-paille

Differential Revision: https://reviews.llvm.org/D93494

(cherry picked from commit e3f02302e318837d2421c6425450f04ae0a82b90)
---
 llvm/include/llvm/Support/CommandLine.h    | 13 +++++++++++
 llvm/lib/Support/CommandLine.cpp           | 25 ++++++++++++++++------
 llvm/unittests/Support/CommandLineTest.cpp | 22 +++++++++++++++++++
 3 files changed, 54 insertions(+), 6 deletions(-)

diff --git a/llvm/include/llvm/Support/CommandLine.h b/llvm/include/llvm/Support/CommandLine.h
index 38f3e188be55..0706aa226c0e 100644
--- a/llvm/include/llvm/Support/CommandLine.h
+++ b/llvm/include/llvm/Support/CommandLine.h
@@ -369,9 +369,22 @@ class Option {
 
   virtual void setDefault() = 0;
 
+  // Prints the help string for an option.
+  //
+  // This maintains the Indent for multi-line descriptions.
+  // FirstLineIndentedBy is the count of chars of the first line
+  //      i.e. the one containing the --<option name>.
   static void printHelpStr(StringRef HelpStr, size_t Indent,
                            size_t FirstLineIndentedBy);
 
+  // Prints the help string for an enum value.
+  //
+  // This maintains the Indent for multi-line descriptions.
+  // FirstLineIndentedBy is the count of chars of the first line
+  //      i.e. the one containing the =<value>.
+  static void printEnumValHelpStr(StringRef HelpStr, size_t Indent,
+                                  size_t FirstLineIndentedBy);
+
   virtual void getExtraOptionNames(SmallVectorImpl<StringRef> &) {}
 
   // addOccurrence - Wrapper around handleOccurrence that enforces Flags.
diff --git a/llvm/lib/Support/CommandLine.cpp b/llvm/lib/Support/CommandLine.cpp
index 6d89481bf28a..e2f014d1815b 100644
--- a/llvm/lib/Support/CommandLine.cpp
+++ b/llvm/lib/Support/CommandLine.cpp
@@ -1726,6 +1726,19 @@ void Option::printHelpStr(StringRef HelpStr, size_t Indent,
   }
 }
 
+void Option::printEnumValHelpStr(StringRef HelpStr, size_t BaseIndent,
+                                 size_t FirstLineIndentedBy) {
+  const StringRef ValHelpPrefix = "  ";
+  assert(BaseIndent >= FirstLineIndentedBy + ValHelpPrefix.size());
+  std::pair<StringRef, StringRef> Split = HelpStr.split('\n');
+  outs().indent(BaseIndent - FirstLineIndentedBy)
+      << ArgHelpPrefix << ValHelpPrefix << Split.first << "\n";
+  while (!Split.second.empty()) {
+    Split = Split.second.split('\n');
+    outs().indent(BaseIndent + ValHelpPrefix.size()) << Split.first << "\n";
+  }
+}
+
 // Print out the option for the alias.
 void alias::printOptionInfo(size_t GlobalWidth) const {
   outs() << PrintArg(ArgStr);
@@ -1971,17 +1984,17 @@ void generic_parser_base::printOptionInfo(const Option &O,
       StringRef Description = getDescription(i);
       if (!shouldPrintOption(OptionName, Description, O))
         continue;
-      assert(GlobalWidth >= OptionName.size() + OptionPrefixesSize);
-      size_t NumSpaces = GlobalWidth - OptionName.size() - OptionPrefixesSize;
+      size_t FirstLineIndent = OptionName.size() + OptionPrefixesSize;
       outs() << OptionPrefix << OptionName;
       if (OptionName.empty()) {
         outs() << EmptyOption;
-        assert(NumSpaces >= EmptyOption.size());
-        NumSpaces -= EmptyOption.size();
+        assert(FirstLineIndent >= EmptyOption.size());
+        FirstLineIndent += EmptyOption.size();
       }
       if (!Description.empty())
-        outs().indent(NumSpaces) << ArgHelpPrefix << "  " << Description;
-      outs() << '\n';
+        Option::printEnumValHelpStr(Description, GlobalWidth, FirstLineIndent);
+      else
+        outs() << '\n';
     }
   } else {
     if (!O.HelpStr.empty())
diff --git a/llvm/unittests/Support/CommandLineTest.cpp b/llvm/unittests/Support/CommandLineTest.cpp
index a05f3894ef05..4accdc5ea1fb 100644
--- a/llvm/unittests/Support/CommandLineTest.cpp
+++ b/llvm/unittests/Support/CommandLineTest.cpp
@@ -1263,6 +1263,28 @@ TEST_F(PrintOptionInfoTest, PrintOptionInfoEmptyValueDescription) {
   // clang-format on
 }
 
+TEST_F(PrintOptionInfoTest, PrintOptionInfoMultilineValueDescription) {
+  std::string Output =
+      runTest(cl::ValueRequired,
+              cl::values(clEnumValN(OptionValue::Val, "v1",
+                                    "This is the first enum value\n"
+                                    "which has a really long description\n"
+                                    "thus it is multi-line."),
+                         clEnumValN(OptionValue::Val, "",
+                                    "This is an unnamed enum value option\n"
+                                    "Should be indented as well")));
+
+  // clang-format off
+  EXPECT_EQ(Output,
+            ("  --" + Opt + "=<value> - " + HelpText + "\n"
+             "    =v1                 -   This is the first enum value\n"
+             "                            which has a really long description\n"
+             "                            thus it is multi-line.\n"
+             "    =<empty>            -   This is an unnamed enum value option\n"
+             "                            Should be indented as well\n").str());
+  // clang-format on
+}
+
 class GetOptionWidthTest : public ::testing::Test {
 public:
   enum class OptionValue { Val };

From a67a4346f78d856c6224578392b128ab1d55009e Mon Sep 17 00:00:00 2001
From: Ayke van Laethem <aykevanlaethem@gmail.com>
Date: Tue, 2 Feb 2021 20:58:31 +0100
Subject: [PATCH 084/318] [ARM] Do not emit ldrexd/strexd on Cortex-M chips

The ldrexd/strexd instructions are not supported on M-class chips, see
for example
https://developer.arm.com/documentation/dui0489/e/arm-and-thumb-instructions/memory-access-instructions/ldrex-and-strex
which says:

> All these 32-bit Thumb instructions are available in ARMv6T2 and
> above, except that LDREXD and STREXD are not available in the ARMv7-M
> architecture.

Looking at the ARMv8-M architecture, it appears that these instructions
aren't supported either. The Architecture Reference Manual lists
ldrex/strex but not ldrexd/strexd:
https://developer.arm.com/documentation/ddi0553/bn/

Godbolt example on LLVM 11.0.0, which incorrectly emits ldrexd/strexd
instructions: https://llvm.godbolt.org/z/5qqPnE

Differential Revision: https://reviews.llvm.org/D95891

(cherry picked from commit aecdf15cc7f866180dc769265b8183cad34bb33a)
---
 llvm/lib/Target/ARM/ARMISelLowering.cpp |  6 ++++-
 llvm/test/CodeGen/ARM/atomic-64bit.ll   | 35 +++++++++++++++++++++++--
 2 files changed, 38 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 397979b4ab1e..598062672a56 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -18661,6 +18661,8 @@ ARMTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
              : AtomicExpansionKind::None;
 }
 
+// Similar to shouldExpandAtomicRMWInIR, ldrex/strex can be used  up to 32
+// bits, and up to 64 bits on the non-M profiles.
 TargetLowering::AtomicExpansionKind
 ARMTargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const {
   // At -O0, fast-regalloc cannot cope with the live vregs necessary to
@@ -18668,9 +18670,11 @@ ARMTargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const {
   // on the stack and close enough to the spill slot, this can lead to a
   // situation where the monitor always gets cleared and the atomic operation
   // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead.
+  unsigned Size = AI->getOperand(1)->getType()->getPrimitiveSizeInBits();
   bool HasAtomicCmpXchg =
       !Subtarget->isThumb() || Subtarget->hasV8MBaselineOps();
-  if (getTargetMachine().getOptLevel() != 0 && HasAtomicCmpXchg)
+  if (getTargetMachine().getOptLevel() != 0 && HasAtomicCmpXchg &&
+      Size <= (Subtarget->isMClass() ? 32U : 64U))
     return AtomicExpansionKind::LLSC;
   return AtomicExpansionKind::None;
 }
diff --git a/llvm/test/CodeGen/ARM/atomic-64bit.ll b/llvm/test/CodeGen/ARM/atomic-64bit.ll
index 8841483c97a4..eadefcd23bc6 100644
--- a/llvm/test/CodeGen/ARM/atomic-64bit.ll
+++ b/llvm/test/CodeGen/ARM/atomic-64bit.ll
@@ -2,6 +2,8 @@
 ; RUN: llc < %s -mtriple=thumbv7-none-linux-gnueabihf | FileCheck %s --check-prefix=CHECK-THUMB --check-prefix=CHECK-THUMB-LE
 ; RUN: llc < %s -mtriple=armebv7 -target-abi apcs | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-BE
 ; RUN: llc < %s -mtriple=thumbebv7-none-linux-gnueabihf | FileCheck %s --check-prefix=CHECK-THUMB --check-prefix=CHECK-THUMB-BE
+; RUN: llc < %s -mtriple=armv7m--none-eabi | FileCheck %s --check-prefix=CHECK-M
+; RUN: llc < %s -mtriple=armv8m--none-eabi | FileCheck %s --check-prefix=CHECK-M
 
 define i64 @test1(i64* %ptr, i64 %val) {
 ; CHECK-LABEL: test1:
@@ -28,6 +30,8 @@ define i64 @test1(i64* %ptr, i64 %val) {
 ; CHECK-THUMB: bne
 ; CHECK-THUMB: dmb {{ish$}}
 
+; CHECK-M: __sync_fetch_and_add_8
+
   %r = atomicrmw add i64* %ptr, i64 %val seq_cst
   ret i64 %r
 }
@@ -57,6 +61,8 @@ define i64 @test2(i64* %ptr, i64 %val) {
 ; CHECK-THUMB: bne
 ; CHECK-THUMB: dmb {{ish$}}
 
+; CHECK-M: __sync_fetch_and_sub_8
+
   %r = atomicrmw sub i64* %ptr, i64 %val seq_cst
   ret i64 %r
 }
@@ -86,6 +92,8 @@ define i64 @test3(i64* %ptr, i64 %val) {
 ; CHECK-THUMB: bne
 ; CHECK-THUMB: dmb {{ish$}}
 
+; CHECK-M: __sync_fetch_and_and_8
+
   %r = atomicrmw and i64* %ptr, i64 %val seq_cst
   ret i64 %r
 }
@@ -115,6 +123,8 @@ define i64 @test4(i64* %ptr, i64 %val) {
 ; CHECK-THUMB: bne
 ; CHECK-THUMB: dmb {{ish$}}
 
+; CHECK-M: __sync_fetch_and_or_8
+
   %r = atomicrmw or i64* %ptr, i64 %val seq_cst
   ret i64 %r
 }
@@ -144,6 +154,8 @@ define i64 @test5(i64* %ptr, i64 %val) {
 ; CHECK-THUMB: bne
 ; CHECK-THUMB: dmb {{ish$}}
 
+; CHECK-M: __sync_fetch_and_xor_8
+
   %r = atomicrmw xor i64* %ptr, i64 %val seq_cst
   ret i64 %r
 }
@@ -165,6 +177,8 @@ define i64 @test6(i64* %ptr, i64 %val) {
 ; CHECK-THUMB: bne
 ; CHECK-THUMB: dmb {{ish$}}
 
+; CHECK-M: __sync_lock_test_and_set_8
+
   %r = atomicrmw xchg i64* %ptr, i64 %val seq_cst
   ret i64 %r
 }
@@ -199,12 +213,15 @@ define i64 @test7(i64* %ptr, i64 %val1, i64 %val2) {
 ; CHECK-THUMB: beq
 ; CHECK-THUMB: dmb {{ish$}}
 
+; CHECK-M: __sync_val_compare_and_swap_8
+
   %pair = cmpxchg i64* %ptr, i64 %val1, i64 %val2 seq_cst seq_cst
   %r = extractvalue { i64, i1 } %pair, 0
   ret i64 %r
 }
 
-; Compiles down to a single ldrexd
+; Compiles down to a single ldrexd, except on M class devices where ldrexd
+; isn't supported.
 define i64 @test8(i64* %ptr) {
 ; CHECK-LABEL: test8:
 ; CHECK: ldrexd [[REG1:(r[0-9]?[02468])]], [[REG2:(r[0-9]?[13579])]]
@@ -220,12 +237,15 @@ define i64 @test8(i64* %ptr) {
 ; CHECK-THUMB-NOT: strexd
 ; CHECK-THUMB: dmb {{ish$}}
 
+; CHECK-M: __sync_val_compare_and_swap_8
+
   %r = load atomic i64, i64* %ptr seq_cst, align 8
   ret i64 %r
 }
 
 ; Compiles down to atomicrmw xchg; there really isn't any more efficient
-; way to write it.
+; way to write it. Except on M class devices, where ldrexd/strexd aren't
+; supported.
 define void @test9(i64* %ptr, i64 %val) {
 ; CHECK-LABEL: test9:
 ; CHECK: dmb {{ish$}}
@@ -243,6 +263,8 @@ define void @test9(i64* %ptr, i64 %val) {
 ; CHECK-THUMB: bne
 ; CHECK-THUMB: dmb {{ish$}}
 
+; CHECK-M: __sync_lock_test_and_set_8
+
   store atomic i64 %val, i64* %ptr seq_cst, align 8
   ret void
 }
@@ -286,6 +308,8 @@ define i64 @test10(i64* %ptr, i64 %val) {
 ; CHECK-THUMB: bne
 ; CHECK-THUMB: dmb {{ish$}}
 
+; CHECK-M: __sync_fetch_and_min_8
+
   %r = atomicrmw min i64* %ptr, i64 %val seq_cst
   ret i64 %r
 }
@@ -329,6 +353,8 @@ define i64 @test11(i64* %ptr, i64 %val) {
 ; CHECK-THUMB: bne
 ; CHECK-THUMB: dmb {{ish$}}
 
+; CHECK-M: __sync_fetch_and_umin_8
+
   %r = atomicrmw umin i64* %ptr, i64 %val seq_cst
   ret i64 %r
 }
@@ -372,6 +398,8 @@ define i64 @test12(i64* %ptr, i64 %val) {
 ; CHECK-THUMB: bne
 ; CHECK-THUMB: dmb {{ish$}}
 
+; CHECK-M: __sync_fetch_and_max_8
+
   %r = atomicrmw max i64* %ptr, i64 %val seq_cst
   ret i64 %r
 }
@@ -414,6 +442,9 @@ define i64 @test13(i64* %ptr, i64 %val) {
 ; CHECK-THUMB: cmp
 ; CHECK-THUMB: bne
 ; CHECK-THUMB: dmb {{ish$}}
+
+; CHECK-M: __sync_fetch_and_umax_8
+
   %r = atomicrmw umax i64* %ptr, i64 %val seq_cst
   ret i64 %r
 }

From dccfafaf8cc1085724331a8427f656d7636679ba Mon Sep 17 00:00:00 2001
From: Richard Smith <richard@metafoo.co.uk>
Date: Thu, 4 Feb 2021 13:38:38 -0800
Subject: [PATCH 085/318] Revert "[BuildLibcalls, Attrs] Support more variants
 of C++'s new, add attributes for C++'s delete"

Several of the new attributes here were incorrect, and even the ones
that are generally correct were being added even to nobuiltin calls.

This reverts commit bb3f169b59e1c8bd7fd70097532220bbd11e9967.

(cherry picked from commit 1484ad4137b5d627573672bad48b03785f8fdefd)
---
 llvm/lib/Transforms/Utils/BuildLibCalls.cpp   | 42 -------------------
 .../Transforms/InferFunctionAttrs/annotate.ll | 36 +++-------------
 2 files changed, 5 insertions(+), 73 deletions(-)

diff --git a/llvm/lib/Transforms/Utils/BuildLibCalls.cpp b/llvm/lib/Transforms/Utils/BuildLibCalls.cpp
index f4afa3ad4623..811b9c04906d 100644
--- a/llvm/lib/Transforms/Utils/BuildLibCalls.cpp
+++ b/llvm/lib/Transforms/Utils/BuildLibCalls.cpp
@@ -1005,52 +1005,10 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) {
     Changed |= setDoesNotCapture(F, 0);
     Changed |= setDoesNotCapture(F, 1);
     return Changed;
-  case LibFunc_ZdlPvRKSt9nothrow_t: // delete(void*, nothrow)
-  case LibFunc_ZdlPvSt11align_val_tRKSt9nothrow_t: // delete(void*, align_val_t, nothrow)
-  case LibFunc_ZdaPvRKSt9nothrow_t: // delete[](void*, nothrow)
-  case LibFunc_ZdaPvSt11align_val_tRKSt9nothrow_t: // delete[](void*, align_val_t, nothrow)
-    Changed |= setDoesNotThrow(F);
-    LLVM_FALLTHROUGH;
-  case LibFunc_ZdlPv: // delete(void*)
-  case LibFunc_ZdlPvj: // delete(void*, unsigned int)
-  case LibFunc_ZdlPvm: // delete(void*, unsigned long)
-  case LibFunc_ZdaPv: // delete[](void*)
-  case LibFunc_ZdaPvj: // delete[](void*, unsigned int)
-  case LibFunc_ZdaPvm: // delete[](void*, unsigned long)
-  case LibFunc_ZdlPvSt11align_val_t: // delete(void*, align_val_t)
-  case LibFunc_ZdlPvjSt11align_val_t: // delete(void*, unsigned int, align_val_t)
-  case LibFunc_ZdlPvmSt11align_val_t: // delete(void*, unsigned long, align_val_t)
-  case LibFunc_ZdaPvSt11align_val_t: // delete[](void*, align_val_t)
-  case LibFunc_ZdaPvjSt11align_val_t: // delete[](void*, unsigned int, align_val_t)
-  case LibFunc_ZdaPvmSt11align_val_t: // delete[](void*, unsigned long, align_val_t);
-    Changed |= setOnlyAccessesInaccessibleMemOrArgMem(F);
-    Changed |= setArgsNoUndef(F);
-    Changed |= setWillReturn(F);
-    Changed |= setDoesNotCapture(F, 0);
-    return Changed;
-  case LibFunc_ZnwjRKSt9nothrow_t: // new(unsigned int, nothrow)
-  case LibFunc_ZnwmRKSt9nothrow_t: // new(unsigned long, nothrow)
-  case LibFunc_ZnajRKSt9nothrow_t: // new[](unsigned int, nothrow)
-  case LibFunc_ZnamRKSt9nothrow_t: // new[](unsigned long, nothrow)
-  case LibFunc_ZnwjSt11align_val_tRKSt9nothrow_t: // new(unsigned int, align_val_t, nothrow)
-  case LibFunc_ZnwmSt11align_val_tRKSt9nothrow_t: // new(unsigned long, align_val_t, nothrow)
-  case LibFunc_ZnajSt11align_val_tRKSt9nothrow_t: // new[](unsigned int, align_val_t, nothrow)
-  case LibFunc_ZnamSt11align_val_tRKSt9nothrow_t: // new[](unsigned long, align_val_t, nothrow)
-    // Nothrow operator new may return null pointer
-    Changed |= setDoesNotThrow(F);
-    Changed |= setOnlyAccessesInaccessibleMemory(F);
-    Changed |= setRetNoUndef(F);
-    Changed |= setRetDoesNotAlias(F);
-    Changed |= setWillReturn(F);
-    return Changed;
   case LibFunc_Znwj: // new(unsigned int)
   case LibFunc_Znwm: // new(unsigned long)
   case LibFunc_Znaj: // new[](unsigned int)
   case LibFunc_Znam: // new[](unsigned long)
-  case LibFunc_ZnwjSt11align_val_t: // new(unsigned int, align_val_t)
-  case LibFunc_ZnwmSt11align_val_t: // new(unsigned long, align_val_t)
-  case LibFunc_ZnajSt11align_val_t: // new[](unsigned int, align_val_t)
-  case LibFunc_ZnamSt11align_val_t: // new[](unsigned long, align_val_t)
   case LibFunc_msvc_new_int: // new(unsigned int)
   case LibFunc_msvc_new_longlong: // new(unsigned long long)
   case LibFunc_msvc_new_array_int: // new[](unsigned int)
diff --git a/llvm/test/Transforms/InferFunctionAttrs/annotate.ll b/llvm/test/Transforms/InferFunctionAttrs/annotate.ll
index a847db7eb550..0af18151d6f6 100644
--- a/llvm/test/Transforms/InferFunctionAttrs/annotate.ll
+++ b/llvm/test/Transforms/InferFunctionAttrs/annotate.ll
@@ -9,30 +9,6 @@ declare i8* @_Znwj(i64 )
 ; CHECK: declare noalias noundef nonnull i8* @_Znwj(i64) [[INACCESSIBLEMEMONLY_NOFREE_WILLRETURN:#[0-9]+]]
 declare i8* @_Znwm(i64)
 ; CHECK: declare noalias noundef nonnull i8* @_Znwm(i64) [[INACCESSIBLEMEMONLY_NOFREE_WILLRETURN]]
-declare i8* @_Znaj(i64)
-; CHECK: declare noalias noundef nonnull i8* @_Znaj(i64) [[INACCESSIBLEMEMONLY_NOFREE_WILLRETURN]]
-declare i8* @_Znam(i64)
-; CHECK: declare noalias noundef nonnull i8* @_Znam(i64) [[INACCESSIBLEMEMONLY_NOFREE_WILLRETURN]]
-
-
-%"struct.std::nothrow_t" = type { i8 }
-declare i8* @_ZnwmRKSt9nothrow_t(i64, %"struct.std::nothrow_t"*)
-; CHECK: declare noalias noundef i8* @_ZnwmRKSt9nothrow_t(i64, %"struct.std::nothrow_t"*) [[INACCESSIBLEMEMONLY_NOFREE_NOUNWIND_WILLRETURN:#[0-9]+]]
-
-
-; operator delete routines
-declare void @_ZdlPv(i8*)
-; CHECK: declare void @_ZdlPv(i8* nocapture noundef) [[INACCESSIBLEMEMORARGONLY_WILLRETURN:#[0-9]+]]
-declare void @_ZdlPvj(i8*, i64)
-; CHECK: declare void @_ZdlPvj(i8* nocapture noundef, i64 noundef) [[INACCESSIBLEMEMORARGONLY_WILLRETURN]]
-declare void @_ZdlPvm(i8*, i64)
-; CHECK: declare void @_ZdlPvm(i8* nocapture noundef, i64 noundef) [[INACCESSIBLEMEMORARGONLY_WILLRETURN]]
-declare void @_ZdaPv(i8*)
-; CHECK: declare void @_ZdaPv(i8* nocapture noundef) [[INACCESSIBLEMEMORARGONLY_WILLRETURN]]
-declare void @_ZdaPvj(i8*, i64)
-; CHECK: declare void @_ZdaPvj(i8* nocapture noundef, i64 noundef) [[INACCESSIBLEMEMORARGONLY_WILLRETURN]]
-declare void @_ZdaPvm(i8*, i64)
-; CHECK: declare void @_ZdaPvm(i8* nocapture noundef, i64 noundef) [[INACCESSIBLEMEMORARGONLY_WILLRETURN]]
 
 declare i32 @__nvvm_reflect(i8*)
 ; CHECK-NVPTX: declare noundef i32 @__nvvm_reflect(i8* noundef) [[NOFREE_NOUNWIND_READNONE:#[0-9]+]]
@@ -280,7 +256,7 @@ declare void @bcopy(i8*, i8*, i64)
 ; CHECK: declare void @bzero(i8* nocapture writeonly, i64)  [[ARGMEMONLY_NOFREE_NOUNWIND:#[0-9]+]]
 declare void @bzero(i8*, i64)
 
-; CHECK: declare noalias noundef i8* @calloc(i64, i64) [[INACCESSIBLEMEMONLY_NOFREE_NOUNWIND_WILLRETURN]]
+; CHECK: declare noalias noundef i8* @calloc(i64, i64) [[INACCESSIBLEMEMONLY_NOFREE_NOUNWIND_WILLRETURN:#[0-9]+]]
 declare i8* @calloc(i64, i64)
 
 ; CHECK: declare double @cbrt(double) [[NOFREE_NOUNWIND_WILLRETURN]]
@@ -478,7 +454,7 @@ declare i32 @fputs(i8*, %opaque*)
 ; CHECK: declare noundef i64 @fread(i8* nocapture noundef, i64 noundef, i64 noundef, %opaque* nocapture noundef) [[NOFREE_NOUNWIND]]
 declare i64 @fread(i8*, i64, i64, %opaque*)
 
-; CHECK: declare void @free(i8* nocapture noundef) [[INACCESSIBLEMEMORARGONLY_NOUNWIND_WILLRETURN:#[0-9]+]]
+; CHECK: declare void @free(i8* nocapture noundef) [[NOUNWIND:#[0-9]+]]
 declare void @free(i8*)
 
 ; CHECK: declare double @frexp(double, i32* nocapture) [[NOFREE_NOUNWIND_WILLRETURN]]
@@ -757,7 +733,7 @@ declare i64 @read(i32, i8*, i64)
 ; CHECK: declare noundef i64 @readlink(i8* nocapture noundef readonly, i8* nocapture noundef, i64 noundef) [[NOFREE_NOUNWIND]]
 declare i64 @readlink(i8*, i8*, i64)
 
-; CHECK: declare noalias noundef i8* @realloc(i8* nocapture, i64) [[INACCESSIBLEMEMORARGONLY_NOUNWIND_WILLRETURN]]
+; CHECK: declare noalias noundef i8* @realloc(i8* nocapture, i64) [[NOUNWIND]]
 declare i8* @realloc(i8*, i64)
 
 ; CHECK: declare noundef i8* @reallocf(i8*, i64)
@@ -1042,10 +1018,9 @@ declare i64 @write(i32, i8*, i64)
 declare void @memset_pattern16(i8*, i8*, i64)
 
 ; CHECK-DAG-UNKNOWN: attributes [[INACCESSIBLEMEMONLY_NOFREE_WILLRETURN]] = { inaccessiblememonly nofree willreturn }
-; CHECK-DAG-UNKNOWN: attributes [[INACCESSIBLEMEMONLY_NOFREE_NOUNWIND_WILLRETURN]] = { inaccessiblememonly nofree nounwind willreturn }
-; CHECK-DAG-UNKNOWN: attributes [[INACCESSIBLEMEMORARGONLY_WILLRETURN]] = { inaccessiblemem_or_argmemonly willreturn }
 ; CHECK-DAG-UNKNOWN: attributes [[NOFREE_NOUNWIND_WILLRETURN]] = { nofree nounwind willreturn }
 ; CHECK-DAG-UNKNOWN: attributes [[NOFREE_NOUNWIND]] = { nofree nounwind }
+; CHECK-DAG-UNKNOWN: attributes [[INACCESSIBLEMEMONLY_NOFREE_NOUNWIND_WILLRETURN]] = { inaccessiblememonly nofree nounwind willreturn }
 ; CHECK-DAG-UNKNOWN: attributes [[NOFREE_NOUNWIND_READONLY_WILLRETURN]]  = { nofree nounwind readonly willreturn }
 ; CHECK-DAG-UNKNOWN: attributes [[NOFREE_NOUNWIND_READONLY_WILLRETURN]] = { argmemonly nofree nounwind willreturn }
 ; CHECK-DAG-UNKNOWN: attributes [[NOFREE_NOUNWIND_READONLY]] = { nofree nounwind readonly }
@@ -1057,11 +1032,10 @@ declare void @memset_pattern16(i8*, i8*, i64)
 ; CHECK-DAG-UNKNOWN: attributes [[INACCESSIBLEMEMORARGONLY_NOFREE_NOUNWIND_WILLRETURN]]  = { inaccessiblemem_or_argmemonly nofree nounwind willreturn }
 
 ; CHECK-DAG-LINUX: attributes [[INACCESSIBLEMEMONLY_NOFREE_WILLRETURN]] = { inaccessiblememonly nofree willreturn }
-; CHECK-DAG-LINUX: attributes [[INACCESSIBLEMEMONLY_NOFREE_NOUNWIND_WILLRETURN]] = { inaccessiblememonly nofree nounwind willreturn }
-; CHECK-DAG-LINUX: attributes [[INACCESSIBLEMEMORARGONLY_WILLRETURN]] = { inaccessiblemem_or_argmemonly willreturn }
 ; CHECK-DAG-LINUX: attributes [[NOFREE]] = { nofree }
 ; CHECK-DAG-LINUX: attributes [[NOFREE_NOUNWIND_WILLRETURN]] = { nofree nounwind willreturn }
 ; CHECK-DAG-LINUX: attributes [[NOFREE_NOUNWIND]] = { nofree nounwind }
+; CHECK-DAG-LINUX: attributes [[INACCESSIBLEMEMONLY_NOFREE_NOUNWIND_WILLRETURN]] = { inaccessiblememonly nofree nounwind willreturn }
 ; CHECK-DAG-LINUX: attributes [[NOFREE_NOUNWIND_READONLY_WILLRETURN]]  = { nofree nounwind readonly willreturn }
 ; CHECK-DAG-LINUX: attributes [[ARGMEMONLY_NOFREE_NOUNWIND_READONLY_WILLRETURN]] = { argmemonly nofree nounwind readonly willreturn }
 ; CHECK-DAG-LINUX: attributes [[NOFREE_NOUNWIND_READONLY_WILLRETURN]] = { argmemonly nofree nounwind willreturn }

From 97dd9224f1033b1d6313ae80047e48ff964ca77d Mon Sep 17 00:00:00 2001
From: Richard Smith <richard@metafoo.co.uk>
Date: Thu, 4 Feb 2021 13:55:28 -0800
Subject: [PATCH 086/318] Don't infer attributes on '::operator new'.

These attributes were all incorrect or inappropriate for LLVM to infer:
- inaccessiblememonly is generally wrong; user replacement operator new
  can access memory that's visible to the caller, as can a new_handler
  function.
- willreturn is generally wrong; a custom new_handler is not guaranteed
  to terminate.
- noalias is inappropriate: Clang has a flag to determine whether this
  attribute should be present and adds it itself when appropriate.
- noundef and nonnull on the return value should be specified by the
  frontend on all 'operator new' functions if we want them, not here.

In any case, inferring attributes on functions declared 'nobuiltin' (as
these are when Clang emits them) seems questionable.

(cherry picked from commit ab243efb261ba7e27f4b14e1a6fbbff15a79c0bf)
---
 llvm/lib/Transforms/Utils/BuildLibCalls.cpp   | 25 -------------------
 .../Transforms/InferFunctionAttrs/annotate.ll |  6 -----
 2 files changed, 31 deletions(-)

diff --git a/llvm/lib/Transforms/Utils/BuildLibCalls.cpp b/llvm/lib/Transforms/Utils/BuildLibCalls.cpp
index 811b9c04906d..dba5403f272a 100644
--- a/llvm/lib/Transforms/Utils/BuildLibCalls.cpp
+++ b/llvm/lib/Transforms/Utils/BuildLibCalls.cpp
@@ -170,16 +170,6 @@ static bool setRetAndArgsNoUndef(Function &F) {
   return setRetNoUndef(F) | setArgsNoUndef(F);
 }
 
-static bool setRetNonNull(Function &F) {
-  assert(F.getReturnType()->isPointerTy() &&
-         "nonnull applies only to pointers");
-  if (F.hasAttribute(AttributeList::ReturnIndex, Attribute::NonNull))
-    return false;
-  F.addAttribute(AttributeList::ReturnIndex, Attribute::NonNull);
-  ++NumNonNull;
-  return true;
-}
-
 static bool setReturnedArg(Function &F, unsigned ArgNo) {
   if (F.hasParamAttribute(ArgNo, Attribute::Returned))
     return false;
@@ -1005,21 +995,6 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) {
     Changed |= setDoesNotCapture(F, 0);
     Changed |= setDoesNotCapture(F, 1);
     return Changed;
-  case LibFunc_Znwj: // new(unsigned int)
-  case LibFunc_Znwm: // new(unsigned long)
-  case LibFunc_Znaj: // new[](unsigned int)
-  case LibFunc_Znam: // new[](unsigned long)
-  case LibFunc_msvc_new_int: // new(unsigned int)
-  case LibFunc_msvc_new_longlong: // new(unsigned long long)
-  case LibFunc_msvc_new_array_int: // new[](unsigned int)
-  case LibFunc_msvc_new_array_longlong: // new[](unsigned long long)
-    Changed |= setOnlyAccessesInaccessibleMemory(F);
-    // Operator new always returns a nonnull noalias pointer
-    Changed |= setRetNoUndef(F);
-    Changed |= setRetNonNull(F);
-    Changed |= setRetDoesNotAlias(F);
-    Changed |= setWillReturn(F);
-    return Changed;
   // TODO: add LibFunc entries for:
   // case LibFunc_memset_pattern4:
   // case LibFunc_memset_pattern8:
diff --git a/llvm/test/Transforms/InferFunctionAttrs/annotate.ll b/llvm/test/Transforms/InferFunctionAttrs/annotate.ll
index 0af18151d6f6..5c6ec0b683ca 100644
--- a/llvm/test/Transforms/InferFunctionAttrs/annotate.ll
+++ b/llvm/test/Transforms/InferFunctionAttrs/annotate.ll
@@ -4,12 +4,6 @@
 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -inferattrs -S | FileCheck -check-prefix=CHECK -check-prefix=CHECK-LINUX %s
 ; RUN: opt < %s -mtriple=nvptx -inferattrs -S | FileCheck -check-prefix=CHECK-NVPTX %s
 
-; operator new routines
-declare i8* @_Znwj(i64 )
-; CHECK: declare noalias noundef nonnull i8* @_Znwj(i64) [[INACCESSIBLEMEMONLY_NOFREE_WILLRETURN:#[0-9]+]]
-declare i8* @_Znwm(i64)
-; CHECK: declare noalias noundef nonnull i8* @_Znwm(i64) [[INACCESSIBLEMEMONLY_NOFREE_WILLRETURN]]
-
 declare i32 @__nvvm_reflect(i8*)
 ; CHECK-NVPTX: declare noundef i32 @__nvvm_reflect(i8* noundef) [[NOFREE_NOUNWIND_READNONE:#[0-9]+]]
 ; CHECK-NVPTX: attributes [[NOFREE_NOUNWIND_READNONE]] = { nofree nounwind readnone }

From 3287b6f9d552f0c542df1c7c0504aad24faf4c53 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@sifive.com>
Date: Mon, 1 Feb 2021 23:53:54 -0800
Subject: [PATCH 087/318] [RISCV] Replace NoX0 SDNodeXForm with a
 ComplexPattern to do the selection of the VL operand.

I think this is a more standard way of doing this.

Reviewed By: rogfer01

Differential Revision: https://reviews.llvm.org/D95833

(cherry picked from commit e7f9a834996f40be8dc46a0b059aa850f1f4ef05)
---
 llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp   |  17 ++
 llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h     |   2 +
 .../Target/RISCV/RISCVInstrInfoVPseudos.td    | 199 +++++++++---------
 3 files changed, 114 insertions(+), 104 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
index 7b0f38671f06..517e714cb59f 100644
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
@@ -946,6 +946,23 @@ bool RISCVDAGToDAGISel::MatchSLLIUW(SDNode *N) const {
   return (VC1 >> VC2) == UINT64_C(0xFFFFFFFF);
 }
 
+// X0 has special meaning for vsetvl/vsetvli.
+//  rd | rs1 |   AVL value | Effect on vl
+//--------------------------------------------------------------
+// !X0 |  X0 |       VLMAX | Set vl to VLMAX
+//  X0 |  X0 | Value in vl | Keep current vl, just change vtype.
+bool RISCVDAGToDAGISel::selectVLOp(SDValue N, SDValue &VL) {
+  // If the VL value is a constant 0, manually select it to an ADDI with 0
+  // immediate to prevent the default selection path from matching it to X0.
+  auto *C = dyn_cast<ConstantSDNode>(N);
+  if (C && C->isNullValue())
+    VL = SDValue(selectImm(CurDAG, SDLoc(N), 0, Subtarget->getXLenVT()), 0);
+  else
+    VL = N;
+
+  return true;
+}
+
 bool RISCVDAGToDAGISel::selectVSplat(SDValue N, SDValue &SplatVal) {
   if (N.getOpcode() != ISD::SPLAT_VECTOR &&
       N.getOpcode() != RISCVISD::SPLAT_VECTOR_I64)
diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h
index 23601c3b8f06..0c58c5379e13 100644
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h
@@ -51,6 +51,8 @@ class RISCVDAGToDAGISel : public SelectionDAGISel {
   bool MatchSROIW(SDNode *N) const;
   bool MatchSLLIUW(SDNode *N) const;
 
+  bool selectVLOp(SDValue N, SDValue &VL);
+
   bool selectVSplat(SDValue N, SDValue &SplatVal);
   bool selectVSplatSimm5(SDValue N, SDValue &SplatVal);
   bool selectVSplatUimm5(SDValue N, SDValue &SplatVal);
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
index 9fdfc2727d86..fa17f2d87eff 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
@@ -42,17 +42,7 @@ def riscv_read_vl : SDNode<"RISCVISD::READ_VL",
 //--------------------------------------------------------------
 // !X0 |  X0 |       VLMAX | Set vl to VLMAX
 //  X0 |  X0 | Value in vl | Keep current vl, just change vtype.
-def NoX0 : SDNodeXForm<undef,
-[{
-  auto *C = dyn_cast<ConstantSDNode>(N);
-  if (C && C->isNullValue()) {
-    SDLoc DL(N);
-    return SDValue(CurDAG->getMachineNode(RISCV::ADDI, DL, Subtarget->getXLenVT(),
-                   CurDAG->getRegister(RISCV::X0, Subtarget->getXLenVT()),
-                   CurDAG->getTargetConstant(0, DL, Subtarget->getXLenVT())), 0);
-  }
-  return SDValue(N, 0);
-}]>;
+def VLOp : ComplexPattern<XLenVT, 1, "selectVLOp">;
 
 def DecImm : SDNodeXForm<imm, [{
   return CurDAG->getTargetConstant(N->getSExtValue() - 1, SDLoc(N),
@@ -1951,10 +1941,10 @@ class VPatUnaryNoMask<string intrinsic_name,
                       VReg op2_reg_class> :
   Pat<(result_type (!cast<Intrinsic>(intrinsic_name)
                    (op2_type op2_reg_class:$rs2),
-                   (XLenVT GPR:$vl))),
+                   (XLenVT (VLOp GPR:$vl)))),
                    (!cast<Instruction>(inst#"_"#kind#"_"#vlmul.MX)
                    (op2_type op2_reg_class:$rs2),
-                   (NoX0 GPR:$vl), sew)>;
+                   GPR:$vl, sew)>;
 
 class VPatUnaryMask<string intrinsic_name,
                     string inst,
@@ -1970,21 +1960,21 @@ class VPatUnaryMask<string intrinsic_name,
                    (result_type result_reg_class:$merge),
                    (op2_type op2_reg_class:$rs2),
                    (mask_type V0),
-                   (XLenVT GPR:$vl))),
+                   (XLenVT (VLOp GPR:$vl)))),
                    (!cast<Instruction>(inst#"_"#kind#"_"#vlmul.MX#"_MASK")
                    (result_type result_reg_class:$merge),
                    (op2_type op2_reg_class:$rs2),
-                   (mask_type V0), (NoX0 GPR:$vl), sew)>;
+                   (mask_type V0), GPR:$vl, sew)>;
 
 class VPatMaskUnaryNoMask<string intrinsic_name,
                           string inst,
                           MTypeInfo mti> :
   Pat<(mti.Mask (!cast<Intrinsic>(intrinsic_name)
                 (mti.Mask VR:$rs2),
-                (XLenVT GPR:$vl))),
+                (XLenVT (VLOp GPR:$vl)))),
                 (!cast<Instruction>(inst#"_M_"#mti.BX)
                 (mti.Mask VR:$rs2),
-                (NoX0 GPR:$vl), mti.SEW)>;
+                GPR:$vl, mti.SEW)>;
 
 class VPatMaskUnaryMask<string intrinsic_name,
                         string inst,
@@ -1993,11 +1983,11 @@ class VPatMaskUnaryMask<string intrinsic_name,
                 (mti.Mask VR:$merge),
                 (mti.Mask VR:$rs2),
                 (mti.Mask V0),
-                (XLenVT GPR:$vl))),
+                (XLenVT (VLOp GPR:$vl)))),
                 (!cast<Instruction>(inst#"_M_"#mti.BX#"_MASK")
                 (mti.Mask VR:$merge),
                 (mti.Mask VR:$rs2),
-                (mti.Mask V0), (NoX0 GPR:$vl), mti.SEW)>;
+                (mti.Mask V0), GPR:$vl, mti.SEW)>;
 
 class VPatUnaryAnyMask<string intrinsic,
                        string inst,
@@ -2013,12 +2003,12 @@ class VPatUnaryAnyMask<string intrinsic,
                    (result_type result_reg_class:$merge),
                    (op1_type op1_reg_class:$rs1),
                    (mask_type VR:$rs2),
-                   (XLenVT GPR:$vl))),
+                   (XLenVT (VLOp GPR:$vl)))),
                    (!cast<Instruction>(inst#"_"#kind#"_"#vlmul.MX)
                    (result_type result_reg_class:$merge),
                    (op1_type op1_reg_class:$rs1),
                    (mask_type VR:$rs2),
-                   (NoX0 GPR:$vl), sew)>;
+                   GPR:$vl, sew)>;
 
 class VPatBinaryNoMask<string intrinsic_name,
                        string inst,
@@ -2031,11 +2021,11 @@ class VPatBinaryNoMask<string intrinsic_name,
   Pat<(result_type (!cast<Intrinsic>(intrinsic_name)
                    (op1_type op1_reg_class:$rs1),
                    (op2_type op2_kind:$rs2),
-                   (XLenVT GPR:$vl))),
+                   (XLenVT (VLOp GPR:$vl)))),
                    (!cast<Instruction>(inst)
                    (op1_type op1_reg_class:$rs1),
                    (op2_type op2_kind:$rs2),
-                   (NoX0 GPR:$vl), sew)>;
+                   GPR:$vl, sew)>;
 
 class VPatBinaryMask<string intrinsic_name,
                      string inst,
@@ -2052,12 +2042,12 @@ class VPatBinaryMask<string intrinsic_name,
                    (op1_type op1_reg_class:$rs1),
                    (op2_type op2_kind:$rs2),
                    (mask_type V0),
-                   (XLenVT GPR:$vl))),
+                   (XLenVT (VLOp GPR:$vl)))),
                    (!cast<Instruction>(inst#"_MASK")
                    (result_type result_reg_class:$merge),
                    (op1_type op1_reg_class:$rs1),
                    (op2_type op2_kind:$rs2),
-                   (mask_type V0), (NoX0 GPR:$vl), sew)>;
+                   (mask_type V0), GPR:$vl, sew)>;
 
 class VPatTernaryNoMask<string intrinsic,
                         string inst,
@@ -2075,12 +2065,12 @@ class VPatTernaryNoMask<string intrinsic,
                     (result_type result_reg_class:$rs3),
                     (op1_type op1_reg_class:$rs1),
                     (op2_type op2_kind:$rs2),
-                    (XLenVT GPR:$vl))),
+                    (XLenVT (VLOp GPR:$vl)))),
                    (!cast<Instruction>(inst#"_"#kind#"_"#vlmul.MX)
                     result_reg_class:$rs3,
                     (op1_type op1_reg_class:$rs1),
                     op2_kind:$rs2,
-                    (NoX0 GPR:$vl), sew)>;
+                    GPR:$vl, sew)>;
 
 class VPatTernaryMask<string intrinsic,
                       string inst,
@@ -2099,13 +2089,13 @@ class VPatTernaryMask<string intrinsic,
                     (op1_type op1_reg_class:$rs1),
                     (op2_type op2_kind:$rs2),
                     (mask_type V0),
-                    (XLenVT GPR:$vl))),
+                    (XLenVT (VLOp GPR:$vl)))),
                    (!cast<Instruction>(inst#"_"#kind#"_"#vlmul.MX # "_MASK")
                     result_reg_class:$rs3,
                     (op1_type op1_reg_class:$rs1),
                     op2_kind:$rs2,
                     (mask_type V0),
-                    (NoX0 GPR:$vl), sew)>;
+                    GPR:$vl, sew)>;
 
 class VPatAMOWDNoMask<string intrinsic_name,
                     string inst,
@@ -2119,10 +2109,10 @@ class VPatAMOWDNoMask<string intrinsic_name,
                     GPR:$rs1,
                     (op1_type op1_reg_class:$vs2),
                     (result_type vlmul.vrclass:$vd),
-                    (XLenVT GPR:$vl))),
+                    (XLenVT (VLOp GPR:$vl)))),
                    (!cast<Instruction>(inst # "_WD_" # vlmul.MX # "_" # emul.MX)
                     $rs1, $vs2, $vd,
-                    (NoX0 GPR:$vl), sew)>;
+                    GPR:$vl, sew)>;
 
 class VPatAMOWDMask<string intrinsic_name,
                     string inst,
@@ -2138,10 +2128,10 @@ class VPatAMOWDMask<string intrinsic_name,
                     (op1_type op1_reg_class:$vs2),
                     (result_type vlmul.vrclass:$vd),
                     (mask_type V0),
-                    (XLenVT GPR:$vl))),
+                    (XLenVT (VLOp GPR:$vl)))),
                    (!cast<Instruction>(inst # "_WD_" # vlmul.MX # "_" # emul.MX # "_MASK")
                     $rs1, $vs2, $vd,
-                    (mask_type V0), (NoX0 GPR:$vl), sew)>;
+                    (mask_type V0), GPR:$vl, sew)>;
 
 multiclass VPatUSLoad<string intrinsic,
                       string inst,
@@ -2153,14 +2143,14 @@ multiclass VPatUSLoad<string intrinsic,
 {
     defvar Intr = !cast<Intrinsic>(intrinsic);
     defvar Pseudo = !cast<Instruction>(inst#"_V_"#vlmul.MX);
-    def : Pat<(type (Intr GPR:$rs1, GPR:$vl)),
-                    (Pseudo $rs1, (NoX0 GPR:$vl), sew)>;
+    def : Pat<(type (Intr GPR:$rs1, (XLenVT (VLOp GPR:$vl)))),
+                    (Pseudo $rs1, GPR:$vl, sew)>;
     defvar IntrMask = !cast<Intrinsic>(intrinsic # "_mask");
     defvar PseudoMask = !cast<Instruction>(inst#"_V_"#vlmul.MX#"_MASK");
     def : Pat<(type (IntrMask (type GetVRegNoV0<reg_class>.R:$merge),
-                               GPR:$rs1, (mask_type V0), GPR:$vl)),
+                               GPR:$rs1, (mask_type V0), (XLenVT (VLOp GPR:$vl)))),
                     (PseudoMask $merge,
-                                $rs1, (mask_type V0), (NoX0 GPR:$vl), sew)>;
+                                $rs1, (mask_type V0), GPR:$vl, sew)>;
 }
 
 multiclass VPatUSLoadFF<string inst,
@@ -2171,13 +2161,13 @@ multiclass VPatUSLoadFF<string inst,
                         VReg reg_class>
 {
     defvar Pseudo = !cast<Instruction>(inst#"_V_"#vlmul.MX);
-    def : Pat<(type (riscv_vleff GPR:$rs1, GPR:$vl)),
-                    (Pseudo $rs1, (NoX0 GPR:$vl), sew)>;
+    def : Pat<(type (riscv_vleff GPR:$rs1, (XLenVT (VLOp GPR:$vl)))),
+                    (Pseudo $rs1, GPR:$vl, sew)>;
     defvar PseudoMask = !cast<Instruction>(inst#"_V_"#vlmul.MX#"_MASK");
     def : Pat<(type (riscv_vleff_mask (type GetVRegNoV0<reg_class>.R:$merge),
-                                      GPR:$rs1, (mask_type V0), GPR:$vl)),
+                                      GPR:$rs1, (mask_type V0), (XLenVT (VLOp GPR:$vl)))),
                     (PseudoMask $merge,
-                                $rs1, (mask_type V0), (NoX0 GPR:$vl), sew)>;
+                                $rs1, (mask_type V0), GPR:$vl, sew)>;
 }
 
 multiclass VPatSLoad<string intrinsic,
@@ -2190,14 +2180,14 @@ multiclass VPatSLoad<string intrinsic,
 {
     defvar Intr = !cast<Intrinsic>(intrinsic);
     defvar Pseudo = !cast<Instruction>(inst#"_V_"#vlmul.MX);
-    def : Pat<(type (Intr GPR:$rs1, GPR:$rs2, GPR:$vl)),
-                    (Pseudo $rs1, $rs2, (NoX0 GPR:$vl), sew)>;
+    def : Pat<(type (Intr GPR:$rs1, GPR:$rs2, (XLenVT (VLOp GPR:$vl)))),
+                    (Pseudo $rs1, $rs2, GPR:$vl, sew)>;
     defvar IntrMask = !cast<Intrinsic>(intrinsic # "_mask");
     defvar PseudoMask = !cast<Instruction>(inst#"_V_"#vlmul.MX#"_MASK");
     def : Pat<(type (IntrMask (type GetVRegNoV0<reg_class>.R:$merge),
-                               GPR:$rs1, GPR:$rs2, (mask_type V0), GPR:$vl)),
+                               GPR:$rs1, GPR:$rs2, (mask_type V0), (XLenVT (VLOp GPR:$vl)))),
                     (PseudoMask $merge,
-                                $rs1, $rs2, (mask_type V0), (NoX0 GPR:$vl), sew)>;
+                                $rs1, $rs2, (mask_type V0), GPR:$vl, sew)>;
 }
 
 multiclass VPatILoad<string intrinsic,
@@ -2213,16 +2203,16 @@ multiclass VPatILoad<string intrinsic,
 {
     defvar Intr = !cast<Intrinsic>(intrinsic);
     defvar Pseudo = !cast<Instruction>(inst#"_V_"#idx_vlmul.MX#"_"#vlmul.MX);
-    def : Pat<(type (Intr GPR:$rs1, (idx_type idx_reg_class:$rs2), GPR:$vl)),
-                    (Pseudo $rs1, $rs2, (NoX0 GPR:$vl), sew)>;
+    def : Pat<(type (Intr GPR:$rs1, (idx_type idx_reg_class:$rs2), (XLenVT (VLOp GPR:$vl)))),
+                    (Pseudo $rs1, $rs2, GPR:$vl, sew)>;
 
     defvar IntrMask = !cast<Intrinsic>(intrinsic # "_mask");
     defvar PseudoMask = !cast<Instruction>(inst#"_V_"#idx_vlmul.MX#"_"#vlmul.MX#"_MASK");
     def : Pat<(type (IntrMask (type GetVRegNoV0<reg_class>.R:$merge),
                                GPR:$rs1, (idx_type idx_reg_class:$rs2),
-                               (mask_type V0), GPR:$vl)),
+                               (mask_type V0), (XLenVT (VLOp GPR:$vl)))),
                     (PseudoMask $merge,
-                                $rs1, $rs2, (mask_type V0), (NoX0 GPR:$vl), sew)>;
+                                $rs1, $rs2, (mask_type V0), GPR:$vl, sew)>;
 }
 
 multiclass VPatUSStore<string intrinsic,
@@ -2235,12 +2225,12 @@ multiclass VPatUSStore<string intrinsic,
 {
     defvar Intr = !cast<Intrinsic>(intrinsic);
     defvar Pseudo = !cast<Instruction>(inst#"_V_"#vlmul.MX);
-    def : Pat<(Intr (type reg_class:$rs3), GPR:$rs1, GPR:$vl),
-                    (Pseudo $rs3, $rs1, (NoX0 GPR:$vl), sew)>;
+    def : Pat<(Intr (type reg_class:$rs3), GPR:$rs1, (XLenVT (VLOp GPR:$vl))),
+                    (Pseudo $rs3, $rs1, GPR:$vl, sew)>;
     defvar IntrMask = !cast<Intrinsic>(intrinsic # "_mask");
     defvar PseudoMask = !cast<Instruction>(inst#"_V_"#vlmul.MX#"_MASK");
-    def : Pat<(IntrMask (type reg_class:$rs3), GPR:$rs1, (mask_type V0), GPR:$vl),
-              (PseudoMask $rs3, $rs1, (mask_type V0), (NoX0 GPR:$vl), sew)>;
+    def : Pat<(IntrMask (type reg_class:$rs3), GPR:$rs1, (mask_type V0), (XLenVT (VLOp GPR:$vl))),
+              (PseudoMask $rs3, $rs1, (mask_type V0), GPR:$vl, sew)>;
 }
 
 multiclass VPatSStore<string intrinsic,
@@ -2253,12 +2243,12 @@ multiclass VPatSStore<string intrinsic,
 {
     defvar Intr = !cast<Intrinsic>(intrinsic);
     defvar Pseudo = !cast<Instruction>(inst#"_V_"#vlmul.MX);
-    def : Pat<(Intr (type reg_class:$rs3), GPR:$rs1, GPR:$rs2, GPR:$vl),
-                    (Pseudo $rs3, $rs1, $rs2, (NoX0 GPR:$vl), sew)>;
+    def : Pat<(Intr (type reg_class:$rs3), GPR:$rs1, GPR:$rs2, (XLenVT (VLOp GPR:$vl))),
+                    (Pseudo $rs3, $rs1, $rs2, GPR:$vl, sew)>;
     defvar IntrMask = !cast<Intrinsic>(intrinsic # "_mask");
     defvar PseudoMask = !cast<Instruction>(inst#"_V_"#vlmul.MX#"_MASK");
-    def : Pat<(IntrMask (type reg_class:$rs3), GPR:$rs1, GPR:$rs2, (mask_type V0), GPR:$vl),
-              (PseudoMask $rs3, $rs1, $rs2, (mask_type V0), (NoX0 GPR:$vl), sew)>;
+    def : Pat<(IntrMask (type reg_class:$rs3), GPR:$rs1, GPR:$rs2, (mask_type V0), (XLenVT (VLOp GPR:$vl))),
+              (PseudoMask $rs3, $rs1, $rs2, (mask_type V0), GPR:$vl, sew)>;
 }
 
 multiclass VPatIStore<string intrinsic,
@@ -2275,13 +2265,13 @@ multiclass VPatIStore<string intrinsic,
     defvar Intr = !cast<Intrinsic>(intrinsic);
     defvar Pseudo = !cast<Instruction>(inst#"_V_"#idx_vlmul.MX#"_"#vlmul.MX);
     def : Pat<(Intr (type reg_class:$rs3), GPR:$rs1,
-                    (idx_type idx_reg_class:$rs2), GPR:$vl),
-              (Pseudo $rs3, $rs1, $rs2, (NoX0 GPR:$vl), sew)>;
+                    (idx_type idx_reg_class:$rs2), (XLenVT (VLOp GPR:$vl))),
+              (Pseudo $rs3, $rs1, $rs2, GPR:$vl, sew)>;
     defvar IntrMask = !cast<Intrinsic>(intrinsic # "_mask");
     defvar PseudoMask = !cast<Instruction>(inst#"_V_"#idx_vlmul.MX#"_"#vlmul.MX#"_MASK");
     def : Pat<(IntrMask (type reg_class:$rs3), GPR:$rs1,
-                        (idx_type idx_reg_class:$rs2), (mask_type V0), GPR:$vl),
-              (PseudoMask $rs3, $rs1, $rs2, (mask_type V0), (NoX0 GPR:$vl), sew)>;
+                        (idx_type idx_reg_class:$rs2), (mask_type V0), (XLenVT (VLOp GPR:$vl))),
+              (PseudoMask $rs3, $rs1, $rs2, (mask_type V0), GPR:$vl, sew)>;
 }
 
 multiclass VPatUnaryS_M<string intrinsic_name,
@@ -2289,13 +2279,13 @@ multiclass VPatUnaryS_M<string intrinsic_name,
 {
   foreach mti = AllMasks in {
     def : Pat<(XLenVT (!cast<Intrinsic>(intrinsic_name)
-                      (mti.Mask VR:$rs1), GPR:$vl)),
+                      (mti.Mask VR:$rs1), (XLenVT (VLOp GPR:$vl)))),
                       (!cast<Instruction>(inst#"_M_"#mti.BX) $rs1,
-                      (NoX0 GPR:$vl), mti.SEW)>;
+                      GPR:$vl, mti.SEW)>;
     def : Pat<(XLenVT (!cast<Intrinsic>(intrinsic_name # "_mask")
-                      (mti.Mask VR:$rs1), (mti.Mask V0), GPR:$vl)),
+                      (mti.Mask VR:$rs1), (mti.Mask V0), (XLenVT (VLOp GPR:$vl)))),
                       (!cast<Instruction>(inst#"_M_"#mti.BX#"_MASK") $rs1,
-                      (mti.Mask V0), (NoX0 GPR:$vl), mti.SEW)>;
+                      (mti.Mask V0), GPR:$vl, mti.SEW)>;
   }
 }
 
@@ -2360,24 +2350,24 @@ multiclass VPatNullaryV<string intrinsic, string instruction>
 {
   foreach vti = AllIntegerVectors in {
     def : Pat<(vti.Vector (!cast<Intrinsic>(intrinsic)
-                          (XLenVT GPR:$vl))),
+                          (XLenVT (VLOp GPR:$vl)))),
                           (!cast<Instruction>(instruction#"_V_" # vti.LMul.MX)
-                          (NoX0 GPR:$vl), vti.SEW)>;
+                          GPR:$vl, vti.SEW)>;
     def : Pat<(vti.Vector (!cast<Intrinsic>(intrinsic # "_mask")
                           (vti.Vector vti.RegClass:$merge),
-                          (vti.Mask V0), (XLenVT GPR:$vl))),
+                          (vti.Mask V0), (XLenVT (VLOp GPR:$vl)))),
                           (!cast<Instruction>(instruction#"_V_" # vti.LMul.MX # "_MASK")
                           vti.RegClass:$merge, (vti.Mask V0),
-                          (NoX0 GPR:$vl), vti.SEW)>;
+                          GPR:$vl, vti.SEW)>;
   }
 }
 
 multiclass VPatNullaryM<string intrinsic, string inst> {
   foreach mti = AllMasks in
     def : Pat<(mti.Mask (!cast<Intrinsic>(intrinsic)
-                        (XLenVT GPR:$vl))),
+                        (XLenVT (VLOp GPR:$vl)))),
                         (!cast<Instruction>(inst#"_M_"#mti.BX)
-                        (NoX0 GPR:$vl), mti.SEW)>;
+                        GPR:$vl, mti.SEW)>;
 }
 
 multiclass VPatBinary<string intrinsic,
@@ -2414,11 +2404,11 @@ multiclass VPatBinaryCarryIn<string intrinsic,
                          (op1_type op1_reg_class:$rs1),
                          (op2_type op2_kind:$rs2),
                          (mask_type V0),
-                         (XLenVT GPR:$vl))),
+                         (XLenVT (VLOp GPR:$vl)))),
                          (!cast<Instruction>(inst#"_"#kind#"_"#vlmul.MX)
                          (op1_type op1_reg_class:$rs1),
                          (op2_type op2_kind:$rs2),
-                         (mask_type V0), (NoX0 GPR:$vl), sew)>;
+                         (mask_type V0), GPR:$vl, sew)>;
 }
 
 multiclass VPatBinaryMaskOut<string intrinsic,
@@ -2435,11 +2425,11 @@ multiclass VPatBinaryMaskOut<string intrinsic,
   def : Pat<(result_type (!cast<Intrinsic>(intrinsic)
                          (op1_type op1_reg_class:$rs1),
                          (op2_type op2_kind:$rs2),
-                         (XLenVT GPR:$vl))),
+                         (XLenVT (VLOp GPR:$vl)))),
                          (!cast<Instruction>(inst#"_"#kind#"_"#vlmul.MX)
                          (op1_type op1_reg_class:$rs1),
                          (op2_type op2_kind:$rs2),
-                         (NoX0 GPR:$vl), sew)>;
+                         GPR:$vl, sew)>;
 }
 
 multiclass VPatConversion<string intrinsic,
@@ -3886,62 +3876,63 @@ defm "" : VPatBinaryM_VX_VI<"int_riscv_vmsgt", "PseudoVMSGT", AllIntegerVectors>
 // instruction.
 foreach vti = AllIntegerVectors in {
   def : Pat<(vti.Mask (int_riscv_vmslt (vti.Vector vti.RegClass:$rs1),
-                                       (vti.Scalar simm5_plus1:$rs2), GPR:$vl)),
+                                       (vti.Scalar simm5_plus1:$rs2), (XLenVT (VLOp GPR:$vl)))),
             (!cast<Instruction>("PseudoVMSLE_VI_"#vti.LMul.MX) vti.RegClass:$rs1,
                                                                (DecImm simm5_plus1:$rs2),
-                                                               (NoX0 GPR:$vl),
+                                                               GPR:$vl,
                                                                vti.SEW)>;
   def : Pat<(vti.Mask (int_riscv_vmslt_mask (vti.Mask V0),
                                             (vti.Vector vti.RegClass:$rs1),
                                             (vti.Scalar simm5_plus1:$rs2),
                                             (vti.Mask VR:$merge),
-                                            GPR:$vl)),
+                                            (XLenVT (VLOp GPR:$vl)))),
             (!cast<Instruction>("PseudoVMSLE_VI_"#vti.LMul.MX#"_MASK")
                                                       VR:$merge,
                                                       vti.RegClass:$rs1,
                                                       (DecImm simm5_plus1:$rs2),
                                                       (vti.Mask V0),
-                                                      (NoX0 GPR:$vl),
+                                                      GPR:$vl,
                                                       vti.SEW)>;
 
-  def : Pat<(vti.Mask (int_riscv_vmsltu (vti.Vector vti.RegClass:$rs1),
-                                        (vti.Scalar simm5_plus1:$rs2), GPR:$vl)),
+ def : Pat<(vti.Mask (int_riscv_vmsltu (vti.Vector vti.RegClass:$rs1),
+                                        (vti.Scalar simm5_plus1:$rs2),
+                                        (XLenVT (VLOp GPR:$vl)))),
             (!cast<Instruction>("PseudoVMSLEU_VI_"#vti.LMul.MX) vti.RegClass:$rs1,
                                                                 (DecImm simm5_plus1:$rs2),
-                                                                (NoX0 GPR:$vl),
+                                                                GPR:$vl,
                                                                 vti.SEW)>;
   def : Pat<(vti.Mask (int_riscv_vmsltu_mask (vti.Mask V0),
                                              (vti.Vector vti.RegClass:$rs1),
                                              (vti.Scalar simm5_plus1:$rs2),
                                              (vti.Mask VR:$merge),
-                                             GPR:$vl)),
+                                             (XLenVT (VLOp GPR:$vl)))),
             (!cast<Instruction>("PseudoVMSLEU_VI_"#vti.LMul.MX#"_MASK")
                                                       VR:$merge,
                                                       vti.RegClass:$rs1,
                                                       (DecImm simm5_plus1:$rs2),
                                                       (vti.Mask V0),
-                                                      (NoX0 GPR:$vl),
+                                                      GPR:$vl,
                                                       vti.SEW)>;
 
   // Special cases to avoid matching vmsltu.vi 0 (always false) to
   // vmsleu.vi -1 (always true). Instead match to vmsne.vv.
   def : Pat<(vti.Mask (int_riscv_vmsltu (vti.Vector vti.RegClass:$rs1),
-                                        (vti.Scalar 0), GPR:$vl)),
+                                        (vti.Scalar 0), (XLenVT (VLOp GPR:$vl)))),
             (!cast<Instruction>("PseudoVMSNE_VV_"#vti.LMul.MX) vti.RegClass:$rs1,
                                                                vti.RegClass:$rs1,
-                                                               (NoX0 GPR:$vl),
+                                                               GPR:$vl,
                                                                vti.SEW)>;
   def : Pat<(vti.Mask (int_riscv_vmsltu_mask (vti.Mask V0),
                                             (vti.Vector vti.RegClass:$rs1),
                                             (vti.Scalar 0),
                                             (vti.Mask VR:$merge),
-                                            GPR:$vl)),
+                                            (XLenVT (VLOp GPR:$vl)))),
             (!cast<Instruction>("PseudoVMSNE_VV_"#vti.LMul.MX#"_MASK")
                                                      VR:$merge,
                                                      vti.RegClass:$rs1,
                                                      vti.RegClass:$rs1,
                                                      (vti.Mask V0),
-                                                     (NoX0 GPR:$vl),
+                                                     GPR:$vl,
                                                      vti.SEW)>;
 }
 
@@ -4002,18 +3993,18 @@ defm "" : VPatBinaryV_VM_XM_IM<"int_riscv_vmerge", "PseudoVMERGE">;
 //===----------------------------------------------------------------------===//
 foreach vti = AllVectors in {
   def : Pat<(vti.Vector (int_riscv_vmv_v_v (vti.Vector vti.RegClass:$rs1),
-                                           GPR:$vl)),
+                                           (XLenVT (VLOp GPR:$vl)))),
             (!cast<Instruction>("PseudoVMV_V_V_"#vti.LMul.MX)
-             $rs1, (NoX0 GPR:$vl), vti.SEW)>;
+             $rs1, GPR:$vl, vti.SEW)>;
 }
 
 foreach vti = AllIntegerVectors in {
-  def : Pat<(vti.Vector (int_riscv_vmv_v_x GPR:$rs2, GPR:$vl)),
+  def : Pat<(vti.Vector (int_riscv_vmv_v_x GPR:$rs2, (XLenVT (VLOp GPR:$vl)))),
             (!cast<Instruction>("PseudoVMV_V_X_"#vti.LMul.MX)
-             $rs2, (NoX0 GPR:$vl), vti.SEW)>;
-  def : Pat<(vti.Vector (int_riscv_vmv_v_x simm5:$imm5, GPR:$vl)),
+             $rs2, GPR:$vl, vti.SEW)>;
+  def : Pat<(vti.Vector (int_riscv_vmv_v_x simm5:$imm5, (XLenVT (VLOp GPR:$vl)))),
             (!cast<Instruction>("PseudoVMV_V_I_"#vti.LMul.MX)
-             simm5:$imm5, (NoX0 GPR:$vl), vti.SEW)>;
+             simm5:$imm5, GPR:$vl, vti.SEW)>;
 }
 
 //===----------------------------------------------------------------------===//
@@ -4157,8 +4148,8 @@ foreach fvti = AllFloatVectors in {
   defvar instr = !cast<Instruction>("PseudoVMERGE_VIM_"#fvti.LMul.MX);
   def : Pat<(fvti.Vector (int_riscv_vfmerge (fvti.Vector fvti.RegClass:$rs2),
                                             (fvti.Scalar (fpimm0)),
-                                            (fvti.Mask V0), (XLenVT GPR:$vl))),
-            (instr fvti.RegClass:$rs2, 0, (fvti.Mask V0), (NoX0 GPR:$vl), fvti.SEW)>;
+                                            (fvti.Mask V0), (XLenVT (VLOp GPR:$vl)))),
+            (instr fvti.RegClass:$rs2, 0, (fvti.Mask V0), GPR:$vl, fvti.SEW)>;
 }
 
 //===----------------------------------------------------------------------===//
@@ -4167,16 +4158,16 @@ foreach fvti = AllFloatVectors in {
 foreach fvti = AllFloatVectors in {
   // If we're splatting fpimm0, use vmv.v.x vd, x0.
   def : Pat<(fvti.Vector (int_riscv_vfmv_v_f
-                         (fvti.Scalar (fpimm0)), GPR:$vl)),
+                         (fvti.Scalar (fpimm0)), (XLenVT (VLOp GPR:$vl)))),
             (!cast<Instruction>("PseudoVMV_V_I_"#fvti.LMul.MX)
-             0, (NoX0 GPR:$vl), fvti.SEW)>;
+             0, GPR:$vl, fvti.SEW)>;
 
   def : Pat<(fvti.Vector (int_riscv_vfmv_v_f
-                         (fvti.Scalar fvti.ScalarRegClass:$rs2), GPR:$vl)),
+                         (fvti.Scalar fvti.ScalarRegClass:$rs2), (XLenVT (VLOp GPR:$vl)))),
             (!cast<Instruction>("PseudoVFMV_V_" # fvti.ScalarSuffix # "_" #
                                 fvti.LMul.MX)
              (fvti.Scalar fvti.ScalarRegClass:$rs2),
-             (NoX0 GPR:$vl), fvti.SEW)>;
+             GPR:$vl, fvti.SEW)>;
 }
 
 //===----------------------------------------------------------------------===//
@@ -4321,9 +4312,9 @@ foreach vti = AllIntegerVectors in {
   def : Pat<(riscv_vmv_x_s (vti.Vector vti.RegClass:$rs2)),
             (!cast<Instruction>("PseudoVMV_X_S_" # vti.LMul.MX) $rs2, vti.SEW)>;
   def : Pat<(vti.Vector (int_riscv_vmv_s_x (vti.Vector vti.RegClass:$rs1),
-                                           GPR:$rs2, GPR:$vl)),
+                                           GPR:$rs2, (XLenVT (VLOp GPR:$vl)))),
             (!cast<Instruction>("PseudoVMV_S_X_" # vti.LMul.MX)
-             (vti.Vector $rs1), $rs2, (NoX0 GPR:$vl), vti.SEW)>;
+             (vti.Vector $rs1), $rs2, GPR:$vl, vti.SEW)>;
 }
 } // Predicates = [HasStdExtV]
 
@@ -4339,12 +4330,12 @@ foreach fvti = AllFloatVectors in {
                          (instr $rs2, fvti.SEW)>;
 
   def : Pat<(fvti.Vector (int_riscv_vfmv_s_f (fvti.Vector fvti.RegClass:$rs1),
-                         (fvti.Scalar fvti.ScalarRegClass:$rs2), GPR:$vl)),
+                         (fvti.Scalar fvti.ScalarRegClass:$rs2), (XLenVT (VLOp GPR:$vl)))),
             (!cast<Instruction>("PseudoVFMV_S_"#fvti.ScalarSuffix#"_" #
                                 fvti.LMul.MX)
              (fvti.Vector $rs1),
              (fvti.Scalar fvti.ScalarRegClass:$rs2),
-             (NoX0 GPR:$vl), fvti.SEW)>;
+             GPR:$vl, fvti.SEW)>;
 }
 } // Predicates = [HasStdExtV, HasStdExtF]
 

From ef27138bb6b59ac28b28efdd1e192724ad94a1fa Mon Sep 17 00:00:00 2001
From: Hsiangkai Wang <kai.wang@sifive.com>
Date: Mon, 1 Feb 2021 16:08:46 +0800
Subject: [PATCH 088/318] [RISCV] Add new vector instructions in v0.10.

* Add new vector instructions in v0.10.
 - load/store for mask value vle1.v vse1.v
 - vsetivli for 0-31 immediate vector length.
* Rename vector instructions in v0.10.
 - vfrsqrte7 -> vfrsqrt7
 - vfrece7 -> vfrec7
* Reserve memory width encodings for EEW>128b.

Differential Revision: https://reviews.llvm.org/D95781

(cherry picked from commit c7189ba78578d029e0162720319de3c1c6fc348b)
---
 llvm/include/llvm/IR/IntrinsicsRISCV.td       |    7 +-
 llvm/lib/Target/RISCV/RISCVCleanupVSETVLI.cpp |   37 +-
 .../Target/RISCV/RISCVExpandPseudoInsts.cpp   |   11 +-
 llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp   |   14 +-
 llvm/lib/Target/RISCV/RISCVInstrFormatsV.td   |   23 +-
 llvm/lib/Target/RISCV/RISCVInstrInfoV.td      |   98 +-
 .../Target/RISCV/RISCVInstrInfoVPseudos.td    |   38 +-
 .../CodeGen/RISCV/rvv/cleanup-vsetivli.mir    |   46 +
 .../RISCV/rvv/rv32-vsetvli-intrinsics.ll      |    3 +-
 .../RISCV/rvv/rv64-vsetvli-intrinsics.ll      |    3 +-
 llvm/test/CodeGen/RISCV/rvv/vfrec7-rv32.ll    |  602 ++++++
 llvm/test/CodeGen/RISCV/rvv/vfrec7-rv64.ll    |  602 ++++++
 llvm/test/CodeGen/RISCV/rvv/vfrsqrt7-rv32.ll  |  602 ++++++
 llvm/test/CodeGen/RISCV/rvv/vfrsqrt7-rv64.ll  |  602 ++++++
 llvm/test/CodeGen/RISCV/rvv/vle1-rv32.ll      |   94 +
 llvm/test/CodeGen/RISCV/rvv/vle1-rv64.ll      |   94 +
 llvm/test/CodeGen/RISCV/rvv/vse1-rv32.ll      |   94 +
 llvm/test/CodeGen/RISCV/rvv/vse1-rv64.ll      |   94 +
 llvm/test/CodeGen/RISCV/vfrece7-rv32.ll       |  602 ------
 llvm/test/CodeGen/RISCV/vfrece7-rv64.ll       |  602 ------
 llvm/test/CodeGen/RISCV/vfrsqrte7-rv32.ll     |  602 ------
 llvm/test/CodeGen/RISCV/vfrsqrte7-rv64.ll     |  602 ------
 llvm/test/MC/RISCV/rvv/fothers.s              |   16 +-
 llvm/test/MC/RISCV/rvv/invalid.s              |    9 +
 llvm/test/MC/RISCV/rvv/load.s                 |  246 +--
 llvm/test/MC/RISCV/rvv/store.s                |  102 +-
 llvm/test/MC/RISCV/rvv/vsetvl.s               |   18 +
 llvm/test/MC/RISCV/rvv/zvlsseg.s              | 1680 -----------------
 28 files changed, 3003 insertions(+), 4540 deletions(-)
 create mode 100644 llvm/test/CodeGen/RISCV/rvv/cleanup-vsetivli.mir
 create mode 100644 llvm/test/CodeGen/RISCV/rvv/vfrec7-rv32.ll
 create mode 100644 llvm/test/CodeGen/RISCV/rvv/vfrec7-rv64.ll
 create mode 100644 llvm/test/CodeGen/RISCV/rvv/vfrsqrt7-rv32.ll
 create mode 100644 llvm/test/CodeGen/RISCV/rvv/vfrsqrt7-rv64.ll
 create mode 100644 llvm/test/CodeGen/RISCV/rvv/vle1-rv32.ll
 create mode 100644 llvm/test/CodeGen/RISCV/rvv/vle1-rv64.ll
 create mode 100644 llvm/test/CodeGen/RISCV/rvv/vse1-rv32.ll
 create mode 100644 llvm/test/CodeGen/RISCV/rvv/vse1-rv64.ll
 delete mode 100644 llvm/test/CodeGen/RISCV/vfrece7-rv32.ll
 delete mode 100644 llvm/test/CodeGen/RISCV/vfrece7-rv64.ll
 delete mode 100644 llvm/test/CodeGen/RISCV/vfrsqrte7-rv32.ll
 delete mode 100644 llvm/test/CodeGen/RISCV/vfrsqrte7-rv64.ll

diff --git a/llvm/include/llvm/IR/IntrinsicsRISCV.td b/llvm/include/llvm/IR/IntrinsicsRISCV.td
index ab5b09b72ac3..c4056895f68e 100644
--- a/llvm/include/llvm/IR/IntrinsicsRISCV.td
+++ b/llvm/include/llvm/IR/IntrinsicsRISCV.td
@@ -790,6 +790,9 @@ let TargetPrefix = "riscv" in {
   defm vsoxei : RISCVIStore;
   defm vsuxei : RISCVIStore;
 
+  def int_riscv_vle1 : RISCVUSLoad;
+  def int_riscv_vse1 : RISCVUSStore;
+
   defm vamoswap : RISCVAMO;
   defm vamoadd : RISCVAMO;
   defm vamoxor : RISCVAMO;
@@ -940,8 +943,8 @@ let TargetPrefix = "riscv" in {
   defm vfwnmsac : RISCVTernaryWide;
 
   defm vfsqrt : RISCVUnaryAA;
-  defm vfrsqrte7 : RISCVUnaryAA;
-  defm vfrece7 : RISCVUnaryAA;
+  defm vfrsqrt7 : RISCVUnaryAA;
+  defm vfrec7 : RISCVUnaryAA;
 
   defm vfmin : RISCVBinaryAAX;
   defm vfmax : RISCVBinaryAAX;
diff --git a/llvm/lib/Target/RISCV/RISCVCleanupVSETVLI.cpp b/llvm/lib/Target/RISCV/RISCVCleanupVSETVLI.cpp
index 6a12f99b8903..ae32cbd1ae59 100644
--- a/llvm/lib/Target/RISCV/RISCVCleanupVSETVLI.cpp
+++ b/llvm/lib/Target/RISCV/RISCVCleanupVSETVLI.cpp
@@ -59,7 +59,8 @@ bool RISCVCleanupVSETVLI::runOnMachineBasicBlock(MachineBasicBlock &MBB) {
   for (auto MII = MBB.begin(), MIE = MBB.end(); MII != MIE;) {
     MachineInstr &MI = *MII++;
 
-    if (MI.getOpcode() != RISCV::PseudoVSETVLI) {
+    if (MI.getOpcode() != RISCV::PseudoVSETVLI &&
+        MI.getOpcode() != RISCV::PseudoVSETIVLI) {
       if (PrevVSETVLI &&
           (MI.isCall() || MI.modifiesRegister(RISCV::VL) ||
            MI.modifiesRegister(RISCV::VTYPE))) {
@@ -69,26 +70,48 @@ bool RISCVCleanupVSETVLI::runOnMachineBasicBlock(MachineBasicBlock &MBB) {
       continue;
     }
 
-    // If we don't have a previous VSETVLI or the VL output isn't dead, we
+    // If we don't have a previous VSET{I}VLI or the VL output isn't dead, we
     // can't remove this VSETVLI.
     if (!PrevVSETVLI || !MI.getOperand(0).isDead()) {
       PrevVSETVLI = &MI;
       continue;
     }
 
-    Register PrevAVLReg = PrevVSETVLI->getOperand(1).getReg();
-    Register AVLReg = MI.getOperand(1).getReg();
+    // If a previous "set vl" instruction opcode is different from this one, we
+    // can't differentiate the AVL values.
+    if (PrevVSETVLI->getOpcode() != MI.getOpcode()) {
+      PrevVSETVLI = &MI;
+      continue;
+    }
+
+    // The remaining two cases are
+    // 1. PrevVSETVLI = PseudoVSETVLI
+    //    MI = PseudoVSETVLI
+    //
+    // 2. PrevVSETVLI = PseudoVSETIVLI
+    //    MI = PseudoVSETIVLI
+    Register AVLReg;
+    bool SameAVL = false;
+    if (MI.getOpcode() == RISCV::PseudoVSETVLI) {
+      AVLReg = MI.getOperand(1).getReg();
+      SameAVL = PrevVSETVLI->getOperand(1).getReg() == AVLReg;
+    } else { // RISCV::PseudoVSETIVLI
+      SameAVL =
+          PrevVSETVLI->getOperand(1).getImm() == MI.getOperand(1).getImm();
+    }
     int64_t PrevVTYPEImm = PrevVSETVLI->getOperand(2).getImm();
     int64_t VTYPEImm = MI.getOperand(2).getImm();
 
-    // Does this VSETVLI use the same AVL register and VTYPE immediate?
-    if (PrevAVLReg != AVLReg || PrevVTYPEImm != VTYPEImm) {
+    // Does this VSET{I}VLI use the same AVL register/value and VTYPE immediate?
+    if (!SameAVL || PrevVTYPEImm != VTYPEImm) {
       PrevVSETVLI = &MI;
       continue;
     }
 
     // If the AVLReg is X0 we need to look at the output VL of both VSETVLIs.
-    if (AVLReg == RISCV::X0) {
+    if ((MI.getOpcode() == RISCV::PseudoVSETVLI) && (AVLReg == RISCV::X0)) {
+      assert((PrevVSETVLI->getOpcode() == RISCV::PseudoVSETVLI) &&
+             "Unexpected vsetvli opcode.");
       Register PrevOutVL = PrevVSETVLI->getOperand(0).getReg();
       Register OutVL = MI.getOperand(0).getReg();
       // We can't remove if the previous VSETVLI left VL unchanged and the
diff --git a/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp b/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp
index 5f50892ca886..ec9a39569952 100644
--- a/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp
@@ -103,6 +103,7 @@ bool RISCVExpandPseudo::expandMI(MachineBasicBlock &MBB,
   case RISCV::PseudoLA_TLS_GD:
     return expandLoadTLSGDAddress(MBB, MBBI, NextMBBI);
   case RISCV::PseudoVSETVLI:
+  case RISCV::PseudoVSETIVLI:
     return expandVSetVL(MBB, MBBI);
   case RISCV::PseudoVMCLR_M_B1:
   case RISCV::PseudoVMCLR_M_B2:
@@ -217,9 +218,15 @@ bool RISCVExpandPseudo::expandVSetVL(MachineBasicBlock &MBB,
 
   DebugLoc DL = MBBI->getDebugLoc();
 
-  assert(MBBI->getOpcode() == RISCV::PseudoVSETVLI &&
+  assert((MBBI->getOpcode() == RISCV::PseudoVSETVLI ||
+          MBBI->getOpcode() == RISCV::PseudoVSETIVLI) &&
          "Unexpected pseudo instruction");
-  const MCInstrDesc &Desc = TII->get(RISCV::VSETVLI);
+  unsigned Opcode;
+  if (MBBI->getOpcode() == RISCV::PseudoVSETVLI)
+    Opcode = RISCV::VSETVLI;
+  else
+    Opcode = RISCV::VSETIVLI;
+  const MCInstrDesc &Desc = TII->get(Opcode);
   assert(Desc.getNumOperands() == 3 && "Unexpected instruction format");
 
   Register DstReg = MBBI->getOperand(0).getReg();
diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
index 517e714cb59f..2121cc38f661 100644
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
@@ -569,12 +569,14 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
 
       SDValue VLOperand = Node->getOperand(2);
       if (auto *C = dyn_cast<ConstantSDNode>(VLOperand)) {
-        if (C->isNullValue()) {
-          VLOperand = SDValue(
-              CurDAG->getMachineNode(RISCV::ADDI, DL, XLenVT,
-                                     CurDAG->getRegister(RISCV::X0, XLenVT),
-                                     CurDAG->getTargetConstant(0, DL, XLenVT)),
-              0);
+        uint64_t AVL = C->getZExtValue();
+        if (isUInt<5>(AVL)) {
+          SDValue VLImm = CurDAG->getTargetConstant(AVL, DL, XLenVT);
+          ReplaceNode(Node,
+                      CurDAG->getMachineNode(RISCV::PseudoVSETIVLI, DL, XLenVT,
+                                             MVT::Other, VLImm, VTypeIOp,
+                                             /* Chain */ Node->getOperand(0)));
+          return;
         }
       }
 
diff --git a/llvm/lib/Target/RISCV/RISCVInstrFormatsV.td b/llvm/lib/Target/RISCV/RISCVInstrFormatsV.td
index 147993127e78..80f46b73bfd7 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrFormatsV.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrFormatsV.td
@@ -38,9 +38,11 @@ class RISCVLSUMOP<bits<5> val> {
   bits<5> Value = val;
 }
 def LUMOPUnitStride  : RISCVLSUMOP<0b00000>;
+def LUMOPUnitStrideMask : RISCVLSUMOP<0b01011>;
 def LUMOPUnitStrideWholeReg : RISCVLSUMOP<0b01000>;
 def LUMOPUnitStrideFF: RISCVLSUMOP<0b10000>;
 def SUMOPUnitStride  : RISCVLSUMOP<0b00000>;
+def SUMOPUnitStrideMask : RISCVLSUMOP<0b01011>;
 def SUMOPUnitStrideWholeReg : RISCVLSUMOP<0b01000>;
 
 class RISCVAMOOP<bits<5> val> {
@@ -63,10 +65,23 @@ def LSWidth8     : RISCVWidth<0b0000>;
 def LSWidth16    : RISCVWidth<0b0101>;
 def LSWidth32    : RISCVWidth<0b0110>;
 def LSWidth64    : RISCVWidth<0b0111>;
-def LSWidth128   : RISCVWidth<0b1000>;
-def LSWidth256   : RISCVWidth<0b1101>;
-def LSWidth512   : RISCVWidth<0b1110>;
-def LSWidth1024  : RISCVWidth<0b1111>;
+
+class RVInstSetiVLi<dag outs, dag ins, string opcodestr, string argstr>
+    : RVInst<outs, ins, opcodestr, argstr, [], InstFormatI> {
+  bits<5> uimm;
+  bits<5> rd;
+  bits<10> vtypei;
+
+  let Inst{31} = 1;
+  let Inst{30} = 1;
+  let Inst{29-20} = vtypei{9-0};
+  let Inst{19-15} = uimm;
+  let Inst{14-12} = 0b111;
+  let Inst{11-7} = rd;
+  let Opcode = OPC_OP_V.Value;
+
+  let Defs = [VTYPE, VL];
+}
 
 class RVInstSetVLi<dag outs, dag ins, string opcodestr, string argstr>
     : RVInst<outs, ins, opcodestr, argstr, [], InstFormatI> {
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoV.td b/llvm/lib/Target/RISCV/RISCVInstrInfoV.td
index e02c9f8bcbe2..86fbc73d81d5 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoV.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoV.td
@@ -82,6 +82,12 @@ def simm5_plus1 : Operand<XLenVT>, ImmLeaf<XLenVT,
 //===----------------------------------------------------------------------===//
 
 let hasSideEffects = 0, mayLoad = 1, mayStore = 0 in {
+// load vd, (rs1)
+class VUnitStrideLoadMask<string opcodestr>
+    : RVInstVLU<0b000, LSWidth8.Value{3}, LUMOPUnitStrideMask, LSWidth8.Value{2-0},
+                (outs VR:$vd),
+                (ins GPR:$rs1), opcodestr, "$vd, (${rs1})">;
+
 // load vd, (rs1), vm
 class VUnitStrideLoad<RISCVLSUMOP lumop, RISCVWidth width,
                       string opcodestr>
@@ -137,6 +143,12 @@ class VIndexedSegmentLoad<bits<3> nf, RISCVMOP mop, RISCVWidth width,
 } // hasSideEffects = 0, mayLoad = 1, mayStore = 0
 
 let hasSideEffects = 0, mayLoad = 0, mayStore = 1 in {
+// store vd, vs3, (rs1), vm
+class VUnitStrideStoreMask<string opcodestr>
+    : RVInstVSU<0b000, LSWidth8.Value{3}, SUMOPUnitStrideMask, LSWidth8.Value{2-0},
+                (outs), (ins VR:$vs3, GPR:$rs1), opcodestr,
+                "$vs3, (${rs1})">;
+
 // store vd, vs3, (rs1), vm
 class VUnitStrideStore<RISCVLSUMOP sumop, RISCVWidth width,
                          string opcodestr>
@@ -423,10 +435,6 @@ multiclass VWholeLoad<bits<3> nf, string opcodestr> {
   def E16_V : VWholeLoad<nf, LSWidth16, opcodestr # "e16.v">;
   def E32_V : VWholeLoad<nf, LSWidth32, opcodestr # "e32.v">;
   def E64_V : VWholeLoad<nf, LSWidth64, opcodestr # "e64.v">;
-  def E128_V : VWholeLoad<nf, LSWidth128, opcodestr # "e128.v">;
-  def E256_V : VWholeLoad<nf, LSWidth256, opcodestr # "e256.v">;
-  def E512_V : VWholeLoad<nf, LSWidth512, opcodestr # "e512.v">;
-  def E1024_V : VWholeLoad<nf, LSWidth1024, opcodestr # "e1024.v">;
 }
 
 //===----------------------------------------------------------------------===//
@@ -438,6 +446,9 @@ let hasSideEffects = 1, mayLoad = 0, mayStore = 0 in {
 def VSETVLI : RVInstSetVLi<(outs GPR:$rd), (ins GPR:$rs1, VTypeIOp:$vtypei),
                            "vsetvli", "$rd, $rs1, $vtypei">;
 
+def VSETIVLI : RVInstSetiVLi<(outs GPR:$rd), (ins uimm5:$uimm, VTypeIOp:$vtypei),
+                             "vsetivli", "$rd, $uimm, $vtypei">;
+
 def VSETVL : RVInstSetVL<(outs GPR:$rd), (ins GPR:$rs1, GPR:$rs2),
                          "vsetvl", "$rd, $rs1, $rs2">;
 } // hasSideEffects = 1, mayLoad = 0, mayStore = 0
@@ -447,47 +458,30 @@ def VLE8_V : VUnitStrideLoad<LUMOPUnitStride, LSWidth8, "vle8.v">;
 def VLE16_V : VUnitStrideLoad<LUMOPUnitStride, LSWidth16, "vle16.v">;
 def VLE32_V : VUnitStrideLoad<LUMOPUnitStride, LSWidth32, "vle32.v">;
 def VLE64_V : VUnitStrideLoad<LUMOPUnitStride, LSWidth64, "vle64.v">;
-def VLE128_V : VUnitStrideLoad<LUMOPUnitStride, LSWidth128, "vle128.v">;
-def VLE256_V : VUnitStrideLoad<LUMOPUnitStride, LSWidth256, "vle256.v">;
-def VLE512_V : VUnitStrideLoad<LUMOPUnitStride, LSWidth512, "vle512.v">;
-def VLE1024_V : VUnitStrideLoad<LUMOPUnitStride, LSWidth1024, "vle1024.v">;
 
 def VLE8FF_V : VUnitStrideLoad<LUMOPUnitStrideFF, LSWidth8, "vle8ff.v">;
 def VLE16FF_V : VUnitStrideLoad<LUMOPUnitStrideFF, LSWidth16, "vle16ff.v">;
 def VLE32FF_V : VUnitStrideLoad<LUMOPUnitStrideFF, LSWidth32, "vle32ff.v">;
 def VLE64FF_V : VUnitStrideLoad<LUMOPUnitStrideFF, LSWidth64, "vle64ff.v">;
-def VLE128FF_V : VUnitStrideLoad<LUMOPUnitStrideFF, LSWidth128, "vle128ff.v">;
-def VLE256FF_V : VUnitStrideLoad<LUMOPUnitStrideFF, LSWidth256, "vle256ff.v">;
-def VLE512FF_V : VUnitStrideLoad<LUMOPUnitStrideFF, LSWidth512, "vle512ff.v">;
-def VLE1024FF_V : VUnitStrideLoad<LUMOPUnitStrideFF, LSWidth1024, "vle1024ff.v">;
+
+def VLE1_V : VUnitStrideLoadMask<"vle1.v">;
+def VSE1_V : VUnitStrideStoreMask<"vse1.v">;
 
 def VSE8_V : VUnitStrideStore<SUMOPUnitStride, LSWidth8, "vse8.v">;
 def VSE16_V : VUnitStrideStore<SUMOPUnitStride, LSWidth16, "vse16.v">;
 def VSE32_V : VUnitStrideStore<SUMOPUnitStride, LSWidth32, "vse32.v">;
 def VSE64_V : VUnitStrideStore<SUMOPUnitStride, LSWidth64, "vse64.v">;
-def VSE128_V : VUnitStrideStore<SUMOPUnitStride, LSWidth128, "vse128.v">;
-def VSE256_V : VUnitStrideStore<SUMOPUnitStride, LSWidth256, "vse256.v">;
-def VSE512_V : VUnitStrideStore<SUMOPUnitStride, LSWidth512, "vse512.v">;
-def VSE1024_V : VUnitStrideStore<SUMOPUnitStride, LSWidth1024, "vse1024.v">;
 
 // Vector Strided Instructions
 def VLSE8_V : VStridedLoad<LSWidth8, "vlse8.v">;
 def VLSE16_V : VStridedLoad<LSWidth16, "vlse16.v">;
 def VLSE32_V : VStridedLoad<LSWidth32, "vlse32.v">;
 def VLSE64_V : VStridedLoad<LSWidth64, "vlse64.v">;
-def VLSE128_V : VStridedLoad<LSWidth128, "vlse128.v">;
-def VLSE256_V : VStridedLoad<LSWidth256, "vlse256.v">;
-def VLSE512_V : VStridedLoad<LSWidth512, "vlse512.v">;
-def VLSE1024_V : VStridedLoad<LSWidth1024, "vlse1024.v">;
 
 def VSSE8_V : VStridedStore<LSWidth8, "vsse8.v">;
 def VSSE16_V : VStridedStore<LSWidth16, "vsse16.v">;
 def VSSE32_V : VStridedStore<LSWidth32, "vsse32.v">;
 def VSSE64_V : VStridedStore<LSWidth64, "vsse64.v">;
-def VSSE128_V : VStridedStore<LSWidth128, "vsse128.v">;
-def VSSE256_V : VStridedStore<LSWidth256, "vsse256.v">;
-def VSSE512_V : VStridedStore<LSWidth512, "vsse512.v">;
-def VSSE1024_V : VStridedStore<LSWidth1024, "vsse1024.v">;
 
 // Vector Indexed Instructions
 def VLUXEI8_V : VIndexedLoad<MOPLDIndexedUnord, LSWidth8, "vluxei8.v">;
@@ -806,8 +800,8 @@ defm VFWNMSAC_V : VALUr_FV_V_F<"vfwnmsac", 0b111111>;
 
 // Vector Floating-Point Square-Root Instruction
 defm VFSQRT_V : VALU_FV_VS2<"vfsqrt.v", 0b010011, 0b00000>;
-defm VFRSQRTE7_V : VALU_FV_VS2<"vfrsqrte7.v", 0b010011, 0b00100>;
-defm VFRECE7_V : VALU_FV_VS2<"vfrece7.v", 0b010011, 0b00101>;
+defm VFRSQRT7_V : VALU_FV_VS2<"vfrsqrt7.v", 0b010011, 0b00100>;
+defm VFREC7_V : VALU_FV_VS2<"vfrec7.v", 0b010011, 0b00101>;
 
 // Vector Floating-Point MIN/MAX Instructions
 defm VFMIN_V : VALU_FV_V_F<"vfmin", 0b000100>;
@@ -1058,47 +1052,27 @@ let Predicates = [HasStdExtZvlsseg] in {
     def VLSEG#nf#E16_V : VUnitStrideSegmentLoad<!add(nf, -1), LUMOPUnitStride, LSWidth16, "vlseg"#nf#"e16.v">;
     def VLSEG#nf#E32_V : VUnitStrideSegmentLoad<!add(nf, -1), LUMOPUnitStride, LSWidth32, "vlseg"#nf#"e32.v">;
     def VLSEG#nf#E64_V : VUnitStrideSegmentLoad<!add(nf, -1), LUMOPUnitStride, LSWidth64, "vlseg"#nf#"e64.v">;
-    def VLSEG#nf#E128_V : VUnitStrideSegmentLoad<!add(nf, -1), LUMOPUnitStride, LSWidth128, "vlseg"#nf#"e128.v">;
-    def VLSEG#nf#E256_V : VUnitStrideSegmentLoad<!add(nf, -1), LUMOPUnitStride, LSWidth256, "vlseg"#nf#"e256.v">;
-    def VLSEG#nf#E512_V : VUnitStrideSegmentLoad<!add(nf, -1), LUMOPUnitStride, LSWidth512, "vlseg"#nf#"e512.v">;
-    def VLSEG#nf#E1024_V : VUnitStrideSegmentLoad<!add(nf, -1), LUMOPUnitStride, LSWidth1024, "vlseg"#nf#"e1024.v">;
 
     def VLSEG#nf#E8FF_V : VUnitStrideSegmentLoad<!add(nf, -1), LUMOPUnitStrideFF, LSWidth8, "vlseg"#nf#"e8ff.v">;
     def VLSEG#nf#E16FF_V : VUnitStrideSegmentLoad<!add(nf, -1), LUMOPUnitStrideFF, LSWidth16, "vlseg"#nf#"e16ff.v">;
     def VLSEG#nf#E32FF_V : VUnitStrideSegmentLoad<!add(nf, -1), LUMOPUnitStrideFF, LSWidth32, "vlseg"#nf#"e32ff.v">;
     def VLSEG#nf#E64FF_V : VUnitStrideSegmentLoad<!add(nf, -1), LUMOPUnitStrideFF, LSWidth64, "vlseg"#nf#"e64ff.v">;
-    def VLSEG#nf#E128FF_V : VUnitStrideSegmentLoad<!add(nf, -1), LUMOPUnitStrideFF, LSWidth128, "vlseg"#nf#"e128ff.v">;
-    def VLSEG#nf#E256FF_V : VUnitStrideSegmentLoad<!add(nf, -1), LUMOPUnitStrideFF, LSWidth256, "vlseg"#nf#"e256ff.v">;
-    def VLSEG#nf#E512FF_V : VUnitStrideSegmentLoad<!add(nf, -1), LUMOPUnitStrideFF, LSWidth512, "vlseg"#nf#"e512ff.v">;
-    def VLSEG#nf#E1024FF_V : VUnitStrideSegmentLoad<!add(nf, -1), LUMOPUnitStrideFF, LSWidth1024, "vlseg"#nf#"e1024ff.v">;
 
     def VSSEG#nf#E8_V : VUnitStrideSegmentStore<!add(nf, -1), LSWidth8, "vsseg"#nf#"e8.v">;
     def VSSEG#nf#E16_V : VUnitStrideSegmentStore<!add(nf, -1), LSWidth16, "vsseg"#nf#"e16.v">;
     def VSSEG#nf#E32_V : VUnitStrideSegmentStore<!add(nf, -1), LSWidth32, "vsseg"#nf#"e32.v">;
     def VSSEG#nf#E64_V : VUnitStrideSegmentStore<!add(nf, -1), LSWidth64, "vsseg"#nf#"e64.v">;
-    def VSSEG#nf#E128_V : VUnitStrideSegmentStore<!add(nf, -1), LSWidth128, "vsseg"#nf#"e128.v">;
-    def VSSEG#nf#E256_V : VUnitStrideSegmentStore<!add(nf, -1), LSWidth256, "vsseg"#nf#"e256.v">;
-    def VSSEG#nf#E512_V : VUnitStrideSegmentStore<!add(nf, -1), LSWidth512, "vsseg"#nf#"e512.v">;
-    def VSSEG#nf#E1024_V : VUnitStrideSegmentStore<!add(nf, -1), LSWidth1024, "vsseg"#nf#"e1024.v">;
 
     // Vector Strided Instructions
     def VLSSEG#nf#E8_V : VStridedSegmentLoad<!add(nf, -1), LSWidth8, "vlsseg"#nf#"e8.v">;
     def VLSSEG#nf#E16_V : VStridedSegmentLoad<!add(nf, -1), LSWidth16, "vlsseg"#nf#"e16.v">;
     def VLSSEG#nf#E32_V : VStridedSegmentLoad<!add(nf, -1), LSWidth32, "vlsseg"#nf#"e32.v">;
     def VLSSEG#nf#E64_V : VStridedSegmentLoad<!add(nf, -1), LSWidth64, "vlsseg"#nf#"e64.v">;
-    def VLSSEG#nf#E128_V : VStridedSegmentLoad<!add(nf, -1), LSWidth128, "vlsseg"#nf#"e128.v">;
-    def VLSSEG#nf#E256_V : VStridedSegmentLoad<!add(nf, -1), LSWidth256, "vlsseg"#nf#"e256.v">;
-    def VLSSEG#nf#E512_V : VStridedSegmentLoad<!add(nf, -1), LSWidth512, "vlsseg"#nf#"e512.v">;
-    def VLSSEG#nf#E1024_V : VStridedSegmentLoad<!add(nf, -1), LSWidth1024, "vlsseg"#nf#"e1024.v">;
 
     def VSSSEG#nf#E8_V : VStridedSegmentStore<!add(nf, -1), LSWidth8, "vssseg"#nf#"e8.v">;
     def VSSSEG#nf#E16_V : VStridedSegmentStore<!add(nf, -1), LSWidth16, "vssseg"#nf#"e16.v">;
     def VSSSEG#nf#E32_V : VStridedSegmentStore<!add(nf, -1), LSWidth32, "vssseg"#nf#"e32.v">;
     def VSSSEG#nf#E64_V : VStridedSegmentStore<!add(nf, -1), LSWidth64, "vssseg"#nf#"e64.v">;
-    def VSSSEG#nf#E128_V : VStridedSegmentStore<!add(nf, -1), LSWidth128, "vssseg"#nf#"e128.v">;
-    def VSSSEG#nf#E256_V : VStridedSegmentStore<!add(nf, -1), LSWidth256, "vssseg"#nf#"e256.v">;
-    def VSSSEG#nf#E512_V : VStridedSegmentStore<!add(nf, -1), LSWidth512, "vssseg"#nf#"e512.v">;
-    def VSSSEG#nf#E1024_V : VStridedSegmentStore<!add(nf, -1), LSWidth1024, "vssseg"#nf#"e1024.v">;
 
     // Vector Indexed Instructions
     def VLUXSEG#nf#EI8_V : VIndexedSegmentLoad<!add(nf, -1), MOPLDIndexedUnord,
@@ -1109,14 +1083,6 @@ let Predicates = [HasStdExtZvlsseg] in {
                               LSWidth32, "vluxseg"#nf#"ei32.v">;
     def VLUXSEG#nf#EI64_V : VIndexedSegmentLoad<!add(nf, -1), MOPLDIndexedUnord,
                               LSWidth64, "vluxseg"#nf#"ei64.v">;
-    def VLUXSEG#nf#EI128_V : VIndexedSegmentLoad<!add(nf, -1), MOPLDIndexedUnord,
-                               LSWidth128, "vluxseg"#nf#"ei128.v">;
-    def VLUXSEG#nf#EI256_V : VIndexedSegmentLoad<!add(nf, -1), MOPLDIndexedUnord,
-                               LSWidth256, "vluxseg"#nf#"ei256.v">;
-    def VLUXSEG#nf#EI512_V : VIndexedSegmentLoad<!add(nf, -1), MOPLDIndexedUnord,
-                               LSWidth512, "vluxseg"#nf#"ei512.v">;
-    def VLUXSEG#nf#EI1024_V : VIndexedSegmentLoad<!add(nf, -1), MOPLDIndexedUnord,
-                               LSWidth1024, "vluxseg"#nf#"ei1024.v">;
 
     def VLOXSEG#nf#EI8_V : VIndexedSegmentLoad<!add(nf, -1), MOPLDIndexedOrder,
                              LSWidth8, "vloxseg"#nf#"ei8.v">;
@@ -1126,14 +1092,6 @@ let Predicates = [HasStdExtZvlsseg] in {
                               LSWidth32, "vloxseg"#nf#"ei32.v">;
     def VLOXSEG#nf#EI64_V : VIndexedSegmentLoad<!add(nf, -1), MOPLDIndexedOrder,
                               LSWidth64, "vloxseg"#nf#"ei64.v">;
-    def VLOXSEG#nf#EI128_V : VIndexedSegmentLoad<!add(nf, -1), MOPLDIndexedOrder,
-                               LSWidth128, "vloxseg"#nf#"ei128.v">;
-    def VLOXSEG#nf#EI256_V : VIndexedSegmentLoad<!add(nf, -1), MOPLDIndexedOrder,
-                               LSWidth256, "vloxseg"#nf#"ei256.v">;
-    def VLOXSEG#nf#EI512_V : VIndexedSegmentLoad<!add(nf, -1), MOPLDIndexedOrder,
-                               LSWidth512, "vloxseg"#nf#"ei512.v">;
-    def VLOXSEG#nf#EI1024_V : VIndexedSegmentLoad<!add(nf, -1), MOPLDIndexedOrder,
-                               LSWidth1024, "vloxseg"#nf#"ei1024.v">;
 
     def VSUXSEG#nf#EI8_V : VIndexedSegmentStore<!add(nf, -1), MOPSTIndexedUnord,
                              LSWidth8, "vsuxseg"#nf#"ei8.v">;
@@ -1143,14 +1101,6 @@ let Predicates = [HasStdExtZvlsseg] in {
                               LSWidth32, "vsuxseg"#nf#"ei32.v">;
     def VSUXSEG#nf#EI64_V : VIndexedSegmentStore<!add(nf, -1), MOPSTIndexedUnord,
                               LSWidth64, "vsuxseg"#nf#"ei64.v">;
-    def VSUXSEG#nf#EI128_V : VIndexedSegmentStore<!add(nf, -1), MOPSTIndexedUnord,
-                               LSWidth128, "vsuxseg"#nf#"ei128.v">;
-    def VSUXSEG#nf#EI256_V : VIndexedSegmentStore<!add(nf, -1), MOPSTIndexedUnord,
-                               LSWidth256, "vsuxseg"#nf#"ei256.v">;
-    def VSUXSEG#nf#EI512_V : VIndexedSegmentStore<!add(nf, -1), MOPSTIndexedUnord,
-                               LSWidth512, "vsuxseg"#nf#"ei512.v">;
-    def VSUXSEG#nf#EI1024_V : VIndexedSegmentStore<!add(nf, -1), MOPSTIndexedUnord,
-                                LSWidth1024, "vsuxseg"#nf#"ei1024.v">;
 
     def VSOXSEG#nf#EI8_V : VIndexedSegmentStore<!add(nf, -1), MOPSTIndexedOrder,
                              LSWidth8, "vsoxseg"#nf#"ei8.v">;
@@ -1160,14 +1110,6 @@ let Predicates = [HasStdExtZvlsseg] in {
                               LSWidth32, "vsoxseg"#nf#"ei32.v">;
     def VSOXSEG#nf#EI64_V : VIndexedSegmentStore<!add(nf, -1), MOPSTIndexedOrder,
                               LSWidth64, "vsoxseg"#nf#"ei64.v">;
-    def VSOXSEG#nf#EI128_V : VIndexedSegmentStore<!add(nf, -1), MOPSTIndexedOrder,
-                               LSWidth128, "vsoxseg"#nf#"ei128.v">;
-    def VSOXSEG#nf#EI256_V : VIndexedSegmentStore<!add(nf, -1), MOPSTIndexedOrder,
-                               LSWidth256, "vsoxseg"#nf#"ei256.v">;
-    def VSOXSEG#nf#EI512_V : VIndexedSegmentStore<!add(nf, -1), MOPSTIndexedOrder,
-                               LSWidth512, "vsoxseg"#nf#"ei512.v">;
-    def VSOXSEG#nf#EI1024_V : VIndexedSegmentStore<!add(nf, -1), MOPSTIndexedOrder,
-                                LSWidth1024, "vsoxseg"#nf#"ei1024.v">;
   }
 } // Predicates = [HasStdExtZvlsseg]
 
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
index fa17f2d87eff..60bd1b24cab8 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
@@ -1218,6 +1218,14 @@ multiclass VPseudoUSLoad {
   }
 }
 
+multiclass VPseudoLoadMask {
+  foreach mti = AllMasks in {
+    let VLMul = mti.LMul.value in {
+      def "_V_" # mti.BX : VPseudoUSLoadNoMask<VR>;
+    }
+  }
+}
+
 multiclass VPseudoSLoad {
   foreach lmul = MxList.m in {
     defvar LInfo = lmul.MX;
@@ -1254,6 +1262,14 @@ multiclass VPseudoUSStore {
   }
 }
 
+multiclass VPseudoStoreMask {
+  foreach mti = AllMasks in {
+    let VLMul = mti.LMul.value in {
+      def "_V_" # mti.BX : VPseudoUSStoreNoMask<VR>;
+    }
+  }
+}
+
 multiclass VPseudoSStore {
   foreach lmul = MxList.m in {
     defvar LInfo = lmul.MX;
@@ -3115,7 +3131,7 @@ def PseudoReadVL : Pseudo<(outs GPR:$rd), (ins),
 // Pseudos.
 let hasSideEffects = 1, mayLoad = 0, mayStore = 0, Defs = [VL, VTYPE] in {
 def PseudoVSETVLI : Pseudo<(outs GPR:$rd), (ins GPR:$rs1, VTypeIOp:$vtypei), []>;
-
+def PseudoVSETIVLI : Pseudo<(outs GPR:$rd), (ins uimm5:$rs1, VTypeIOp:$vtypei), []>;
 }
 
 //===----------------------------------------------------------------------===//
@@ -3132,6 +3148,9 @@ foreach eew = EEWList in {
   defm PseudoVSE # eew : VPseudoUSStore;
 }
 
+defm PseudoVLE1 : VPseudoLoadMask;
+defm PseudoVSE1 : VPseudoStoreMask;
+
 //===----------------------------------------------------------------------===//
 // 7.5 Vector Strided Instructions
 //===----------------------------------------------------------------------===//
@@ -3427,12 +3446,12 @@ defm PseudoVFSQRT      : VPseudoUnaryV_V;
 //===----------------------------------------------------------------------===//
 // 14.9. Vector Floating-Point Reciprocal Square-Root Estimate Instruction
 //===----------------------------------------------------------------------===//
-defm PseudoVFRSQRTE7   : VPseudoUnaryV_V;
+defm PseudoVFRSQRT7    : VPseudoUnaryV_V;
 
 //===----------------------------------------------------------------------===//
 // 14.10. Vector Floating-Point Reciprocal Estimate Instruction
 //===----------------------------------------------------------------------===//
-defm PseudoVFRECE7     : VPseudoUnaryV_V;
+defm PseudoVFREC7      : VPseudoUnaryV_V;
 
 //===----------------------------------------------------------------------===//
 // 14.11. Vector Floating-Point Min/Max Instructions
@@ -3709,6 +3728,15 @@ foreach vti = AllVectors in
                      vti.Vector, vti.Mask, vti.SEW, vti.LMul, vti.RegClass>;
 }
 
+foreach vti = AllMasks in {
+  defvar PseudoVLE1 = !cast<Instruction>("PseudoVLE1_V_"#vti.BX);
+  def : Pat<(vti.Mask (int_riscv_vle1 GPR:$rs1, (XLenVT (VLOp GPR:$vl)))),
+            (PseudoVLE1 $rs1, GPR:$vl, vti.SEW)>;
+  defvar PseudoVSE1 = !cast<Instruction>("PseudoVSE1_V_"#vti.BX);
+  def : Pat<(int_riscv_vse1 (vti.Mask VR:$rs3), GPR:$rs1, (XLenVT (VLOp GPR:$vl))),
+            (PseudoVSE1 $rs3, $rs1, GPR:$vl, vti.SEW)>;
+}
+
 //===----------------------------------------------------------------------===//
 // 7.5 Vector Strided Instructions
 //===----------------------------------------------------------------------===//
@@ -4100,12 +4128,12 @@ defm "" : VPatUnaryV_V<"int_riscv_vfsqrt", "PseudoVFSQRT", AllFloatVectors>;
 //===----------------------------------------------------------------------===//
 // 14.9. Vector Floating-Point Reciprocal Square-Root Estimate Instruction
 //===----------------------------------------------------------------------===//
-defm "" : VPatUnaryV_V<"int_riscv_vfrsqrte7", "PseudoVFRSQRTE7", AllFloatVectors>;
+defm "" : VPatUnaryV_V<"int_riscv_vfrsqrt7", "PseudoVFRSQRT7", AllFloatVectors>;
 
 //===----------------------------------------------------------------------===//
 // 14.10. Vector Floating-Point Reciprocal Estimate Instruction
 //===----------------------------------------------------------------------===//
-defm "" : VPatUnaryV_V<"int_riscv_vfrece7", "PseudoVFRECE7", AllFloatVectors>;
+defm "" : VPatUnaryV_V<"int_riscv_vfrec7", "PseudoVFREC7", AllFloatVectors>;
 
 //===----------------------------------------------------------------------===//
 // 14.11. Vector Floating-Point Min/Max Instructions
diff --git a/llvm/test/CodeGen/RISCV/rvv/cleanup-vsetivli.mir b/llvm/test/CodeGen/RISCV/rvv/cleanup-vsetivli.mir
new file mode 100644
index 000000000000..ed8bc5698062
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/cleanup-vsetivli.mir
@@ -0,0 +1,46 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc %s -mtriple=riscv64 -run-pass=riscv-cleanup-vsetvli -o - | FileCheck %s
+
+# Make sure we don't combine these VSET{I}VLIs in the cleanup pass. We could not
+# differentiate AVL values if the opcode of the previous one is different from
+# current one.
+
+--- |
+  ; ModuleID = '../llvm/test/CodeGen/RISCV/rvv/add-vsetivli.ll'
+  source_filename = "../llvm/test/CodeGen/RISCV/rvv/add-vsetivli.ll"
+  target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n64-S128"
+  target triple = "riscv64"
+
+  define void @cleanup_vsetivli() #0 {
+    ret void
+  }
+
+  attributes #0 = { "target-features"="+experimental-v" }
+
+...
+---
+name:            cleanup_vsetivli
+alignment:       4
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: gpr }
+frameInfo:
+  maxAlignment:    1
+machineFunctionInfo: {}
+body:             |
+  bb.0 (%ir-block.0):
+    ; CHECK-LABEL: name: cleanup_vsetivli
+    ; CHECK: dead %0:gpr = PseudoVSETVLI $x0, 12, implicit-def $vl, implicit-def $vtype
+    ; CHECK: dead %1:gpr = PseudoVSETIVLI 5, 12, implicit-def $vl, implicit-def $vtype
+    ; CHECK: dead %3:gpr = PseudoVSETVLI $x0, 12, implicit-def $vl, implicit-def $vtype
+    ; CHECK: dead %5:gpr = PseudoVSETIVLI 5, 12, implicit-def $vl, implicit-def $vtype
+    ; CHECK: PseudoRET
+    dead %0:gpr  = PseudoVSETVLI $x0, 12, implicit-def $vl, implicit-def $vtype
+    dead %1:gpr  = PseudoVSETIVLI 5, 12, implicit-def $vl, implicit-def $vtype
+    dead %2:gpr  = PseudoVSETIVLI 5, 12, implicit-def $vl, implicit-def $vtype
+    dead %3:gpr  = PseudoVSETVLI $x0, 12, implicit-def $vl, implicit-def $vtype
+    dead %4:gpr  = PseudoVSETVLI $x0, 12, implicit-def $vl, implicit-def $vtype
+    dead %5:gpr  = PseudoVSETIVLI 5, 12, implicit-def $vl, implicit-def $vtype
+    PseudoRET
+
+...
diff --git a/llvm/test/CodeGen/RISCV/rvv/rv32-vsetvli-intrinsics.ll b/llvm/test/CodeGen/RISCV/rvv/rv32-vsetvli-intrinsics.ll
index d724d0df9692..5e97df06470c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/rv32-vsetvli-intrinsics.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/rv32-vsetvli-intrinsics.ll
@@ -16,8 +16,7 @@ define void @test_vsetvli_e64mf8(i32 %avl) nounwind {
 define void @test_vsetvli_e8mf2_zero_avl() nounwind {
 ; CHECK-LABEL: test_vsetvli_e8mf2_zero_avl:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    mv a0, zero
-; CHECK-NEXT:    vsetvli a0, a0, e8,mf2,ta,mu
+; CHECK-NEXT:    vsetivli a0, 0, e8,mf2,ta,mu
 ; CHECK-NEXT:    ret
   call i32 @llvm.riscv.vsetvli.i32(i32 0, i32 0, i32 7)
   ret void
diff --git a/llvm/test/CodeGen/RISCV/rvv/rv64-vsetvli-intrinsics.ll b/llvm/test/CodeGen/RISCV/rvv/rv64-vsetvli-intrinsics.ll
index c28058c16efb..78d1008ce28b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/rv64-vsetvli-intrinsics.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/rv64-vsetvli-intrinsics.ll
@@ -25,8 +25,7 @@ define void @test_vsetvli_e16mf4(i64 %avl) nounwind {
 define void @test_vsetvli_e32mf8_zero_avl() nounwind {
 ; CHECK-LABEL: test_vsetvli_e32mf8_zero_avl:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    mv a0, zero
-; CHECK-NEXT:    vsetvli a0, a0, e16,mf4,ta,mu
+; CHECK-NEXT:    vsetivli a0, 0, e16,mf4,ta,mu
 ; CHECK-NEXT:    ret
   call i64 @llvm.riscv.vsetvli.i64(i64 0, i64 1, i64 6)
   ret void
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfrec7-rv32.ll b/llvm/test/CodeGen/RISCV/rvv/vfrec7-rv32.ll
new file mode 100644
index 000000000000..244903fadb32
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/vfrec7-rv32.ll
@@ -0,0 +1,602 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv32 -mattr=+experimental-v,+d,+experimental-zfh -verify-machineinstrs \
+; RUN:   --riscv-no-aliases < %s | FileCheck %s
+declare <vscale x 1 x half> @llvm.riscv.vfrec7.nxv1f16(
+  <vscale x 1 x half>,
+  i32);
+
+define <vscale x 1 x half> @intrinsic_vfrec7_v_nxv1f16_nxv1f16(<vscale x 1 x half> %0, i32 %1) nounwind {
+; CHECK-LABEL: intrinsic_vfrec7_v_nxv1f16_nxv1f16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, a0, e16,mf4,ta,mu
+; CHECK-NEXT:    vfrec7.v v8, v8
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 1 x half> @llvm.riscv.vfrec7.nxv1f16(
+    <vscale x 1 x half> %0,
+    i32 %1)
+
+  ret <vscale x 1 x half> %a
+}
+
+declare <vscale x 1 x half> @llvm.riscv.vfrec7.mask.nxv1f16(
+  <vscale x 1 x half>,
+  <vscale x 1 x half>,
+  <vscale x 1 x i1>,
+  i32);
+
+define <vscale x 1 x half> @intrinsic_vfrec7_mask_v_nxv1f16_nxv1f16(<vscale x 1 x i1> %0, <vscale x 1 x half> %1, <vscale x 1 x half> %2, i32 %3) nounwind {
+; CHECK-LABEL: intrinsic_vfrec7_mask_v_nxv1f16_nxv1f16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, a0, e16,mf4,tu,mu
+; CHECK-NEXT:    vfrec7.v v8, v9, v0.t
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 1 x half> @llvm.riscv.vfrec7.mask.nxv1f16(
+    <vscale x 1 x half> %1,
+    <vscale x 1 x half> %2,
+    <vscale x 1 x i1> %0,
+    i32 %3)
+
+  ret <vscale x 1 x half> %a
+}
+
+declare <vscale x 2 x half> @llvm.riscv.vfrec7.nxv2f16(
+  <vscale x 2 x half>,
+  i32);
+
+define <vscale x 2 x half> @intrinsic_vfrec7_v_nxv2f16_nxv2f16(<vscale x 2 x half> %0, i32 %1) nounwind {
+; CHECK-LABEL: intrinsic_vfrec7_v_nxv2f16_nxv2f16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, a0, e16,mf2,ta,mu
+; CHECK-NEXT:    vfrec7.v v8, v8
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 2 x half> @llvm.riscv.vfrec7.nxv2f16(
+    <vscale x 2 x half> %0,
+    i32 %1)
+
+  ret <vscale x 2 x half> %a
+}
+
+declare <vscale x 2 x half> @llvm.riscv.vfrec7.mask.nxv2f16(
+  <vscale x 2 x half>,
+  <vscale x 2 x half>,
+  <vscale x 2 x i1>,
+  i32);
+
+define <vscale x 2 x half> @intrinsic_vfrec7_mask_v_nxv2f16_nxv2f16(<vscale x 2 x i1> %0, <vscale x 2 x half> %1, <vscale x 2 x half> %2, i32 %3) nounwind {
+; CHECK-LABEL: intrinsic_vfrec7_mask_v_nxv2f16_nxv2f16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, a0, e16,mf2,tu,mu
+; CHECK-NEXT:    vfrec7.v v8, v9, v0.t
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 2 x half> @llvm.riscv.vfrec7.mask.nxv2f16(
+    <vscale x 2 x half> %1,
+    <vscale x 2 x half> %2,
+    <vscale x 2 x i1> %0,
+    i32 %3)
+
+  ret <vscale x 2 x half> %a
+}
+
+declare <vscale x 4 x half> @llvm.riscv.vfrec7.nxv4f16(
+  <vscale x 4 x half>,
+  i32);
+
+define <vscale x 4 x half> @intrinsic_vfrec7_v_nxv4f16_nxv4f16(<vscale x 4 x half> %0, i32 %1) nounwind {
+; CHECK-LABEL: intrinsic_vfrec7_v_nxv4f16_nxv4f16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, a0, e16,m1,ta,mu
+; CHECK-NEXT:    vfrec7.v v8, v8
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 4 x half> @llvm.riscv.vfrec7.nxv4f16(
+    <vscale x 4 x half> %0,
+    i32 %1)
+
+  ret <vscale x 4 x half> %a
+}
+
+declare <vscale x 4 x half> @llvm.riscv.vfrec7.mask.nxv4f16(
+  <vscale x 4 x half>,
+  <vscale x 4 x half>,
+  <vscale x 4 x i1>,
+  i32);
+
+define <vscale x 4 x half> @intrinsic_vfrec7_mask_v_nxv4f16_nxv4f16(<vscale x 4 x i1> %0, <vscale x 4 x half> %1, <vscale x 4 x half> %2, i32 %3) nounwind {
+; CHECK-LABEL: intrinsic_vfrec7_mask_v_nxv4f16_nxv4f16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, a0, e16,m1,tu,mu
+; CHECK-NEXT:    vfrec7.v v8, v9, v0.t
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 4 x half> @llvm.riscv.vfrec7.mask.nxv4f16(
+    <vscale x 4 x half> %1,
+    <vscale x 4 x half> %2,
+    <vscale x 4 x i1> %0,
+    i32 %3)
+
+  ret <vscale x 4 x half> %a
+}
+
+declare <vscale x 8 x half> @llvm.riscv.vfrec7.nxv8f16(
+  <vscale x 8 x half>,
+  i32);
+
+define <vscale x 8 x half> @intrinsic_vfrec7_v_nxv8f16_nxv8f16(<vscale x 8 x half> %0, i32 %1) nounwind {
+; CHECK-LABEL: intrinsic_vfrec7_v_nxv8f16_nxv8f16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, a0, e16,m2,ta,mu
+; CHECK-NEXT:    vfrec7.v v8, v8
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 8 x half> @llvm.riscv.vfrec7.nxv8f16(
+    <vscale x 8 x half> %0,
+    i32 %1)
+
+  ret <vscale x 8 x half> %a
+}
+
+declare <vscale x 8 x half> @llvm.riscv.vfrec7.mask.nxv8f16(
+  <vscale x 8 x half>,
+  <vscale x 8 x half>,
+  <vscale x 8 x i1>,
+  i32);
+
+define <vscale x 8 x half> @intrinsic_vfrec7_mask_v_nxv8f16_nxv8f16(<vscale x 8 x i1> %0, <vscale x 8 x half> %1, <vscale x 8 x half> %2, i32 %3) nounwind {
+; CHECK-LABEL: intrinsic_vfrec7_mask_v_nxv8f16_nxv8f16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, a0, e16,m2,tu,mu
+; CHECK-NEXT:    vfrec7.v v8, v10, v0.t
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 8 x half> @llvm.riscv.vfrec7.mask.nxv8f16(
+    <vscale x 8 x half> %1,
+    <vscale x 8 x half> %2,
+    <vscale x 8 x i1> %0,
+    i32 %3)
+
+  ret <vscale x 8 x half> %a
+}
+
+declare <vscale x 16 x half> @llvm.riscv.vfrec7.nxv16f16(
+  <vscale x 16 x half>,
+  i32);
+
+define <vscale x 16 x half> @intrinsic_vfrec7_v_nxv16f16_nxv16f16(<vscale x 16 x half> %0, i32 %1) nounwind {
+; CHECK-LABEL: intrinsic_vfrec7_v_nxv16f16_nxv16f16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, a0, e16,m4,ta,mu
+; CHECK-NEXT:    vfrec7.v v8, v8
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 16 x half> @llvm.riscv.vfrec7.nxv16f16(
+    <vscale x 16 x half> %0,
+    i32 %1)
+
+  ret <vscale x 16 x half> %a
+}
+
+declare <vscale x 16 x half> @llvm.riscv.vfrec7.mask.nxv16f16(
+  <vscale x 16 x half>,
+  <vscale x 16 x half>,
+  <vscale x 16 x i1>,
+  i32);
+
+define <vscale x 16 x half> @intrinsic_vfrec7_mask_v_nxv16f16_nxv16f16(<vscale x 16 x i1> %0, <vscale x 16 x half> %1, <vscale x 16 x half> %2, i32 %3) nounwind {
+; CHECK-LABEL: intrinsic_vfrec7_mask_v_nxv16f16_nxv16f16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, a0, e16,m4,tu,mu
+; CHECK-NEXT:    vfrec7.v v8, v12, v0.t
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 16 x half> @llvm.riscv.vfrec7.mask.nxv16f16(
+    <vscale x 16 x half> %1,
+    <vscale x 16 x half> %2,
+    <vscale x 16 x i1> %0,
+    i32 %3)
+
+  ret <vscale x 16 x half> %a
+}
+
+declare <vscale x 32 x half> @llvm.riscv.vfrec7.nxv32f16(
+  <vscale x 32 x half>,
+  i32);
+
+define <vscale x 32 x half> @intrinsic_vfrec7_v_nxv32f16_nxv32f16(<vscale x 32 x half> %0, i32 %1) nounwind {
+; CHECK-LABEL: intrinsic_vfrec7_v_nxv32f16_nxv32f16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, a0, e16,m8,ta,mu
+; CHECK-NEXT:    vfrec7.v v8, v8
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 32 x half> @llvm.riscv.vfrec7.nxv32f16(
+    <vscale x 32 x half> %0,
+    i32 %1)
+
+  ret <vscale x 32 x half> %a
+}
+
+declare <vscale x 32 x half> @llvm.riscv.vfrec7.mask.nxv32f16(
+  <vscale x 32 x half>,
+  <vscale x 32 x half>,
+  <vscale x 32 x i1>,
+  i32);
+
+define <vscale x 32 x half> @intrinsic_vfrec7_mask_v_nxv32f16_nxv32f16(<vscale x 32 x i1> %0, <vscale x 32 x half> %1, <vscale x 32 x half> %2, i32 %3) nounwind {
+; CHECK-LABEL: intrinsic_vfrec7_mask_v_nxv32f16_nxv32f16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, a0, e16,m8,tu,mu
+; CHECK-NEXT:    vfrec7.v v8, v16, v0.t
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 32 x half> @llvm.riscv.vfrec7.mask.nxv32f16(
+    <vscale x 32 x half> %1,
+    <vscale x 32 x half> %2,
+    <vscale x 32 x i1> %0,
+    i32 %3)
+
+  ret <vscale x 32 x half> %a
+}
+
+declare <vscale x 1 x float> @llvm.riscv.vfrec7.nxv1f32(
+  <vscale x 1 x float>,
+  i32);
+
+define <vscale x 1 x float> @intrinsic_vfrec7_v_nxv1f32_nxv1f32(<vscale x 1 x float> %0, i32 %1) nounwind {
+; CHECK-LABEL: intrinsic_vfrec7_v_nxv1f32_nxv1f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, a0, e32,mf2,ta,mu
+; CHECK-NEXT:    vfrec7.v v8, v8
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 1 x float> @llvm.riscv.vfrec7.nxv1f32(
+    <vscale x 1 x float> %0,
+    i32 %1)
+
+  ret <vscale x 1 x float> %a
+}
+
+declare <vscale x 1 x float> @llvm.riscv.vfrec7.mask.nxv1f32(
+  <vscale x 1 x float>,
+  <vscale x 1 x float>,
+  <vscale x 1 x i1>,
+  i32);
+
+define <vscale x 1 x float> @intrinsic_vfrec7_mask_v_nxv1f32_nxv1f32(<vscale x 1 x i1> %0, <vscale x 1 x float> %1, <vscale x 1 x float> %2, i32 %3) nounwind {
+; CHECK-LABEL: intrinsic_vfrec7_mask_v_nxv1f32_nxv1f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, a0, e32,mf2,tu,mu
+; CHECK-NEXT:    vfrec7.v v8, v9, v0.t
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 1 x float> @llvm.riscv.vfrec7.mask.nxv1f32(
+    <vscale x 1 x float> %1,
+    <vscale x 1 x float> %2,
+    <vscale x 1 x i1> %0,
+    i32 %3)
+
+  ret <vscale x 1 x float> %a
+}
+
+declare <vscale x 2 x float> @llvm.riscv.vfrec7.nxv2f32(
+  <vscale x 2 x float>,
+  i32);
+
+define <vscale x 2 x float> @intrinsic_vfrec7_v_nxv2f32_nxv2f32(<vscale x 2 x float> %0, i32 %1) nounwind {
+; CHECK-LABEL: intrinsic_vfrec7_v_nxv2f32_nxv2f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, a0, e32,m1,ta,mu
+; CHECK-NEXT:    vfrec7.v v8, v8
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 2 x float> @llvm.riscv.vfrec7.nxv2f32(
+    <vscale x 2 x float> %0,
+    i32 %1)
+
+  ret <vscale x 2 x float> %a
+}
+
+declare <vscale x 2 x float> @llvm.riscv.vfrec7.mask.nxv2f32(
+  <vscale x 2 x float>,
+  <vscale x 2 x float>,
+  <vscale x 2 x i1>,
+  i32);
+
+define <vscale x 2 x float> @intrinsic_vfrec7_mask_v_nxv2f32_nxv2f32(<vscale x 2 x i1> %0, <vscale x 2 x float> %1, <vscale x 2 x float> %2, i32 %3) nounwind {
+; CHECK-LABEL: intrinsic_vfrec7_mask_v_nxv2f32_nxv2f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, a0, e32,m1,tu,mu
+; CHECK-NEXT:    vfrec7.v v8, v9, v0.t
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 2 x float> @llvm.riscv.vfrec7.mask.nxv2f32(
+    <vscale x 2 x float> %1,
+    <vscale x 2 x float> %2,
+    <vscale x 2 x i1> %0,
+    i32 %3)
+
+  ret <vscale x 2 x float> %a
+}
+
+declare <vscale x 4 x float> @llvm.riscv.vfrec7.nxv4f32(
+  <vscale x 4 x float>,
+  i32);
+
+define <vscale x 4 x float> @intrinsic_vfrec7_v_nxv4f32_nxv4f32(<vscale x 4 x float> %0, i32 %1) nounwind {
+; CHECK-LABEL: intrinsic_vfrec7_v_nxv4f32_nxv4f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, a0, e32,m2,ta,mu
+; CHECK-NEXT:    vfrec7.v v8, v8
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 4 x float> @llvm.riscv.vfrec7.nxv4f32(
+    <vscale x 4 x float> %0,
+    i32 %1)
+
+  ret <vscale x 4 x float> %a
+}
+
+declare <vscale x 4 x float> @llvm.riscv.vfrec7.mask.nxv4f32(
+  <vscale x 4 x float>,
+  <vscale x 4 x float>,
+  <vscale x 4 x i1>,
+  i32);
+
+define <vscale x 4 x float> @intrinsic_vfrec7_mask_v_nxv4f32_nxv4f32(<vscale x 4 x i1> %0, <vscale x 4 x float> %1, <vscale x 4 x float> %2, i32 %3) nounwind {
+; CHECK-LABEL: intrinsic_vfrec7_mask_v_nxv4f32_nxv4f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, a0, e32,m2,tu,mu
+; CHECK-NEXT:    vfrec7.v v8, v10, v0.t
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 4 x float> @llvm.riscv.vfrec7.mask.nxv4f32(
+    <vscale x 4 x float> %1,
+    <vscale x 4 x float> %2,
+    <vscale x 4 x i1> %0,
+    i32 %3)
+
+  ret <vscale x 4 x float> %a
+}
+
+declare <vscale x 8 x float> @llvm.riscv.vfrec7.nxv8f32(
+  <vscale x 8 x float>,
+  i32);
+
+define <vscale x 8 x float> @intrinsic_vfrec7_v_nxv8f32_nxv8f32(<vscale x 8 x float> %0, i32 %1) nounwind {
+; CHECK-LABEL: intrinsic_vfrec7_v_nxv8f32_nxv8f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, a0, e32,m4,ta,mu
+; CHECK-NEXT:    vfrec7.v v8, v8
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 8 x float> @llvm.riscv.vfrec7.nxv8f32(
+    <vscale x 8 x float> %0,
+    i32 %1)
+
+  ret <vscale x 8 x float> %a
+}
+
+declare <vscale x 8 x float> @llvm.riscv.vfrec7.mask.nxv8f32(
+  <vscale x 8 x float>,
+  <vscale x 8 x float>,
+  <vscale x 8 x i1>,
+  i32);
+
+define <vscale x 8 x float> @intrinsic_vfrec7_mask_v_nxv8f32_nxv8f32(<vscale x 8 x i1> %0, <vscale x 8 x float> %1, <vscale x 8 x float> %2, i32 %3) nounwind {
+; CHECK-LABEL: intrinsic_vfrec7_mask_v_nxv8f32_nxv8f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, a0, e32,m4,tu,mu
+; CHECK-NEXT:    vfrec7.v v8, v12, v0.t
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 8 x float> @llvm.riscv.vfrec7.mask.nxv8f32(
+    <vscale x 8 x float> %1,
+    <vscale x 8 x float> %2,
+    <vscale x 8 x i1> %0,
+    i32 %3)
+
+  ret <vscale x 8 x float> %a
+}
+
+declare <vscale x 16 x float> @llvm.riscv.vfrec7.nxv16f32(
+  <vscale x 16 x float>,
+  i32);
+
+define <vscale x 16 x float> @intrinsic_vfrec7_v_nxv16f32_nxv16f32(<vscale x 16 x float> %0, i32 %1) nounwind {
+; CHECK-LABEL: intrinsic_vfrec7_v_nxv16f32_nxv16f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, a0, e32,m8,ta,mu
+; CHECK-NEXT:    vfrec7.v v8, v8
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 16 x float> @llvm.riscv.vfrec7.nxv16f32(
+    <vscale x 16 x float> %0,
+    i32 %1)
+
+  ret <vscale x 16 x float> %a
+}
+
+declare <vscale x 16 x float> @llvm.riscv.vfrec7.mask.nxv16f32(
+  <vscale x 16 x float>,
+  <vscale x 16 x float>,
+  <vscale x 16 x i1>,
+  i32);
+
+define <vscale x 16 x float> @intrinsic_vfrec7_mask_v_nxv16f32_nxv16f32(<vscale x 16 x i1> %0, <vscale x 16 x float> %1, <vscale x 16 x float> %2, i32 %3) nounwind {
+; CHECK-LABEL: intrinsic_vfrec7_mask_v_nxv16f32_nxv16f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, a0, e32,m8,tu,mu
+; CHECK-NEXT:    vfrec7.v v8, v16, v0.t
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 16 x float> @llvm.riscv.vfrec7.mask.nxv16f32(
+    <vscale x 16 x float> %1,
+    <vscale x 16 x float> %2,
+    <vscale x 16 x i1> %0,
+    i32 %3)
+
+  ret <vscale x 16 x float> %a
+}
+
+declare <vscale x 1 x double> @llvm.riscv.vfrec7.nxv1f64(
+  <vscale x 1 x double>,
+  i32);
+
+define <vscale x 1 x double> @intrinsic_vfrec7_v_nxv1f64_nxv1f64(<vscale x 1 x double> %0, i32 %1) nounwind {
+; CHECK-LABEL: intrinsic_vfrec7_v_nxv1f64_nxv1f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, a0, e64,m1,ta,mu
+; CHECK-NEXT:    vfrec7.v v8, v8
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 1 x double> @llvm.riscv.vfrec7.nxv1f64(
+    <vscale x 1 x double> %0,
+    i32 %1)
+
+  ret <vscale x 1 x double> %a
+}
+
+declare <vscale x 1 x double> @llvm.riscv.vfrec7.mask.nxv1f64(
+  <vscale x 1 x double>,
+  <vscale x 1 x double>,
+  <vscale x 1 x i1>,
+  i32);
+
+define <vscale x 1 x double> @intrinsic_vfrec7_mask_v_nxv1f64_nxv1f64(<vscale x 1 x i1> %0, <vscale x 1 x double> %1, <vscale x 1 x double> %2, i32 %3) nounwind {
+; CHECK-LABEL: intrinsic_vfrec7_mask_v_nxv1f64_nxv1f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, a0, e64,m1,tu,mu
+; CHECK-NEXT:    vfrec7.v v8, v9, v0.t
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 1 x double> @llvm.riscv.vfrec7.mask.nxv1f64(
+    <vscale x 1 x double> %1,
+    <vscale x 1 x double> %2,
+    <vscale x 1 x i1> %0,
+    i32 %3)
+
+  ret <vscale x 1 x double> %a
+}
+
+declare <vscale x 2 x double> @llvm.riscv.vfrec7.nxv2f64(
+  <vscale x 2 x double>,
+  i32);
+
+define <vscale x 2 x double> @intrinsic_vfrec7_v_nxv2f64_nxv2f64(<vscale x 2 x double> %0, i32 %1) nounwind {
+; CHECK-LABEL: intrinsic_vfrec7_v_nxv2f64_nxv2f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, a0, e64,m2,ta,mu
+; CHECK-NEXT:    vfrec7.v v8, v8
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 2 x double> @llvm.riscv.vfrec7.nxv2f64(
+    <vscale x 2 x double> %0,
+    i32 %1)
+
+  ret <vscale x 2 x double> %a
+}
+
+declare <vscale x 2 x double> @llvm.riscv.vfrec7.mask.nxv2f64(
+  <vscale x 2 x double>,
+  <vscale x 2 x double>,
+  <vscale x 2 x i1>,
+  i32);
+
+define <vscale x 2 x double> @intrinsic_vfrec7_mask_v_nxv2f64_nxv2f64(<vscale x 2 x i1> %0, <vscale x 2 x double> %1, <vscale x 2 x double> %2, i32 %3) nounwind {
+; CHECK-LABEL: intrinsic_vfrec7_mask_v_nxv2f64_nxv2f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, a0, e64,m2,tu,mu
+; CHECK-NEXT:    vfrec7.v v8, v10, v0.t
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 2 x double> @llvm.riscv.vfrec7.mask.nxv2f64(
+    <vscale x 2 x double> %1,
+    <vscale x 2 x double> %2,
+    <vscale x 2 x i1> %0,
+    i32 %3)
+
+  ret <vscale x 2 x double> %a
+}
+
+declare <vscale x 4 x double> @llvm.riscv.vfrec7.nxv4f64(
+  <vscale x 4 x double>,
+  i32);
+
+define <vscale x 4 x double> @intrinsic_vfrec7_v_nxv4f64_nxv4f64(<vscale x 4 x double> %0, i32 %1) nounwind {
+; CHECK-LABEL: intrinsic_vfrec7_v_nxv4f64_nxv4f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, a0, e64,m4,ta,mu
+; CHECK-NEXT:    vfrec7.v v8, v8
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 4 x double> @llvm.riscv.vfrec7.nxv4f64(
+    <vscale x 4 x double> %0,
+    i32 %1)
+
+  ret <vscale x 4 x double> %a
+}
+
+declare <vscale x 4 x double> @llvm.riscv.vfrec7.mask.nxv4f64(
+  <vscale x 4 x double>,
+  <vscale x 4 x double>,
+  <vscale x 4 x i1>,
+  i32);
+
+define <vscale x 4 x double> @intrinsic_vfrec7_mask_v_nxv4f64_nxv4f64(<vscale x 4 x i1> %0, <vscale x 4 x double> %1, <vscale x 4 x double> %2, i32 %3) nounwind {
+; CHECK-LABEL: intrinsic_vfrec7_mask_v_nxv4f64_nxv4f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, a0, e64,m4,tu,mu
+; CHECK-NEXT:    vfrec7.v v8, v12, v0.t
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 4 x double> @llvm.riscv.vfrec7.mask.nxv4f64(
+    <vscale x 4 x double> %1,
+    <vscale x 4 x double> %2,
+    <vscale x 4 x i1> %0,
+    i32 %3)
+
+  ret <vscale x 4 x double> %a
+}
+
+declare <vscale x 8 x double> @llvm.riscv.vfrec7.nxv8f64(
+  <vscale x 8 x double>,
+  i32);
+
+define <vscale x 8 x double> @intrinsic_vfrec7_v_nxv8f64_nxv8f64(<vscale x 8 x double> %0, i32 %1) nounwind {
+; CHECK-LABEL: intrinsic_vfrec7_v_nxv8f64_nxv8f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, a0, e64,m8,ta,mu
+; CHECK-NEXT:    vfrec7.v v8, v8
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 8 x double> @llvm.riscv.vfrec7.nxv8f64(
+    <vscale x 8 x double> %0,
+    i32 %1)
+
+  ret <vscale x 8 x double> %a
+}
+
+declare <vscale x 8 x double> @llvm.riscv.vfrec7.mask.nxv8f64(
+  <vscale x 8 x double>,
+  <vscale x 8 x double>,
+  <vscale x 8 x i1>,
+  i32);
+
+define <vscale x 8 x double> @intrinsic_vfrec7_mask_v_nxv8f64_nxv8f64(<vscale x 8 x i1> %0, <vscale x 8 x double> %1, <vscale x 8 x double> %2, i32 %3) nounwind {
+; CHECK-LABEL: intrinsic_vfrec7_mask_v_nxv8f64_nxv8f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, a0, e64,m8,tu,mu
+; CHECK-NEXT:    vfrec7.v v8, v16, v0.t
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 8 x double> @llvm.riscv.vfrec7.mask.nxv8f64(
+    <vscale x 8 x double> %1,
+    <vscale x 8 x double> %2,
+    <vscale x 8 x i1> %0,
+    i32 %3)
+
+  ret <vscale x 8 x double> %a
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfrec7-rv64.ll b/llvm/test/CodeGen/RISCV/rvv/vfrec7-rv64.ll
new file mode 100644
index 000000000000..7b24fb9b0238
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/vfrec7-rv64.ll
@@ -0,0 +1,602 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv64 -mattr=+experimental-v,+d,+experimental-zfh -verify-machineinstrs \
+; RUN:   --riscv-no-aliases < %s | FileCheck %s
+declare <vscale x 1 x half> @llvm.riscv.vfrec7.nxv1f16(
+  <vscale x 1 x half>,
+  i64);
+
+define <vscale x 1 x half> @intrinsic_vfrec7_v_nxv1f16_nxv1f16(<vscale x 1 x half> %0, i64 %1) nounwind {
+; CHECK-LABEL: intrinsic_vfrec7_v_nxv1f16_nxv1f16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, a0, e16,mf4,ta,mu
+; CHECK-NEXT:    vfrec7.v v8, v8
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 1 x half> @llvm.riscv.vfrec7.nxv1f16(
+    <vscale x 1 x half> %0,
+    i64 %1)
+
+  ret <vscale x 1 x half> %a
+}
+
+declare <vscale x 1 x half> @llvm.riscv.vfrec7.mask.nxv1f16(
+  <vscale x 1 x half>,
+  <vscale x 1 x half>,
+  <vscale x 1 x i1>,
+  i64);
+
+define <vscale x 1 x half> @intrinsic_vfrec7_mask_v_nxv1f16_nxv1f16(<vscale x 1 x i1> %0, <vscale x 1 x half> %1, <vscale x 1 x half> %2, i64 %3) nounwind {
+; CHECK-LABEL: intrinsic_vfrec7_mask_v_nxv1f16_nxv1f16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, a0, e16,mf4,tu,mu
+; CHECK-NEXT:    vfrec7.v v8, v9, v0.t
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 1 x half> @llvm.riscv.vfrec7.mask.nxv1f16(
+    <vscale x 1 x half> %1,
+    <vscale x 1 x half> %2,
+    <vscale x 1 x i1> %0,
+    i64 %3)
+
+  ret <vscale x 1 x half> %a
+}
+
+declare <vscale x 2 x half> @llvm.riscv.vfrec7.nxv2f16(
+  <vscale x 2 x half>,
+  i64);
+
+define <vscale x 2 x half> @intrinsic_vfrec7_v_nxv2f16_nxv2f16(<vscale x 2 x half> %0, i64 %1) nounwind {
+; CHECK-LABEL: intrinsic_vfrec7_v_nxv2f16_nxv2f16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, a0, e16,mf2,ta,mu
+; CHECK-NEXT:    vfrec7.v v8, v8
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 2 x half> @llvm.riscv.vfrec7.nxv2f16(
+    <vscale x 2 x half> %0,
+    i64 %1)
+
+  ret <vscale x 2 x half> %a
+}
+
+declare <vscale x 2 x half> @llvm.riscv.vfrec7.mask.nxv2f16(
+  <vscale x 2 x half>,
+  <vscale x 2 x half>,
+  <vscale x 2 x i1>,
+  i64);
+
+define <vscale x 2 x half> @intrinsic_vfrec7_mask_v_nxv2f16_nxv2f16(<vscale x 2 x i1> %0, <vscale x 2 x half> %1, <vscale x 2 x half> %2, i64 %3) nounwind {
+; CHECK-LABEL: intrinsic_vfrec7_mask_v_nxv2f16_nxv2f16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, a0, e16,mf2,tu,mu
+; CHECK-NEXT:    vfrec7.v v8, v9, v0.t
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 2 x half> @llvm.riscv.vfrec7.mask.nxv2f16(
+    <vscale x 2 x half> %1,
+    <vscale x 2 x half> %2,
+    <vscale x 2 x i1> %0,
+    i64 %3)
+
+  ret <vscale x 2 x half> %a
+}
+
+declare <vscale x 4 x half> @llvm.riscv.vfrec7.nxv4f16(
+  <vscale x 4 x half>,
+  i64);
+
+define <vscale x 4 x half> @intrinsic_vfrec7_v_nxv4f16_nxv4f16(<vscale x 4 x half> %0, i64 %1) nounwind {
+; CHECK-LABEL: intrinsic_vfrec7_v_nxv4f16_nxv4f16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, a0, e16,m1,ta,mu
+; CHECK-NEXT:    vfrec7.v v8, v8
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 4 x half> @llvm.riscv.vfrec7.nxv4f16(
+    <vscale x 4 x half> %0,
+    i64 %1)
+
+  ret <vscale x 4 x half> %a
+}
+
+declare <vscale x 4 x half> @llvm.riscv.vfrec7.mask.nxv4f16(
+  <vscale x 4 x half>,
+  <vscale x 4 x half>,
+  <vscale x 4 x i1>,
+  i64);
+
+define <vscale x 4 x half> @intrinsic_vfrec7_mask_v_nxv4f16_nxv4f16(<vscale x 4 x i1> %0, <vscale x 4 x half> %1, <vscale x 4 x half> %2, i64 %3) nounwind {
+; CHECK-LABEL: intrinsic_vfrec7_mask_v_nxv4f16_nxv4f16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, a0, e16,m1,tu,mu
+; CHECK-NEXT:    vfrec7.v v8, v9, v0.t
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 4 x half> @llvm.riscv.vfrec7.mask.nxv4f16(
+    <vscale x 4 x half> %1,
+    <vscale x 4 x half> %2,
+    <vscale x 4 x i1> %0,
+    i64 %3)
+
+  ret <vscale x 4 x half> %a
+}
+
+declare <vscale x 8 x half> @llvm.riscv.vfrec7.nxv8f16(
+  <vscale x 8 x half>,
+  i64);
+
+define <vscale x 8 x half> @intrinsic_vfrec7_v_nxv8f16_nxv8f16(<vscale x 8 x half> %0, i64 %1) nounwind {
+; CHECK-LABEL: intrinsic_vfrec7_v_nxv8f16_nxv8f16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, a0, e16,m2,ta,mu
+; CHECK-NEXT:    vfrec7.v v8, v8
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 8 x half> @llvm.riscv.vfrec7.nxv8f16(
+    <vscale x 8 x half> %0,
+    i64 %1)
+
+  ret <vscale x 8 x half> %a
+}
+
+declare <vscale x 8 x half> @llvm.riscv.vfrec7.mask.nxv8f16(
+  <vscale x 8 x half>,
+  <vscale x 8 x half>,
+  <vscale x 8 x i1>,
+  i64);
+
+define <vscale x 8 x half> @intrinsic_vfrec7_mask_v_nxv8f16_nxv8f16(<vscale x 8 x i1> %0, <vscale x 8 x half> %1, <vscale x 8 x half> %2, i64 %3) nounwind {
+; CHECK-LABEL: intrinsic_vfrec7_mask_v_nxv8f16_nxv8f16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, a0, e16,m2,tu,mu
+; CHECK-NEXT:    vfrec7.v v8, v10, v0.t
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 8 x half> @llvm.riscv.vfrec7.mask.nxv8f16(
+    <vscale x 8 x half> %1,
+    <vscale x 8 x half> %2,
+    <vscale x 8 x i1> %0,
+    i64 %3)
+
+  ret <vscale x 8 x half> %a
+}
+
+declare <vscale x 16 x half> @llvm.riscv.vfrec7.nxv16f16(
+  <vscale x 16 x half>,
+  i64);
+
+define <vscale x 16 x half> @intrinsic_vfrec7_v_nxv16f16_nxv16f16(<vscale x 16 x half> %0, i64 %1) nounwind {
+; CHECK-LABEL: intrinsic_vfrec7_v_nxv16f16_nxv16f16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, a0, e16,m4,ta,mu
+; CHECK-NEXT:    vfrec7.v v8, v8
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 16 x half> @llvm.riscv.vfrec7.nxv16f16(
+    <vscale x 16 x half> %0,
+    i64 %1)
+
+  ret <vscale x 16 x half> %a
+}
+
+declare <vscale x 16 x half> @llvm.riscv.vfrec7.mask.nxv16f16(
+  <vscale x 16 x half>,
+  <vscale x 16 x half>,
+  <vscale x 16 x i1>,
+  i64);
+
+define <vscale x 16 x half> @intrinsic_vfrec7_mask_v_nxv16f16_nxv16f16(<vscale x 16 x i1> %0, <vscale x 16 x half> %1, <vscale x 16 x half> %2, i64 %3) nounwind {
+; CHECK-LABEL: intrinsic_vfrec7_mask_v_nxv16f16_nxv16f16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, a0, e16,m4,tu,mu
+; CHECK-NEXT:    vfrec7.v v8, v12, v0.t
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 16 x half> @llvm.riscv.vfrec7.mask.nxv16f16(
+    <vscale x 16 x half> %1,
+    <vscale x 16 x half> %2,
+    <vscale x 16 x i1> %0,
+    i64 %3)
+
+  ret <vscale x 16 x half> %a
+}
+
+declare <vscale x 32 x half> @llvm.riscv.vfrec7.nxv32f16(
+  <vscale x 32 x half>,
+  i64);
+
+define <vscale x 32 x half> @intrinsic_vfrec7_v_nxv32f16_nxv32f16(<vscale x 32 x half> %0, i64 %1) nounwind {
+; CHECK-LABEL: intrinsic_vfrec7_v_nxv32f16_nxv32f16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, a0, e16,m8,ta,mu
+; CHECK-NEXT:    vfrec7.v v8, v8
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 32 x half> @llvm.riscv.vfrec7.nxv32f16(
+    <vscale x 32 x half> %0,
+    i64 %1)
+
+  ret <vscale x 32 x half> %a
+}
+
+declare <vscale x 32 x half> @llvm.riscv.vfrec7.mask.nxv32f16(
+  <vscale x 32 x half>,
+  <vscale x 32 x half>,
+  <vscale x 32 x i1>,
+  i64);
+
+define <vscale x 32 x half> @intrinsic_vfrec7_mask_v_nxv32f16_nxv32f16(<vscale x 32 x i1> %0, <vscale x 32 x half> %1, <vscale x 32 x half> %2, i64 %3) nounwind {
+; CHECK-LABEL: intrinsic_vfrec7_mask_v_nxv32f16_nxv32f16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, a0, e16,m8,tu,mu
+; CHECK-NEXT:    vfrec7.v v8, v16, v0.t
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 32 x half> @llvm.riscv.vfrec7.mask.nxv32f16(
+    <vscale x 32 x half> %1,
+    <vscale x 32 x half> %2,
+    <vscale x 32 x i1> %0,
+    i64 %3)
+
+  ret <vscale x 32 x half> %a
+}
+
+declare <vscale x 1 x float> @llvm.riscv.vfrec7.nxv1f32(
+  <vscale x 1 x float>,
+  i64);
+
+define <vscale x 1 x float> @intrinsic_vfrec7_v_nxv1f32_nxv1f32(<vscale x 1 x float> %0, i64 %1) nounwind {
+; CHECK-LABEL: intrinsic_vfrec7_v_nxv1f32_nxv1f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, a0, e32,mf2,ta,mu
+; CHECK-NEXT:    vfrec7.v v8, v8
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 1 x float> @llvm.riscv.vfrec7.nxv1f32(
+    <vscale x 1 x float> %0,
+    i64 %1)
+
+  ret <vscale x 1 x float> %a
+}
+
+declare <vscale x 1 x float> @llvm.riscv.vfrec7.mask.nxv1f32(
+  <vscale x 1 x float>,
+  <vscale x 1 x float>,
+  <vscale x 1 x i1>,
+  i64);
+
+define <vscale x 1 x float> @intrinsic_vfrec7_mask_v_nxv1f32_nxv1f32(<vscale x 1 x i1> %0, <vscale x 1 x float> %1, <vscale x 1 x float> %2, i64 %3) nounwind {
+; CHECK-LABEL: intrinsic_vfrec7_mask_v_nxv1f32_nxv1f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, a0, e32,mf2,tu,mu
+; CHECK-NEXT:    vfrec7.v v8, v9, v0.t
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 1 x float> @llvm.riscv.vfrec7.mask.nxv1f32(
+    <vscale x 1 x float> %1,
+    <vscale x 1 x float> %2,
+    <vscale x 1 x i1> %0,
+    i64 %3)
+
+  ret <vscale x 1 x float> %a
+}
+
+declare <vscale x 2 x float> @llvm.riscv.vfrec7.nxv2f32(
+  <vscale x 2 x float>,
+  i64);
+
+define <vscale x 2 x float> @intrinsic_vfrec7_v_nxv2f32_nxv2f32(<vscale x 2 x float> %0, i64 %1) nounwind {
+; CHECK-LABEL: intrinsic_vfrec7_v_nxv2f32_nxv2f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, a0, e32,m1,ta,mu
+; CHECK-NEXT:    vfrec7.v v8, v8
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 2 x float> @llvm.riscv.vfrec7.nxv2f32(
+    <vscale x 2 x float> %0,
+    i64 %1)
+
+  ret <vscale x 2 x float> %a
+}
+
+declare <vscale x 2 x float> @llvm.riscv.vfrec7.mask.nxv2f32(
+  <vscale x 2 x float>,
+  <vscale x 2 x float>,
+  <vscale x 2 x i1>,
+  i64);
+
+define <vscale x 2 x float> @intrinsic_vfrec7_mask_v_nxv2f32_nxv2f32(<vscale x 2 x i1> %0, <vscale x 2 x float> %1, <vscale x 2 x float> %2, i64 %3) nounwind {
+; CHECK-LABEL: intrinsic_vfrec7_mask_v_nxv2f32_nxv2f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, a0, e32,m1,tu,mu
+; CHECK-NEXT:    vfrec7.v v8, v9, v0.t
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 2 x float> @llvm.riscv.vfrec7.mask.nxv2f32(
+    <vscale x 2 x float> %1,
+    <vscale x 2 x float> %2,
+    <vscale x 2 x i1> %0,
+    i64 %3)
+
+  ret <vscale x 2 x float> %a
+}
+
+declare <vscale x 4 x float> @llvm.riscv.vfrec7.nxv4f32(
+  <vscale x 4 x float>,
+  i64);
+
+define <vscale x 4 x float> @intrinsic_vfrec7_v_nxv4f32_nxv4f32(<vscale x 4 x float> %0, i64 %1) nounwind {
+; CHECK-LABEL: intrinsic_vfrec7_v_nxv4f32_nxv4f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, a0, e32,m2,ta,mu
+; CHECK-NEXT:    vfrec7.v v8, v8
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 4 x float> @llvm.riscv.vfrec7.nxv4f32(
+    <vscale x 4 x float> %0,
+    i64 %1)
+
+  ret <vscale x 4 x float> %a
+}
+
+declare <vscale x 4 x float> @llvm.riscv.vfrec7.mask.nxv4f32(
+  <vscale x 4 x float>,
+  <vscale x 4 x float>,
+  <vscale x 4 x i1>,
+  i64);
+
+define <vscale x 4 x float> @intrinsic_vfrec7_mask_v_nxv4f32_nxv4f32(<vscale x 4 x i1> %0, <vscale x 4 x float> %1, <vscale x 4 x float> %2, i64 %3) nounwind {
+; CHECK-LABEL: intrinsic_vfrec7_mask_v_nxv4f32_nxv4f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, a0, e32,m2,tu,mu
+; CHECK-NEXT:    vfrec7.v v8, v10, v0.t
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 4 x float> @llvm.riscv.vfrec7.mask.nxv4f32(
+    <vscale x 4 x float> %1,
+    <vscale x 4 x float> %2,
+    <vscale x 4 x i1> %0,
+    i64 %3)
+
+  ret <vscale x 4 x float> %a
+}
+
+declare <vscale x 8 x float> @llvm.riscv.vfrec7.nxv8f32(
+  <vscale x 8 x float>,
+  i64);
+
+define <vscale x 8 x float> @intrinsic_vfrec7_v_nxv8f32_nxv8f32(<vscale x 8 x float> %0, i64 %1) nounwind {
+; CHECK-LABEL: intrinsic_vfrec7_v_nxv8f32_nxv8f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, a0, e32,m4,ta,mu
+; CHECK-NEXT:    vfrec7.v v8, v8
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 8 x float> @llvm.riscv.vfrec7.nxv8f32(
+    <vscale x 8 x float> %0,
+    i64 %1)
+
+  ret <vscale x 8 x float> %a
+}
+
+declare <vscale x 8 x float> @llvm.riscv.vfrec7.mask.nxv8f32(
+  <vscale x 8 x float>,
+  <vscale x 8 x float>,
+  <vscale x 8 x i1>,
+  i64);
+
+define <vscale x 8 x float> @intrinsic_vfrec7_mask_v_nxv8f32_nxv8f32(<vscale x 8 x i1> %0, <vscale x 8 x float> %1, <vscale x 8 x float> %2, i64 %3) nounwind {
+; CHECK-LABEL: intrinsic_vfrec7_mask_v_nxv8f32_nxv8f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, a0, e32,m4,tu,mu
+; CHECK-NEXT:    vfrec7.v v8, v12, v0.t
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 8 x float> @llvm.riscv.vfrec7.mask.nxv8f32(
+    <vscale x 8 x float> %1,
+    <vscale x 8 x float> %2,
+    <vscale x 8 x i1> %0,
+    i64 %3)
+
+  ret <vscale x 8 x float> %a
+}
+
+declare <vscale x 16 x float> @llvm.riscv.vfrec7.nxv16f32(
+  <vscale x 16 x float>,
+  i64);
+
+define <vscale x 16 x float> @intrinsic_vfrec7_v_nxv16f32_nxv16f32(<vscale x 16 x float> %0, i64 %1) nounwind {
+; CHECK-LABEL: intrinsic_vfrec7_v_nxv16f32_nxv16f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, a0, e32,m8,ta,mu
+; CHECK-NEXT:    vfrec7.v v8, v8
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 16 x float> @llvm.riscv.vfrec7.nxv16f32(
+    <vscale x 16 x float> %0,
+    i64 %1)
+
+  ret <vscale x 16 x float> %a
+}
+
+declare <vscale x 16 x float> @llvm.riscv.vfrec7.mask.nxv16f32(
+  <vscale x 16 x float>,
+  <vscale x 16 x float>,
+  <vscale x 16 x i1>,
+  i64);
+
+define <vscale x 16 x float> @intrinsic_vfrec7_mask_v_nxv16f32_nxv16f32(<vscale x 16 x i1> %0, <vscale x 16 x float> %1, <vscale x 16 x float> %2, i64 %3) nounwind {
+; CHECK-LABEL: intrinsic_vfrec7_mask_v_nxv16f32_nxv16f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, a0, e32,m8,tu,mu
+; CHECK-NEXT:    vfrec7.v v8, v16, v0.t
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 16 x float> @llvm.riscv.vfrec7.mask.nxv16f32(
+    <vscale x 16 x float> %1,
+    <vscale x 16 x float> %2,
+    <vscale x 16 x i1> %0,
+    i64 %3)
+
+  ret <vscale x 16 x float> %a
+}
+
+declare <vscale x 1 x double> @llvm.riscv.vfrec7.nxv1f64(
+  <vscale x 1 x double>,
+  i64);
+
+define <vscale x 1 x double> @intrinsic_vfrec7_v_nxv1f64_nxv1f64(<vscale x 1 x double> %0, i64 %1) nounwind {
+; CHECK-LABEL: intrinsic_vfrec7_v_nxv1f64_nxv1f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, a0, e64,m1,ta,mu
+; CHECK-NEXT:    vfrec7.v v8, v8
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 1 x double> @llvm.riscv.vfrec7.nxv1f64(
+    <vscale x 1 x double> %0,
+    i64 %1)
+
+  ret <vscale x 1 x double> %a
+}
+
+declare <vscale x 1 x double> @llvm.riscv.vfrec7.mask.nxv1f64(
+  <vscale x 1 x double>,
+  <vscale x 1 x double>,
+  <vscale x 1 x i1>,
+  i64);
+
+define <vscale x 1 x double> @intrinsic_vfrec7_mask_v_nxv1f64_nxv1f64(<vscale x 1 x i1> %0, <vscale x 1 x double> %1, <vscale x 1 x double> %2, i64 %3) nounwind {
+; CHECK-LABEL: intrinsic_vfrec7_mask_v_nxv1f64_nxv1f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, a0, e64,m1,tu,mu
+; CHECK-NEXT:    vfrec7.v v8, v9, v0.t
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 1 x double> @llvm.riscv.vfrec7.mask.nxv1f64(
+    <vscale x 1 x double> %1,
+    <vscale x 1 x double> %2,
+    <vscale x 1 x i1> %0,
+    i64 %3)
+
+  ret <vscale x 1 x double> %a
+}
+
+declare <vscale x 2 x double> @llvm.riscv.vfrec7.nxv2f64(
+  <vscale x 2 x double>,
+  i64);
+
+define <vscale x 2 x double> @intrinsic_vfrec7_v_nxv2f64_nxv2f64(<vscale x 2 x double> %0, i64 %1) nounwind {
+; CHECK-LABEL: intrinsic_vfrec7_v_nxv2f64_nxv2f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, a0, e64,m2,ta,mu
+; CHECK-NEXT:    vfrec7.v v8, v8
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 2 x double> @llvm.riscv.vfrec7.nxv2f64(
+    <vscale x 2 x double> %0,
+    i64 %1)
+
+  ret <vscale x 2 x double> %a
+}
+
+declare <vscale x 2 x double> @llvm.riscv.vfrec7.mask.nxv2f64(
+  <vscale x 2 x double>,
+  <vscale x 2 x double>,
+  <vscale x 2 x i1>,
+  i64);
+
+define <vscale x 2 x double> @intrinsic_vfrec7_mask_v_nxv2f64_nxv2f64(<vscale x 2 x i1> %0, <vscale x 2 x double> %1, <vscale x 2 x double> %2, i64 %3) nounwind {
+; CHECK-LABEL: intrinsic_vfrec7_mask_v_nxv2f64_nxv2f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, a0, e64,m2,tu,mu
+; CHECK-NEXT:    vfrec7.v v8, v10, v0.t
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 2 x double> @llvm.riscv.vfrec7.mask.nxv2f64(
+    <vscale x 2 x double> %1,
+    <vscale x 2 x double> %2,
+    <vscale x 2 x i1> %0,
+    i64 %3)
+
+  ret <vscale x 2 x double> %a
+}
+
+declare <vscale x 4 x double> @llvm.riscv.vfrec7.nxv4f64(
+  <vscale x 4 x double>,
+  i64);
+
+define <vscale x 4 x double> @intrinsic_vfrec7_v_nxv4f64_nxv4f64(<vscale x 4 x double> %0, i64 %1) nounwind {
+; CHECK-LABEL: intrinsic_vfrec7_v_nxv4f64_nxv4f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, a0, e64,m4,ta,mu
+; CHECK-NEXT:    vfrec7.v v8, v8
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 4 x double> @llvm.riscv.vfrec7.nxv4f64(
+    <vscale x 4 x double> %0,
+    i64 %1)
+
+  ret <vscale x 4 x double> %a
+}
+
+declare <vscale x 4 x double> @llvm.riscv.vfrec7.mask.nxv4f64(
+  <vscale x 4 x double>,
+  <vscale x 4 x double>,
+  <vscale x 4 x i1>,
+  i64);
+
+define <vscale x 4 x double> @intrinsic_vfrec7_mask_v_nxv4f64_nxv4f64(<vscale x 4 x i1> %0, <vscale x 4 x double> %1, <vscale x 4 x double> %2, i64 %3) nounwind {
+; CHECK-LABEL: intrinsic_vfrec7_mask_v_nxv4f64_nxv4f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, a0, e64,m4,tu,mu
+; CHECK-NEXT:    vfrec7.v v8, v12, v0.t
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 4 x double> @llvm.riscv.vfrec7.mask.nxv4f64(
+    <vscale x 4 x double> %1,
+    <vscale x 4 x double> %2,
+    <vscale x 4 x i1> %0,
+    i64 %3)
+
+  ret <vscale x 4 x double> %a
+}
+
+declare <vscale x 8 x double> @llvm.riscv.vfrec7.nxv8f64(
+  <vscale x 8 x double>,
+  i64);
+
+define <vscale x 8 x double> @intrinsic_vfrec7_v_nxv8f64_nxv8f64(<vscale x 8 x double> %0, i64 %1) nounwind {
+; CHECK-LABEL: intrinsic_vfrec7_v_nxv8f64_nxv8f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, a0, e64,m8,ta,mu
+; CHECK-NEXT:    vfrec7.v v8, v8
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 8 x double> @llvm.riscv.vfrec7.nxv8f64(
+    <vscale x 8 x double> %0,
+    i64 %1)
+
+  ret <vscale x 8 x double> %a
+}
+
+declare <vscale x 8 x double> @llvm.riscv.vfrec7.mask.nxv8f64(
+  <vscale x 8 x double>,
+  <vscale x 8 x double>,
+  <vscale x 8 x i1>,
+  i64);
+
+define <vscale x 8 x double> @intrinsic_vfrec7_mask_v_nxv8f64_nxv8f64(<vscale x 8 x i1> %0, <vscale x 8 x double> %1, <vscale x 8 x double> %2, i64 %3) nounwind {
+; CHECK-LABEL: intrinsic_vfrec7_mask_v_nxv8f64_nxv8f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, a0, e64,m8,tu,mu
+; CHECK-NEXT:    vfrec7.v v8, v16, v0.t
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 8 x double> @llvm.riscv.vfrec7.mask.nxv8f64(
+    <vscale x 8 x double> %1,
+    <vscale x 8 x double> %2,
+    <vscale x 8 x i1> %0,
+    i64 %3)
+
+  ret <vscale x 8 x double> %a
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfrsqrt7-rv32.ll b/llvm/test/CodeGen/RISCV/rvv/vfrsqrt7-rv32.ll
new file mode 100644
index 000000000000..2740ecf3acff
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/vfrsqrt7-rv32.ll
@@ -0,0 +1,602 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv32 -mattr=+experimental-v,+d,+experimental-zfh -verify-machineinstrs \
+; RUN:   --riscv-no-aliases < %s | FileCheck %s
+declare <vscale x 1 x half> @llvm.riscv.vfrsqrt7.nxv1f16(
+  <vscale x 1 x half>,
+  i32);
+
+define <vscale x 1 x half> @intrinsic_vfrsqrt7_v_nxv1f16_nxv1f16(<vscale x 1 x half> %0, i32 %1) nounwind {
+; CHECK-LABEL: intrinsic_vfrsqrt7_v_nxv1f16_nxv1f16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, a0, e16,mf4,ta,mu
+; CHECK-NEXT:    vfrsqrt7.v v8, v8
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 1 x half> @llvm.riscv.vfrsqrt7.nxv1f16(
+    <vscale x 1 x half> %0,
+    i32 %1)
+
+  ret <vscale x 1 x half> %a
+}
+
+declare <vscale x 1 x half> @llvm.riscv.vfrsqrt7.mask.nxv1f16(
+  <vscale x 1 x half>,
+  <vscale x 1 x half>,
+  <vscale x 1 x i1>,
+  i32);
+
+define <vscale x 1 x half> @intrinsic_vfrsqrt7_mask_v_nxv1f16_nxv1f16(<vscale x 1 x i1> %0, <vscale x 1 x half> %1, <vscale x 1 x half> %2, i32 %3) nounwind {
+; CHECK-LABEL: intrinsic_vfrsqrt7_mask_v_nxv1f16_nxv1f16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, a0, e16,mf4,tu,mu
+; CHECK-NEXT:    vfrsqrt7.v v8, v9, v0.t
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 1 x half> @llvm.riscv.vfrsqrt7.mask.nxv1f16(
+    <vscale x 1 x half> %1,
+    <vscale x 1 x half> %2,
+    <vscale x 1 x i1> %0,
+    i32 %3)
+
+  ret <vscale x 1 x half> %a
+}
+
+declare <vscale x 2 x half> @llvm.riscv.vfrsqrt7.nxv2f16(
+  <vscale x 2 x half>,
+  i32);
+
+define <vscale x 2 x half> @intrinsic_vfrsqrt7_v_nxv2f16_nxv2f16(<vscale x 2 x half> %0, i32 %1) nounwind {
+; CHECK-LABEL: intrinsic_vfrsqrt7_v_nxv2f16_nxv2f16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, a0, e16,mf2,ta,mu
+; CHECK-NEXT:    vfrsqrt7.v v8, v8
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 2 x half> @llvm.riscv.vfrsqrt7.nxv2f16(
+    <vscale x 2 x half> %0,
+    i32 %1)
+
+  ret <vscale x 2 x half> %a
+}
+
+declare <vscale x 2 x half> @llvm.riscv.vfrsqrt7.mask.nxv2f16(
+  <vscale x 2 x half>,
+  <vscale x 2 x half>,
+  <vscale x 2 x i1>,
+  i32);
+
+define <vscale x 2 x half> @intrinsic_vfrsqrt7_mask_v_nxv2f16_nxv2f16(<vscale x 2 x i1> %0, <vscale x 2 x half> %1, <vscale x 2 x half> %2, i32 %3) nounwind {
+; CHECK-LABEL: intrinsic_vfrsqrt7_mask_v_nxv2f16_nxv2f16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, a0, e16,mf2,tu,mu
+; CHECK-NEXT:    vfrsqrt7.v v8, v9, v0.t
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 2 x half> @llvm.riscv.vfrsqrt7.mask.nxv2f16(
+    <vscale x 2 x half> %1,
+    <vscale x 2 x half> %2,
+    <vscale x 2 x i1> %0,
+    i32 %3)
+
+  ret <vscale x 2 x half> %a
+}
+
+declare <vscale x 4 x half> @llvm.riscv.vfrsqrt7.nxv4f16(
+  <vscale x 4 x half>,
+  i32);
+
+define <vscale x 4 x half> @intrinsic_vfrsqrt7_v_nxv4f16_nxv4f16(<vscale x 4 x half> %0, i32 %1) nounwind {
+; CHECK-LABEL: intrinsic_vfrsqrt7_v_nxv4f16_nxv4f16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, a0, e16,m1,ta,mu
+; CHECK-NEXT:    vfrsqrt7.v v8, v8
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 4 x half> @llvm.riscv.vfrsqrt7.nxv4f16(
+    <vscale x 4 x half> %0,
+    i32 %1)
+
+  ret <vscale x 4 x half> %a
+}
+
+declare <vscale x 4 x half> @llvm.riscv.vfrsqrt7.mask.nxv4f16(
+  <vscale x 4 x half>,
+  <vscale x 4 x half>,
+  <vscale x 4 x i1>,
+  i32);
+
+define <vscale x 4 x half> @intrinsic_vfrsqrt7_mask_v_nxv4f16_nxv4f16(<vscale x 4 x i1> %0, <vscale x 4 x half> %1, <vscale x 4 x half> %2, i32 %3) nounwind {
+; CHECK-LABEL: intrinsic_vfrsqrt7_mask_v_nxv4f16_nxv4f16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, a0, e16,m1,tu,mu
+; CHECK-NEXT:    vfrsqrt7.v v8, v9, v0.t
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 4 x half> @llvm.riscv.vfrsqrt7.mask.nxv4f16(
+    <vscale x 4 x half> %1,
+    <vscale x 4 x half> %2,
+    <vscale x 4 x i1> %0,
+    i32 %3)
+
+  ret <vscale x 4 x half> %a
+}
+
+declare <vscale x 8 x half> @llvm.riscv.vfrsqrt7.nxv8f16(
+  <vscale x 8 x half>,
+  i32);
+
+define <vscale x 8 x half> @intrinsic_vfrsqrt7_v_nxv8f16_nxv8f16(<vscale x 8 x half> %0, i32 %1) nounwind {
+; CHECK-LABEL: intrinsic_vfrsqrt7_v_nxv8f16_nxv8f16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, a0, e16,m2,ta,mu
+; CHECK-NEXT:    vfrsqrt7.v v8, v8
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 8 x half> @llvm.riscv.vfrsqrt7.nxv8f16(
+    <vscale x 8 x half> %0,
+    i32 %1)
+
+  ret <vscale x 8 x half> %a
+}
+
+declare <vscale x 8 x half> @llvm.riscv.vfrsqrt7.mask.nxv8f16(
+  <vscale x 8 x half>,
+  <vscale x 8 x half>,
+  <vscale x 8 x i1>,
+  i32);
+
+define <vscale x 8 x half> @intrinsic_vfrsqrt7_mask_v_nxv8f16_nxv8f16(<vscale x 8 x i1> %0, <vscale x 8 x half> %1, <vscale x 8 x half> %2, i32 %3) nounwind {
+; CHECK-LABEL: intrinsic_vfrsqrt7_mask_v_nxv8f16_nxv8f16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, a0, e16,m2,tu,mu
+; CHECK-NEXT:    vfrsqrt7.v v8, v10, v0.t
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 8 x half> @llvm.riscv.vfrsqrt7.mask.nxv8f16(
+    <vscale x 8 x half> %1,
+    <vscale x 8 x half> %2,
+    <vscale x 8 x i1> %0,
+    i32 %3)
+
+  ret <vscale x 8 x half> %a
+}
+
+declare <vscale x 16 x half> @llvm.riscv.vfrsqrt7.nxv16f16(
+  <vscale x 16 x half>,
+  i32);
+
+define <vscale x 16 x half> @intrinsic_vfrsqrt7_v_nxv16f16_nxv16f16(<vscale x 16 x half> %0, i32 %1) nounwind {
+; CHECK-LABEL: intrinsic_vfrsqrt7_v_nxv16f16_nxv16f16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, a0, e16,m4,ta,mu
+; CHECK-NEXT:    vfrsqrt7.v v8, v8
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 16 x half> @llvm.riscv.vfrsqrt7.nxv16f16(
+    <vscale x 16 x half> %0,
+    i32 %1)
+
+  ret <vscale x 16 x half> %a
+}
+
+declare <vscale x 16 x half> @llvm.riscv.vfrsqrt7.mask.nxv16f16(
+  <vscale x 16 x half>,
+  <vscale x 16 x half>,
+  <vscale x 16 x i1>,
+  i32);
+
+define <vscale x 16 x half> @intrinsic_vfrsqrt7_mask_v_nxv16f16_nxv16f16(<vscale x 16 x i1> %0, <vscale x 16 x half> %1, <vscale x 16 x half> %2, i32 %3) nounwind {
+; CHECK-LABEL: intrinsic_vfrsqrt7_mask_v_nxv16f16_nxv16f16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, a0, e16,m4,tu,mu
+; CHECK-NEXT:    vfrsqrt7.v v8, v12, v0.t
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 16 x half> @llvm.riscv.vfrsqrt7.mask.nxv16f16(
+    <vscale x 16 x half> %1,
+    <vscale x 16 x half> %2,
+    <vscale x 16 x i1> %0,
+    i32 %3)
+
+  ret <vscale x 16 x half> %a
+}
+
+declare <vscale x 32 x half> @llvm.riscv.vfrsqrt7.nxv32f16(
+  <vscale x 32 x half>,
+  i32);
+
+define <vscale x 32 x half> @intrinsic_vfrsqrt7_v_nxv32f16_nxv32f16(<vscale x 32 x half> %0, i32 %1) nounwind {
+; CHECK-LABEL: intrinsic_vfrsqrt7_v_nxv32f16_nxv32f16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, a0, e16,m8,ta,mu
+; CHECK-NEXT:    vfrsqrt7.v v8, v8
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 32 x half> @llvm.riscv.vfrsqrt7.nxv32f16(
+    <vscale x 32 x half> %0,
+    i32 %1)
+
+  ret <vscale x 32 x half> %a
+}
+
+declare <vscale x 32 x half> @llvm.riscv.vfrsqrt7.mask.nxv32f16(
+  <vscale x 32 x half>,
+  <vscale x 32 x half>,
+  <vscale x 32 x i1>,
+  i32);
+
+define <vscale x 32 x half> @intrinsic_vfrsqrt7_mask_v_nxv32f16_nxv32f16(<vscale x 32 x i1> %0, <vscale x 32 x half> %1, <vscale x 32 x half> %2, i32 %3) nounwind {
+; CHECK-LABEL: intrinsic_vfrsqrt7_mask_v_nxv32f16_nxv32f16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, a0, e16,m8,tu,mu
+; CHECK-NEXT:    vfrsqrt7.v v8, v16, v0.t
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 32 x half> @llvm.riscv.vfrsqrt7.mask.nxv32f16(
+    <vscale x 32 x half> %1,
+    <vscale x 32 x half> %2,
+    <vscale x 32 x i1> %0,
+    i32 %3)
+
+  ret <vscale x 32 x half> %a
+}
+
+declare <vscale x 1 x float> @llvm.riscv.vfrsqrt7.nxv1f32(
+  <vscale x 1 x float>,
+  i32);
+
+define <vscale x 1 x float> @intrinsic_vfrsqrt7_v_nxv1f32_nxv1f32(<vscale x 1 x float> %0, i32 %1) nounwind {
+; CHECK-LABEL: intrinsic_vfrsqrt7_v_nxv1f32_nxv1f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, a0, e32,mf2,ta,mu
+; CHECK-NEXT:    vfrsqrt7.v v8, v8
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 1 x float> @llvm.riscv.vfrsqrt7.nxv1f32(
+    <vscale x 1 x float> %0,
+    i32 %1)
+
+  ret <vscale x 1 x float> %a
+}
+
+declare <vscale x 1 x float> @llvm.riscv.vfrsqrt7.mask.nxv1f32(
+  <vscale x 1 x float>,
+  <vscale x 1 x float>,
+  <vscale x 1 x i1>,
+  i32);
+
+define <vscale x 1 x float> @intrinsic_vfrsqrt7_mask_v_nxv1f32_nxv1f32(<vscale x 1 x i1> %0, <vscale x 1 x float> %1, <vscale x 1 x float> %2, i32 %3) nounwind {
+; CHECK-LABEL: intrinsic_vfrsqrt7_mask_v_nxv1f32_nxv1f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, a0, e32,mf2,tu,mu
+; CHECK-NEXT:    vfrsqrt7.v v8, v9, v0.t
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 1 x float> @llvm.riscv.vfrsqrt7.mask.nxv1f32(
+    <vscale x 1 x float> %1,
+    <vscale x 1 x float> %2,
+    <vscale x 1 x i1> %0,
+    i32 %3)
+
+  ret <vscale x 1 x float> %a
+}
+
+declare <vscale x 2 x float> @llvm.riscv.vfrsqrt7.nxv2f32(
+  <vscale x 2 x float>,
+  i32);
+
+define <vscale x 2 x float> @intrinsic_vfrsqrt7_v_nxv2f32_nxv2f32(<vscale x 2 x float> %0, i32 %1) nounwind {
+; CHECK-LABEL: intrinsic_vfrsqrt7_v_nxv2f32_nxv2f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, a0, e32,m1,ta,mu
+; CHECK-NEXT:    vfrsqrt7.v v8, v8
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 2 x float> @llvm.riscv.vfrsqrt7.nxv2f32(
+    <vscale x 2 x float> %0,
+    i32 %1)
+
+  ret <vscale x 2 x float> %a
+}
+
+declare <vscale x 2 x float> @llvm.riscv.vfrsqrt7.mask.nxv2f32(
+  <vscale x 2 x float>,
+  <vscale x 2 x float>,
+  <vscale x 2 x i1>,
+  i32);
+
+define <vscale x 2 x float> @intrinsic_vfrsqrt7_mask_v_nxv2f32_nxv2f32(<vscale x 2 x i1> %0, <vscale x 2 x float> %1, <vscale x 2 x float> %2, i32 %3) nounwind {
+; CHECK-LABEL: intrinsic_vfrsqrt7_mask_v_nxv2f32_nxv2f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, a0, e32,m1,tu,mu
+; CHECK-NEXT:    vfrsqrt7.v v8, v9, v0.t
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 2 x float> @llvm.riscv.vfrsqrt7.mask.nxv2f32(
+    <vscale x 2 x float> %1,
+    <vscale x 2 x float> %2,
+    <vscale x 2 x i1> %0,
+    i32 %3)
+
+  ret <vscale x 2 x float> %a
+}
+
+declare <vscale x 4 x float> @llvm.riscv.vfrsqrt7.nxv4f32(
+  <vscale x 4 x float>,
+  i32);
+
+define <vscale x 4 x float> @intrinsic_vfrsqrt7_v_nxv4f32_nxv4f32(<vscale x 4 x float> %0, i32 %1) nounwind {
+; CHECK-LABEL: intrinsic_vfrsqrt7_v_nxv4f32_nxv4f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, a0, e32,m2,ta,mu
+; CHECK-NEXT:    vfrsqrt7.v v8, v8
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 4 x float> @llvm.riscv.vfrsqrt7.nxv4f32(
+    <vscale x 4 x float> %0,
+    i32 %1)
+
+  ret <vscale x 4 x float> %a
+}
+
+declare <vscale x 4 x float> @llvm.riscv.vfrsqrt7.mask.nxv4f32(
+  <vscale x 4 x float>,
+  <vscale x 4 x float>,
+  <vscale x 4 x i1>,
+  i32);
+
+define <vscale x 4 x float> @intrinsic_vfrsqrt7_mask_v_nxv4f32_nxv4f32(<vscale x 4 x i1> %0, <vscale x 4 x float> %1, <vscale x 4 x float> %2, i32 %3) nounwind {
+; CHECK-LABEL: intrinsic_vfrsqrt7_mask_v_nxv4f32_nxv4f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, a0, e32,m2,tu,mu
+; CHECK-NEXT:    vfrsqrt7.v v8, v10, v0.t
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 4 x float> @llvm.riscv.vfrsqrt7.mask.nxv4f32(
+    <vscale x 4 x float> %1,
+    <vscale x 4 x float> %2,
+    <vscale x 4 x i1> %0,
+    i32 %3)
+
+  ret <vscale x 4 x float> %a
+}
+
+declare <vscale x 8 x float> @llvm.riscv.vfrsqrt7.nxv8f32(
+  <vscale x 8 x float>,
+  i32);
+
+define <vscale x 8 x float> @intrinsic_vfrsqrt7_v_nxv8f32_nxv8f32(<vscale x 8 x float> %0, i32 %1) nounwind {
+; CHECK-LABEL: intrinsic_vfrsqrt7_v_nxv8f32_nxv8f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, a0, e32,m4,ta,mu
+; CHECK-NEXT:    vfrsqrt7.v v8, v8
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 8 x float> @llvm.riscv.vfrsqrt7.nxv8f32(
+    <vscale x 8 x float> %0,
+    i32 %1)
+
+  ret <vscale x 8 x float> %a
+}
+
+declare <vscale x 8 x float> @llvm.riscv.vfrsqrt7.mask.nxv8f32(
+  <vscale x 8 x float>,
+  <vscale x 8 x float>,
+  <vscale x 8 x i1>,
+  i32);
+
+define <vscale x 8 x float> @intrinsic_vfrsqrt7_mask_v_nxv8f32_nxv8f32(<vscale x 8 x i1> %0, <vscale x 8 x float> %1, <vscale x 8 x float> %2, i32 %3) nounwind {
+; CHECK-LABEL: intrinsic_vfrsqrt7_mask_v_nxv8f32_nxv8f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, a0, e32,m4,tu,mu
+; CHECK-NEXT:    vfrsqrt7.v v8, v12, v0.t
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 8 x float> @llvm.riscv.vfrsqrt7.mask.nxv8f32(
+    <vscale x 8 x float> %1,
+    <vscale x 8 x float> %2,
+    <vscale x 8 x i1> %0,
+    i32 %3)
+
+  ret <vscale x 8 x float> %a
+}
+
+declare <vscale x 16 x float> @llvm.riscv.vfrsqrt7.nxv16f32(
+  <vscale x 16 x float>,
+  i32);
+
+define <vscale x 16 x float> @intrinsic_vfrsqrt7_v_nxv16f32_nxv16f32(<vscale x 16 x float> %0, i32 %1) nounwind {
+; CHECK-LABEL: intrinsic_vfrsqrt7_v_nxv16f32_nxv16f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, a0, e32,m8,ta,mu
+; CHECK-NEXT:    vfrsqrt7.v v8, v8
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 16 x float> @llvm.riscv.vfrsqrt7.nxv16f32(
+    <vscale x 16 x float> %0,
+    i32 %1)
+
+  ret <vscale x 16 x float> %a
+}
+
+declare <vscale x 16 x float> @llvm.riscv.vfrsqrt7.mask.nxv16f32(
+  <vscale x 16 x float>,
+  <vscale x 16 x float>,
+  <vscale x 16 x i1>,
+  i32);
+
+define <vscale x 16 x float> @intrinsic_vfrsqrt7_mask_v_nxv16f32_nxv16f32(<vscale x 16 x i1> %0, <vscale x 16 x float> %1, <vscale x 16 x float> %2, i32 %3) nounwind {
+; CHECK-LABEL: intrinsic_vfrsqrt7_mask_v_nxv16f32_nxv16f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, a0, e32,m8,tu,mu
+; CHECK-NEXT:    vfrsqrt7.v v8, v16, v0.t
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 16 x float> @llvm.riscv.vfrsqrt7.mask.nxv16f32(
+    <vscale x 16 x float> %1,
+    <vscale x 16 x float> %2,
+    <vscale x 16 x i1> %0,
+    i32 %3)
+
+  ret <vscale x 16 x float> %a
+}
+
+declare <vscale x 1 x double> @llvm.riscv.vfrsqrt7.nxv1f64(
+  <vscale x 1 x double>,
+  i32);
+
+define <vscale x 1 x double> @intrinsic_vfrsqrt7_v_nxv1f64_nxv1f64(<vscale x 1 x double> %0, i32 %1) nounwind {
+; CHECK-LABEL: intrinsic_vfrsqrt7_v_nxv1f64_nxv1f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, a0, e64,m1,ta,mu
+; CHECK-NEXT:    vfrsqrt7.v v8, v8
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 1 x double> @llvm.riscv.vfrsqrt7.nxv1f64(
+    <vscale x 1 x double> %0,
+    i32 %1)
+
+  ret <vscale x 1 x double> %a
+}
+
+declare <vscale x 1 x double> @llvm.riscv.vfrsqrt7.mask.nxv1f64(
+  <vscale x 1 x double>,
+  <vscale x 1 x double>,
+  <vscale x 1 x i1>,
+  i32);
+
+define <vscale x 1 x double> @intrinsic_vfrsqrt7_mask_v_nxv1f64_nxv1f64(<vscale x 1 x i1> %0, <vscale x 1 x double> %1, <vscale x 1 x double> %2, i32 %3) nounwind {
+; CHECK-LABEL: intrinsic_vfrsqrt7_mask_v_nxv1f64_nxv1f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, a0, e64,m1,tu,mu
+; CHECK-NEXT:    vfrsqrt7.v v8, v9, v0.t
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 1 x double> @llvm.riscv.vfrsqrt7.mask.nxv1f64(
+    <vscale x 1 x double> %1,
+    <vscale x 1 x double> %2,
+    <vscale x 1 x i1> %0,
+    i32 %3)
+
+  ret <vscale x 1 x double> %a
+}
+
+declare <vscale x 2 x double> @llvm.riscv.vfrsqrt7.nxv2f64(
+  <vscale x 2 x double>,
+  i32);
+
+define <vscale x 2 x double> @intrinsic_vfrsqrt7_v_nxv2f64_nxv2f64(<vscale x 2 x double> %0, i32 %1) nounwind {
+; CHECK-LABEL: intrinsic_vfrsqrt7_v_nxv2f64_nxv2f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, a0, e64,m2,ta,mu
+; CHECK-NEXT:    vfrsqrt7.v v8, v8
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 2 x double> @llvm.riscv.vfrsqrt7.nxv2f64(
+    <vscale x 2 x double> %0,
+    i32 %1)
+
+  ret <vscale x 2 x double> %a
+}
+
+declare <vscale x 2 x double> @llvm.riscv.vfrsqrt7.mask.nxv2f64(
+  <vscale x 2 x double>,
+  <vscale x 2 x double>,
+  <vscale x 2 x i1>,
+  i32);
+
+define <vscale x 2 x double> @intrinsic_vfrsqrt7_mask_v_nxv2f64_nxv2f64(<vscale x 2 x i1> %0, <vscale x 2 x double> %1, <vscale x 2 x double> %2, i32 %3) nounwind {
+; CHECK-LABEL: intrinsic_vfrsqrt7_mask_v_nxv2f64_nxv2f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, a0, e64,m2,tu,mu
+; CHECK-NEXT:    vfrsqrt7.v v8, v10, v0.t
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 2 x double> @llvm.riscv.vfrsqrt7.mask.nxv2f64(
+    <vscale x 2 x double> %1,
+    <vscale x 2 x double> %2,
+    <vscale x 2 x i1> %0,
+    i32 %3)
+
+  ret <vscale x 2 x double> %a
+}
+
+declare <vscale x 4 x double> @llvm.riscv.vfrsqrt7.nxv4f64(
+  <vscale x 4 x double>,
+  i32);
+
+define <vscale x 4 x double> @intrinsic_vfrsqrt7_v_nxv4f64_nxv4f64(<vscale x 4 x double> %0, i32 %1) nounwind {
+; CHECK-LABEL: intrinsic_vfrsqrt7_v_nxv4f64_nxv4f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, a0, e64,m4,ta,mu
+; CHECK-NEXT:    vfrsqrt7.v v8, v8
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 4 x double> @llvm.riscv.vfrsqrt7.nxv4f64(
+    <vscale x 4 x double> %0,
+    i32 %1)
+
+  ret <vscale x 4 x double> %a
+}
+
+declare <vscale x 4 x double> @llvm.riscv.vfrsqrt7.mask.nxv4f64(
+  <vscale x 4 x double>,
+  <vscale x 4 x double>,
+  <vscale x 4 x i1>,
+  i32);
+
+define <vscale x 4 x double> @intrinsic_vfrsqrt7_mask_v_nxv4f64_nxv4f64(<vscale x 4 x i1> %0, <vscale x 4 x double> %1, <vscale x 4 x double> %2, i32 %3) nounwind {
+; CHECK-LABEL: intrinsic_vfrsqrt7_mask_v_nxv4f64_nxv4f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, a0, e64,m4,tu,mu
+; CHECK-NEXT:    vfrsqrt7.v v8, v12, v0.t
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 4 x double> @llvm.riscv.vfrsqrt7.mask.nxv4f64(
+    <vscale x 4 x double> %1,
+    <vscale x 4 x double> %2,
+    <vscale x 4 x i1> %0,
+    i32 %3)
+
+  ret <vscale x 4 x double> %a
+}
+
+declare <vscale x 8 x double> @llvm.riscv.vfrsqrt7.nxv8f64(
+  <vscale x 8 x double>,
+  i32);
+
+define <vscale x 8 x double> @intrinsic_vfrsqrt7_v_nxv8f64_nxv8f64(<vscale x 8 x double> %0, i32 %1) nounwind {
+; CHECK-LABEL: intrinsic_vfrsqrt7_v_nxv8f64_nxv8f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, a0, e64,m8,ta,mu
+; CHECK-NEXT:    vfrsqrt7.v v8, v8
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 8 x double> @llvm.riscv.vfrsqrt7.nxv8f64(
+    <vscale x 8 x double> %0,
+    i32 %1)
+
+  ret <vscale x 8 x double> %a
+}
+
+declare <vscale x 8 x double> @llvm.riscv.vfrsqrt7.mask.nxv8f64(
+  <vscale x 8 x double>,
+  <vscale x 8 x double>,
+  <vscale x 8 x i1>,
+  i32);
+
+define <vscale x 8 x double> @intrinsic_vfrsqrt7_mask_v_nxv8f64_nxv8f64(<vscale x 8 x i1> %0, <vscale x 8 x double> %1, <vscale x 8 x double> %2, i32 %3) nounwind {
+; CHECK-LABEL: intrinsic_vfrsqrt7_mask_v_nxv8f64_nxv8f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, a0, e64,m8,tu,mu
+; CHECK-NEXT:    vfrsqrt7.v v8, v16, v0.t
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 8 x double> @llvm.riscv.vfrsqrt7.mask.nxv8f64(
+    <vscale x 8 x double> %1,
+    <vscale x 8 x double> %2,
+    <vscale x 8 x i1> %0,
+    i32 %3)
+
+  ret <vscale x 8 x double> %a
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfrsqrt7-rv64.ll b/llvm/test/CodeGen/RISCV/rvv/vfrsqrt7-rv64.ll
new file mode 100644
index 000000000000..3ea0f1d9eb6a
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/vfrsqrt7-rv64.ll
@@ -0,0 +1,602 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv64 -mattr=+experimental-v,+d,+experimental-zfh -verify-machineinstrs \
+; RUN:   --riscv-no-aliases < %s | FileCheck %s
+declare <vscale x 1 x half> @llvm.riscv.vfrsqrt7.nxv1f16(
+  <vscale x 1 x half>,
+  i64);
+
+define <vscale x 1 x half> @intrinsic_vfrsqrt7_v_nxv1f16_nxv1f16(<vscale x 1 x half> %0, i64 %1) nounwind {
+; CHECK-LABEL: intrinsic_vfrsqrt7_v_nxv1f16_nxv1f16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, a0, e16,mf4,ta,mu
+; CHECK-NEXT:    vfrsqrt7.v v8, v8
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 1 x half> @llvm.riscv.vfrsqrt7.nxv1f16(
+    <vscale x 1 x half> %0,
+    i64 %1)
+
+  ret <vscale x 1 x half> %a
+}
+
+declare <vscale x 1 x half> @llvm.riscv.vfrsqrt7.mask.nxv1f16(
+  <vscale x 1 x half>,
+  <vscale x 1 x half>,
+  <vscale x 1 x i1>,
+  i64);
+
+define <vscale x 1 x half> @intrinsic_vfrsqrt7_mask_v_nxv1f16_nxv1f16(<vscale x 1 x i1> %0, <vscale x 1 x half> %1, <vscale x 1 x half> %2, i64 %3) nounwind {
+; CHECK-LABEL: intrinsic_vfrsqrt7_mask_v_nxv1f16_nxv1f16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, a0, e16,mf4,tu,mu
+; CHECK-NEXT:    vfrsqrt7.v v8, v9, v0.t
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 1 x half> @llvm.riscv.vfrsqrt7.mask.nxv1f16(
+    <vscale x 1 x half> %1,
+    <vscale x 1 x half> %2,
+    <vscale x 1 x i1> %0,
+    i64 %3)
+
+  ret <vscale x 1 x half> %a
+}
+
+declare <vscale x 2 x half> @llvm.riscv.vfrsqrt7.nxv2f16(
+  <vscale x 2 x half>,
+  i64);
+
+define <vscale x 2 x half> @intrinsic_vfrsqrt7_v_nxv2f16_nxv2f16(<vscale x 2 x half> %0, i64 %1) nounwind {
+; CHECK-LABEL: intrinsic_vfrsqrt7_v_nxv2f16_nxv2f16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, a0, e16,mf2,ta,mu
+; CHECK-NEXT:    vfrsqrt7.v v8, v8
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 2 x half> @llvm.riscv.vfrsqrt7.nxv2f16(
+    <vscale x 2 x half> %0,
+    i64 %1)
+
+  ret <vscale x 2 x half> %a
+}
+
+declare <vscale x 2 x half> @llvm.riscv.vfrsqrt7.mask.nxv2f16(
+  <vscale x 2 x half>,
+  <vscale x 2 x half>,
+  <vscale x 2 x i1>,
+  i64);
+
+define <vscale x 2 x half> @intrinsic_vfrsqrt7_mask_v_nxv2f16_nxv2f16(<vscale x 2 x i1> %0, <vscale x 2 x half> %1, <vscale x 2 x half> %2, i64 %3) nounwind {
+; CHECK-LABEL: intrinsic_vfrsqrt7_mask_v_nxv2f16_nxv2f16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, a0, e16,mf2,tu,mu
+; CHECK-NEXT:    vfrsqrt7.v v8, v9, v0.t
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 2 x half> @llvm.riscv.vfrsqrt7.mask.nxv2f16(
+    <vscale x 2 x half> %1,
+    <vscale x 2 x half> %2,
+    <vscale x 2 x i1> %0,
+    i64 %3)
+
+  ret <vscale x 2 x half> %a
+}
+
+declare <vscale x 4 x half> @llvm.riscv.vfrsqrt7.nxv4f16(
+  <vscale x 4 x half>,
+  i64);
+
+define <vscale x 4 x half> @intrinsic_vfrsqrt7_v_nxv4f16_nxv4f16(<vscale x 4 x half> %0, i64 %1) nounwind {
+; CHECK-LABEL: intrinsic_vfrsqrt7_v_nxv4f16_nxv4f16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, a0, e16,m1,ta,mu
+; CHECK-NEXT:    vfrsqrt7.v v8, v8
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 4 x half> @llvm.riscv.vfrsqrt7.nxv4f16(
+    <vscale x 4 x half> %0,
+    i64 %1)
+
+  ret <vscale x 4 x half> %a
+}
+
+declare <vscale x 4 x half> @llvm.riscv.vfrsqrt7.mask.nxv4f16(
+  <vscale x 4 x half>,
+  <vscale x 4 x half>,
+  <vscale x 4 x i1>,
+  i64);
+
+define <vscale x 4 x half> @intrinsic_vfrsqrt7_mask_v_nxv4f16_nxv4f16(<vscale x 4 x i1> %0, <vscale x 4 x half> %1, <vscale x 4 x half> %2, i64 %3) nounwind {
+; CHECK-LABEL: intrinsic_vfrsqrt7_mask_v_nxv4f16_nxv4f16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, a0, e16,m1,tu,mu
+; CHECK-NEXT:    vfrsqrt7.v v8, v9, v0.t
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 4 x half> @llvm.riscv.vfrsqrt7.mask.nxv4f16(
+    <vscale x 4 x half> %1,
+    <vscale x 4 x half> %2,
+    <vscale x 4 x i1> %0,
+    i64 %3)
+
+  ret <vscale x 4 x half> %a
+}
+
+declare <vscale x 8 x half> @llvm.riscv.vfrsqrt7.nxv8f16(
+  <vscale x 8 x half>,
+  i64);
+
+define <vscale x 8 x half> @intrinsic_vfrsqrt7_v_nxv8f16_nxv8f16(<vscale x 8 x half> %0, i64 %1) nounwind {
+; CHECK-LABEL: intrinsic_vfrsqrt7_v_nxv8f16_nxv8f16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, a0, e16,m2,ta,mu
+; CHECK-NEXT:    vfrsqrt7.v v8, v8
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 8 x half> @llvm.riscv.vfrsqrt7.nxv8f16(
+    <vscale x 8 x half> %0,
+    i64 %1)
+
+  ret <vscale x 8 x half> %a
+}
+
+declare <vscale x 8 x half> @llvm.riscv.vfrsqrt7.mask.nxv8f16(
+  <vscale x 8 x half>,
+  <vscale x 8 x half>,
+  <vscale x 8 x i1>,
+  i64);
+
+define <vscale x 8 x half> @intrinsic_vfrsqrt7_mask_v_nxv8f16_nxv8f16(<vscale x 8 x i1> %0, <vscale x 8 x half> %1, <vscale x 8 x half> %2, i64 %3) nounwind {
+; CHECK-LABEL: intrinsic_vfrsqrt7_mask_v_nxv8f16_nxv8f16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, a0, e16,m2,tu,mu
+; CHECK-NEXT:    vfrsqrt7.v v8, v10, v0.t
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 8 x half> @llvm.riscv.vfrsqrt7.mask.nxv8f16(
+    <vscale x 8 x half> %1,
+    <vscale x 8 x half> %2,
+    <vscale x 8 x i1> %0,
+    i64 %3)
+
+  ret <vscale x 8 x half> %a
+}
+
+declare <vscale x 16 x half> @llvm.riscv.vfrsqrt7.nxv16f16(
+  <vscale x 16 x half>,
+  i64);
+
+define <vscale x 16 x half> @intrinsic_vfrsqrt7_v_nxv16f16_nxv16f16(<vscale x 16 x half> %0, i64 %1) nounwind {
+; CHECK-LABEL: intrinsic_vfrsqrt7_v_nxv16f16_nxv16f16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, a0, e16,m4,ta,mu
+; CHECK-NEXT:    vfrsqrt7.v v8, v8
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 16 x half> @llvm.riscv.vfrsqrt7.nxv16f16(
+    <vscale x 16 x half> %0,
+    i64 %1)
+
+  ret <vscale x 16 x half> %a
+}
+
+declare <vscale x 16 x half> @llvm.riscv.vfrsqrt7.mask.nxv16f16(
+  <vscale x 16 x half>,
+  <vscale x 16 x half>,
+  <vscale x 16 x i1>,
+  i64);
+
+define <vscale x 16 x half> @intrinsic_vfrsqrt7_mask_v_nxv16f16_nxv16f16(<vscale x 16 x i1> %0, <vscale x 16 x half> %1, <vscale x 16 x half> %2, i64 %3) nounwind {
+; CHECK-LABEL: intrinsic_vfrsqrt7_mask_v_nxv16f16_nxv16f16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, a0, e16,m4,tu,mu
+; CHECK-NEXT:    vfrsqrt7.v v8, v12, v0.t
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 16 x half> @llvm.riscv.vfrsqrt7.mask.nxv16f16(
+    <vscale x 16 x half> %1,
+    <vscale x 16 x half> %2,
+    <vscale x 16 x i1> %0,
+    i64 %3)
+
+  ret <vscale x 16 x half> %a
+}
+
+declare <vscale x 32 x half> @llvm.riscv.vfrsqrt7.nxv32f16(
+  <vscale x 32 x half>,
+  i64);
+
+define <vscale x 32 x half> @intrinsic_vfrsqrt7_v_nxv32f16_nxv32f16(<vscale x 32 x half> %0, i64 %1) nounwind {
+; CHECK-LABEL: intrinsic_vfrsqrt7_v_nxv32f16_nxv32f16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, a0, e16,m8,ta,mu
+; CHECK-NEXT:    vfrsqrt7.v v8, v8
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 32 x half> @llvm.riscv.vfrsqrt7.nxv32f16(
+    <vscale x 32 x half> %0,
+    i64 %1)
+
+  ret <vscale x 32 x half> %a
+}
+
+declare <vscale x 32 x half> @llvm.riscv.vfrsqrt7.mask.nxv32f16(
+  <vscale x 32 x half>,
+  <vscale x 32 x half>,
+  <vscale x 32 x i1>,
+  i64);
+
+define <vscale x 32 x half> @intrinsic_vfrsqrt7_mask_v_nxv32f16_nxv32f16(<vscale x 32 x i1> %0, <vscale x 32 x half> %1, <vscale x 32 x half> %2, i64 %3) nounwind {
+; CHECK-LABEL: intrinsic_vfrsqrt7_mask_v_nxv32f16_nxv32f16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, a0, e16,m8,tu,mu
+; CHECK-NEXT:    vfrsqrt7.v v8, v16, v0.t
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 32 x half> @llvm.riscv.vfrsqrt7.mask.nxv32f16(
+    <vscale x 32 x half> %1,
+    <vscale x 32 x half> %2,
+    <vscale x 32 x i1> %0,
+    i64 %3)
+
+  ret <vscale x 32 x half> %a
+}
+
+declare <vscale x 1 x float> @llvm.riscv.vfrsqrt7.nxv1f32(
+  <vscale x 1 x float>,
+  i64);
+
+define <vscale x 1 x float> @intrinsic_vfrsqrt7_v_nxv1f32_nxv1f32(<vscale x 1 x float> %0, i64 %1) nounwind {
+; CHECK-LABEL: intrinsic_vfrsqrt7_v_nxv1f32_nxv1f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, a0, e32,mf2,ta,mu
+; CHECK-NEXT:    vfrsqrt7.v v8, v8
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 1 x float> @llvm.riscv.vfrsqrt7.nxv1f32(
+    <vscale x 1 x float> %0,
+    i64 %1)
+
+  ret <vscale x 1 x float> %a
+}
+
+declare <vscale x 1 x float> @llvm.riscv.vfrsqrt7.mask.nxv1f32(
+  <vscale x 1 x float>,
+  <vscale x 1 x float>,
+  <vscale x 1 x i1>,
+  i64);
+
+define <vscale x 1 x float> @intrinsic_vfrsqrt7_mask_v_nxv1f32_nxv1f32(<vscale x 1 x i1> %0, <vscale x 1 x float> %1, <vscale x 1 x float> %2, i64 %3) nounwind {
+; CHECK-LABEL: intrinsic_vfrsqrt7_mask_v_nxv1f32_nxv1f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, a0, e32,mf2,tu,mu
+; CHECK-NEXT:    vfrsqrt7.v v8, v9, v0.t
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 1 x float> @llvm.riscv.vfrsqrt7.mask.nxv1f32(
+    <vscale x 1 x float> %1,
+    <vscale x 1 x float> %2,
+    <vscale x 1 x i1> %0,
+    i64 %3)
+
+  ret <vscale x 1 x float> %a
+}
+
+declare <vscale x 2 x float> @llvm.riscv.vfrsqrt7.nxv2f32(
+  <vscale x 2 x float>,
+  i64);
+
+define <vscale x 2 x float> @intrinsic_vfrsqrt7_v_nxv2f32_nxv2f32(<vscale x 2 x float> %0, i64 %1) nounwind {
+; CHECK-LABEL: intrinsic_vfrsqrt7_v_nxv2f32_nxv2f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, a0, e32,m1,ta,mu
+; CHECK-NEXT:    vfrsqrt7.v v8, v8
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 2 x float> @llvm.riscv.vfrsqrt7.nxv2f32(
+    <vscale x 2 x float> %0,
+    i64 %1)
+
+  ret <vscale x 2 x float> %a
+}
+
+declare <vscale x 2 x float> @llvm.riscv.vfrsqrt7.mask.nxv2f32(
+  <vscale x 2 x float>,
+  <vscale x 2 x float>,
+  <vscale x 2 x i1>,
+  i64);
+
+define <vscale x 2 x float> @intrinsic_vfrsqrt7_mask_v_nxv2f32_nxv2f32(<vscale x 2 x i1> %0, <vscale x 2 x float> %1, <vscale x 2 x float> %2, i64 %3) nounwind {
+; CHECK-LABEL: intrinsic_vfrsqrt7_mask_v_nxv2f32_nxv2f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, a0, e32,m1,tu,mu
+; CHECK-NEXT:    vfrsqrt7.v v8, v9, v0.t
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 2 x float> @llvm.riscv.vfrsqrt7.mask.nxv2f32(
+    <vscale x 2 x float> %1,
+    <vscale x 2 x float> %2,
+    <vscale x 2 x i1> %0,
+    i64 %3)
+
+  ret <vscale x 2 x float> %a
+}
+
+declare <vscale x 4 x float> @llvm.riscv.vfrsqrt7.nxv4f32(
+  <vscale x 4 x float>,
+  i64);
+
+define <vscale x 4 x float> @intrinsic_vfrsqrt7_v_nxv4f32_nxv4f32(<vscale x 4 x float> %0, i64 %1) nounwind {
+; CHECK-LABEL: intrinsic_vfrsqrt7_v_nxv4f32_nxv4f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, a0, e32,m2,ta,mu
+; CHECK-NEXT:    vfrsqrt7.v v8, v8
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 4 x float> @llvm.riscv.vfrsqrt7.nxv4f32(
+    <vscale x 4 x float> %0,
+    i64 %1)
+
+  ret <vscale x 4 x float> %a
+}
+
+declare <vscale x 4 x float> @llvm.riscv.vfrsqrt7.mask.nxv4f32(
+  <vscale x 4 x float>,
+  <vscale x 4 x float>,
+  <vscale x 4 x i1>,
+  i64);
+
+define <vscale x 4 x float> @intrinsic_vfrsqrt7_mask_v_nxv4f32_nxv4f32(<vscale x 4 x i1> %0, <vscale x 4 x float> %1, <vscale x 4 x float> %2, i64 %3) nounwind {
+; CHECK-LABEL: intrinsic_vfrsqrt7_mask_v_nxv4f32_nxv4f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, a0, e32,m2,tu,mu
+; CHECK-NEXT:    vfrsqrt7.v v8, v10, v0.t
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 4 x float> @llvm.riscv.vfrsqrt7.mask.nxv4f32(
+    <vscale x 4 x float> %1,
+    <vscale x 4 x float> %2,
+    <vscale x 4 x i1> %0,
+    i64 %3)
+
+  ret <vscale x 4 x float> %a
+}
+
+declare <vscale x 8 x float> @llvm.riscv.vfrsqrt7.nxv8f32(
+  <vscale x 8 x float>,
+  i64);
+
+define <vscale x 8 x float> @intrinsic_vfrsqrt7_v_nxv8f32_nxv8f32(<vscale x 8 x float> %0, i64 %1) nounwind {
+; CHECK-LABEL: intrinsic_vfrsqrt7_v_nxv8f32_nxv8f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, a0, e32,m4,ta,mu
+; CHECK-NEXT:    vfrsqrt7.v v8, v8
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 8 x float> @llvm.riscv.vfrsqrt7.nxv8f32(
+    <vscale x 8 x float> %0,
+    i64 %1)
+
+  ret <vscale x 8 x float> %a
+}
+
+declare <vscale x 8 x float> @llvm.riscv.vfrsqrt7.mask.nxv8f32(
+  <vscale x 8 x float>,
+  <vscale x 8 x float>,
+  <vscale x 8 x i1>,
+  i64);
+
+define <vscale x 8 x float> @intrinsic_vfrsqrt7_mask_v_nxv8f32_nxv8f32(<vscale x 8 x i1> %0, <vscale x 8 x float> %1, <vscale x 8 x float> %2, i64 %3) nounwind {
+; CHECK-LABEL: intrinsic_vfrsqrt7_mask_v_nxv8f32_nxv8f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, a0, e32,m4,tu,mu
+; CHECK-NEXT:    vfrsqrt7.v v8, v12, v0.t
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 8 x float> @llvm.riscv.vfrsqrt7.mask.nxv8f32(
+    <vscale x 8 x float> %1,
+    <vscale x 8 x float> %2,
+    <vscale x 8 x i1> %0,
+    i64 %3)
+
+  ret <vscale x 8 x float> %a
+}
+
+declare <vscale x 16 x float> @llvm.riscv.vfrsqrt7.nxv16f32(
+  <vscale x 16 x float>,
+  i64);
+
+define <vscale x 16 x float> @intrinsic_vfrsqrt7_v_nxv16f32_nxv16f32(<vscale x 16 x float> %0, i64 %1) nounwind {
+; CHECK-LABEL: intrinsic_vfrsqrt7_v_nxv16f32_nxv16f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, a0, e32,m8,ta,mu
+; CHECK-NEXT:    vfrsqrt7.v v8, v8
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 16 x float> @llvm.riscv.vfrsqrt7.nxv16f32(
+    <vscale x 16 x float> %0,
+    i64 %1)
+
+  ret <vscale x 16 x float> %a
+}
+
+declare <vscale x 16 x float> @llvm.riscv.vfrsqrt7.mask.nxv16f32(
+  <vscale x 16 x float>,
+  <vscale x 16 x float>,
+  <vscale x 16 x i1>,
+  i64);
+
+define <vscale x 16 x float> @intrinsic_vfrsqrt7_mask_v_nxv16f32_nxv16f32(<vscale x 16 x i1> %0, <vscale x 16 x float> %1, <vscale x 16 x float> %2, i64 %3) nounwind {
+; CHECK-LABEL: intrinsic_vfrsqrt7_mask_v_nxv16f32_nxv16f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, a0, e32,m8,tu,mu
+; CHECK-NEXT:    vfrsqrt7.v v8, v16, v0.t
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 16 x float> @llvm.riscv.vfrsqrt7.mask.nxv16f32(
+    <vscale x 16 x float> %1,
+    <vscale x 16 x float> %2,
+    <vscale x 16 x i1> %0,
+    i64 %3)
+
+  ret <vscale x 16 x float> %a
+}
+
+declare <vscale x 1 x double> @llvm.riscv.vfrsqrt7.nxv1f64(
+  <vscale x 1 x double>,
+  i64);
+
+define <vscale x 1 x double> @intrinsic_vfrsqrt7_v_nxv1f64_nxv1f64(<vscale x 1 x double> %0, i64 %1) nounwind {
+; CHECK-LABEL: intrinsic_vfrsqrt7_v_nxv1f64_nxv1f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, a0, e64,m1,ta,mu
+; CHECK-NEXT:    vfrsqrt7.v v8, v8
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 1 x double> @llvm.riscv.vfrsqrt7.nxv1f64(
+    <vscale x 1 x double> %0,
+    i64 %1)
+
+  ret <vscale x 1 x double> %a
+}
+
+declare <vscale x 1 x double> @llvm.riscv.vfrsqrt7.mask.nxv1f64(
+  <vscale x 1 x double>,
+  <vscale x 1 x double>,
+  <vscale x 1 x i1>,
+  i64);
+
+define <vscale x 1 x double> @intrinsic_vfrsqrt7_mask_v_nxv1f64_nxv1f64(<vscale x 1 x i1> %0, <vscale x 1 x double> %1, <vscale x 1 x double> %2, i64 %3) nounwind {
+; CHECK-LABEL: intrinsic_vfrsqrt7_mask_v_nxv1f64_nxv1f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, a0, e64,m1,tu,mu
+; CHECK-NEXT:    vfrsqrt7.v v8, v9, v0.t
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 1 x double> @llvm.riscv.vfrsqrt7.mask.nxv1f64(
+    <vscale x 1 x double> %1,
+    <vscale x 1 x double> %2,
+    <vscale x 1 x i1> %0,
+    i64 %3)
+
+  ret <vscale x 1 x double> %a
+}
+
+declare <vscale x 2 x double> @llvm.riscv.vfrsqrt7.nxv2f64(
+  <vscale x 2 x double>,
+  i64);
+
+define <vscale x 2 x double> @intrinsic_vfrsqrt7_v_nxv2f64_nxv2f64(<vscale x 2 x double> %0, i64 %1) nounwind {
+; CHECK-LABEL: intrinsic_vfrsqrt7_v_nxv2f64_nxv2f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, a0, e64,m2,ta,mu
+; CHECK-NEXT:    vfrsqrt7.v v8, v8
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 2 x double> @llvm.riscv.vfrsqrt7.nxv2f64(
+    <vscale x 2 x double> %0,
+    i64 %1)
+
+  ret <vscale x 2 x double> %a
+}
+
+declare <vscale x 2 x double> @llvm.riscv.vfrsqrt7.mask.nxv2f64(
+  <vscale x 2 x double>,
+  <vscale x 2 x double>,
+  <vscale x 2 x i1>,
+  i64);
+
+define <vscale x 2 x double> @intrinsic_vfrsqrt7_mask_v_nxv2f64_nxv2f64(<vscale x 2 x i1> %0, <vscale x 2 x double> %1, <vscale x 2 x double> %2, i64 %3) nounwind {
+; CHECK-LABEL: intrinsic_vfrsqrt7_mask_v_nxv2f64_nxv2f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, a0, e64,m2,tu,mu
+; CHECK-NEXT:    vfrsqrt7.v v8, v10, v0.t
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 2 x double> @llvm.riscv.vfrsqrt7.mask.nxv2f64(
+    <vscale x 2 x double> %1,
+    <vscale x 2 x double> %2,
+    <vscale x 2 x i1> %0,
+    i64 %3)
+
+  ret <vscale x 2 x double> %a
+}
+
+declare <vscale x 4 x double> @llvm.riscv.vfrsqrt7.nxv4f64(
+  <vscale x 4 x double>,
+  i64);
+
+define <vscale x 4 x double> @intrinsic_vfrsqrt7_v_nxv4f64_nxv4f64(<vscale x 4 x double> %0, i64 %1) nounwind {
+; CHECK-LABEL: intrinsic_vfrsqrt7_v_nxv4f64_nxv4f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, a0, e64,m4,ta,mu
+; CHECK-NEXT:    vfrsqrt7.v v8, v8
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 4 x double> @llvm.riscv.vfrsqrt7.nxv4f64(
+    <vscale x 4 x double> %0,
+    i64 %1)
+
+  ret <vscale x 4 x double> %a
+}
+
+declare <vscale x 4 x double> @llvm.riscv.vfrsqrt7.mask.nxv4f64(
+  <vscale x 4 x double>,
+  <vscale x 4 x double>,
+  <vscale x 4 x i1>,
+  i64);
+
+define <vscale x 4 x double> @intrinsic_vfrsqrt7_mask_v_nxv4f64_nxv4f64(<vscale x 4 x i1> %0, <vscale x 4 x double> %1, <vscale x 4 x double> %2, i64 %3) nounwind {
+; CHECK-LABEL: intrinsic_vfrsqrt7_mask_v_nxv4f64_nxv4f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, a0, e64,m4,tu,mu
+; CHECK-NEXT:    vfrsqrt7.v v8, v12, v0.t
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 4 x double> @llvm.riscv.vfrsqrt7.mask.nxv4f64(
+    <vscale x 4 x double> %1,
+    <vscale x 4 x double> %2,
+    <vscale x 4 x i1> %0,
+    i64 %3)
+
+  ret <vscale x 4 x double> %a
+}
+
+declare <vscale x 8 x double> @llvm.riscv.vfrsqrt7.nxv8f64(
+  <vscale x 8 x double>,
+  i64);
+
+define <vscale x 8 x double> @intrinsic_vfrsqrt7_v_nxv8f64_nxv8f64(<vscale x 8 x double> %0, i64 %1) nounwind {
+; CHECK-LABEL: intrinsic_vfrsqrt7_v_nxv8f64_nxv8f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, a0, e64,m8,ta,mu
+; CHECK-NEXT:    vfrsqrt7.v v8, v8
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 8 x double> @llvm.riscv.vfrsqrt7.nxv8f64(
+    <vscale x 8 x double> %0,
+    i64 %1)
+
+  ret <vscale x 8 x double> %a
+}
+
+declare <vscale x 8 x double> @llvm.riscv.vfrsqrt7.mask.nxv8f64(
+  <vscale x 8 x double>,
+  <vscale x 8 x double>,
+  <vscale x 8 x i1>,
+  i64);
+
+define <vscale x 8 x double> @intrinsic_vfrsqrt7_mask_v_nxv8f64_nxv8f64(<vscale x 8 x i1> %0, <vscale x 8 x double> %1, <vscale x 8 x double> %2, i64 %3) nounwind {
+; CHECK-LABEL: intrinsic_vfrsqrt7_mask_v_nxv8f64_nxv8f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, a0, e64,m8,tu,mu
+; CHECK-NEXT:    vfrsqrt7.v v8, v16, v0.t
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 8 x double> @llvm.riscv.vfrsqrt7.mask.nxv8f64(
+    <vscale x 8 x double> %1,
+    <vscale x 8 x double> %2,
+    <vscale x 8 x i1> %0,
+    i64 %3)
+
+  ret <vscale x 8 x double> %a
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vle1-rv32.ll b/llvm/test/CodeGen/RISCV/rvv/vle1-rv32.ll
new file mode 100644
index 000000000000..f7040f7885d5
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/vle1-rv32.ll
@@ -0,0 +1,94 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv32 -mattr=+experimental-v -verify-machineinstrs \
+; RUN:   --riscv-no-aliases < %s | FileCheck %s
+
+declare <vscale x 1 x i1> @llvm.riscv.vle1.nxv1i1(<vscale x 1 x i1>*, i32);
+
+define <vscale x 1 x i1> @intrinsic_vle1_v_nxv1i1(<vscale x 1 x i1>* %0, i32 %1) nounwind {
+; CHECK-LABEL: intrinsic_vle1_v_nxv1i1:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a1, a1, e8,mf8,ta,mu
+; CHECK-NEXT:    vle1.v v0, (a0)
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 1 x i1> @llvm.riscv.vle1.nxv1i1(<vscale x 1 x i1>* %0, i32 %1)
+  ret <vscale x 1 x i1> %a
+}
+
+declare <vscale x 2 x i1> @llvm.riscv.vle1.nxv2i1(<vscale x 2 x i1>*, i32);
+
+define <vscale x 2 x i1> @intrinsic_vle1_v_nxv2i1(<vscale x 2 x i1>* %0, i32 %1) nounwind {
+; CHECK-LABEL: intrinsic_vle1_v_nxv2i1:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a1, a1, e8,mf4,ta,mu
+; CHECK-NEXT:    vle1.v v0, (a0)
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 2 x i1> @llvm.riscv.vle1.nxv2i1(<vscale x 2 x i1>* %0, i32 %1)
+  ret <vscale x 2 x i1> %a
+}
+
+declare <vscale x 4 x i1> @llvm.riscv.vle1.nxv4i1(<vscale x 4 x i1>*, i32);
+
+define <vscale x 4 x i1> @intrinsic_vle1_v_nxv4i1(<vscale x 4 x i1>* %0, i32 %1) nounwind {
+; CHECK-LABEL: intrinsic_vle1_v_nxv4i1:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a1, a1, e8,mf2,ta,mu
+; CHECK-NEXT:    vle1.v v0, (a0)
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 4 x i1> @llvm.riscv.vle1.nxv4i1(<vscale x 4 x i1>* %0, i32 %1)
+  ret <vscale x 4 x i1> %a
+}
+
+declare <vscale x 8 x i1> @llvm.riscv.vle1.nxv8i1(<vscale x 8 x i1>*, i32);
+
+define <vscale x 8 x i1> @intrinsic_vle1_v_nxv8i1(<vscale x 8 x i1>* %0, i32 %1) nounwind {
+; CHECK-LABEL: intrinsic_vle1_v_nxv8i1:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a1, a1, e8,m1,ta,mu
+; CHECK-NEXT:    vle1.v v0, (a0)
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 8 x i1> @llvm.riscv.vle1.nxv8i1(<vscale x 8 x i1>* %0, i32 %1)
+  ret <vscale x 8 x i1> %a
+}
+
+declare <vscale x 16 x i1> @llvm.riscv.vle1.nxv16i1(<vscale x 16 x i1>*, i32);
+
+define <vscale x 16 x i1> @intrinsic_vle1_v_nxv16i1(<vscale x 16 x i1>* %0, i32 %1) nounwind {
+; CHECK-LABEL: intrinsic_vle1_v_nxv16i1:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a1, a1, e8,m2,ta,mu
+; CHECK-NEXT:    vle1.v v0, (a0)
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 16 x i1> @llvm.riscv.vle1.nxv16i1(<vscale x 16 x i1>* %0, i32 %1)
+  ret <vscale x 16 x i1> %a
+}
+
+declare <vscale x 32 x i1> @llvm.riscv.vle1.nxv32i1(<vscale x 32 x i1>*, i32);
+
+define <vscale x 32 x i1> @intrinsic_vle1_v_nxv32i1(<vscale x 32 x i1>* %0, i32 %1) nounwind {
+; CHECK-LABEL: intrinsic_vle1_v_nxv32i1:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a1, a1, e8,m4,ta,mu
+; CHECK-NEXT:    vle1.v v0, (a0)
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 32 x i1> @llvm.riscv.vle1.nxv32i1(<vscale x 32 x i1>* %0, i32 %1)
+  ret <vscale x 32 x i1> %a
+}
+
+declare <vscale x 64 x i1> @llvm.riscv.vle1.nxv64i1(<vscale x 64 x i1>*, i32);
+
+define <vscale x 64 x i1> @intrinsic_vle1_v_nxv64i1(<vscale x 64 x i1>* %0, i32 %1) nounwind {
+; CHECK-LABEL: intrinsic_vle1_v_nxv64i1:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a1, a1, e8,m8,ta,mu
+; CHECK-NEXT:    vle1.v v0, (a0)
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 64 x i1> @llvm.riscv.vle1.nxv64i1(<vscale x 64 x i1>* %0, i32 %1)
+  ret <vscale x 64 x i1> %a
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vle1-rv64.ll b/llvm/test/CodeGen/RISCV/rvv/vle1-rv64.ll
new file mode 100644
index 000000000000..46c91f5f6b39
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/vle1-rv64.ll
@@ -0,0 +1,94 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv64 -mattr=+experimental-v -verify-machineinstrs \
+; RUN:   --riscv-no-aliases < %s | FileCheck %s
+
+declare <vscale x 1 x i1> @llvm.riscv.vle1.nxv1i1(<vscale x 1 x i1>*, i64);
+
+define <vscale x 1 x i1> @intrinsic_vle1_v_nxv1i1(<vscale x 1 x i1>* %0, i64 %1) nounwind {
+; CHECK-LABEL: intrinsic_vle1_v_nxv1i1:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a1, a1, e8,mf8,ta,mu
+; CHECK-NEXT:    vle1.v v0, (a0)
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 1 x i1> @llvm.riscv.vle1.nxv1i1(<vscale x 1 x i1>* %0, i64 %1)
+  ret <vscale x 1 x i1> %a
+}
+
+declare <vscale x 2 x i1> @llvm.riscv.vle1.nxv2i1(<vscale x 2 x i1>*, i64);
+
+define <vscale x 2 x i1> @intrinsic_vle1_v_nxv2i1(<vscale x 2 x i1>* %0, i64 %1) nounwind {
+; CHECK-LABEL: intrinsic_vle1_v_nxv2i1:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a1, a1, e8,mf4,ta,mu
+; CHECK-NEXT:    vle1.v v0, (a0)
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 2 x i1> @llvm.riscv.vle1.nxv2i1(<vscale x 2 x i1>* %0, i64 %1)
+  ret <vscale x 2 x i1> %a
+}
+
+declare <vscale x 4 x i1> @llvm.riscv.vle1.nxv4i1(<vscale x 4 x i1>*, i64);
+
+define <vscale x 4 x i1> @intrinsic_vle1_v_nxv4i1(<vscale x 4 x i1>* %0, i64 %1) nounwind {
+; CHECK-LABEL: intrinsic_vle1_v_nxv4i1:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a1, a1, e8,mf2,ta,mu
+; CHECK-NEXT:    vle1.v v0, (a0)
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 4 x i1> @llvm.riscv.vle1.nxv4i1(<vscale x 4 x i1>* %0, i64 %1)
+  ret <vscale x 4 x i1> %a
+}
+
+declare <vscale x 8 x i1> @llvm.riscv.vle1.nxv8i1(<vscale x 8 x i1>*, i64);
+
+define <vscale x 8 x i1> @intrinsic_vle1_v_nxv8i1(<vscale x 8 x i1>* %0, i64 %1) nounwind {
+; CHECK-LABEL: intrinsic_vle1_v_nxv8i1:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a1, a1, e8,m1,ta,mu
+; CHECK-NEXT:    vle1.v v0, (a0)
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 8 x i1> @llvm.riscv.vle1.nxv8i1(<vscale x 8 x i1>* %0, i64 %1)
+  ret <vscale x 8 x i1> %a
+}
+
+declare <vscale x 16 x i1> @llvm.riscv.vle1.nxv16i1(<vscale x 16 x i1>*, i64);
+
+define <vscale x 16 x i1> @intrinsic_vle1_v_nxv16i1(<vscale x 16 x i1>* %0, i64 %1) nounwind {
+; CHECK-LABEL: intrinsic_vle1_v_nxv16i1:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a1, a1, e8,m2,ta,mu
+; CHECK-NEXT:    vle1.v v0, (a0)
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 16 x i1> @llvm.riscv.vle1.nxv16i1(<vscale x 16 x i1>* %0, i64 %1)
+  ret <vscale x 16 x i1> %a
+}
+
+declare <vscale x 32 x i1> @llvm.riscv.vle1.nxv32i1(<vscale x 32 x i1>*, i64);
+
+define <vscale x 32 x i1> @intrinsic_vle1_v_nxv32i1(<vscale x 32 x i1>* %0, i64 %1) nounwind {
+; CHECK-LABEL: intrinsic_vle1_v_nxv32i1:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a1, a1, e8,m4,ta,mu
+; CHECK-NEXT:    vle1.v v0, (a0)
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 32 x i1> @llvm.riscv.vle1.nxv32i1(<vscale x 32 x i1>* %0, i64 %1)
+  ret <vscale x 32 x i1> %a
+}
+
+declare <vscale x 64 x i1> @llvm.riscv.vle1.nxv64i1(<vscale x 64 x i1>*, i64);
+
+define <vscale x 64 x i1> @intrinsic_vle1_v_nxv64i1(<vscale x 64 x i1>* %0, i64 %1) nounwind {
+; CHECK-LABEL: intrinsic_vle1_v_nxv64i1:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a1, a1, e8,m8,ta,mu
+; CHECK-NEXT:    vle1.v v0, (a0)
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  %a = call <vscale x 64 x i1> @llvm.riscv.vle1.nxv64i1(<vscale x 64 x i1>* %0, i64 %1)
+  ret <vscale x 64 x i1> %a
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vse1-rv32.ll b/llvm/test/CodeGen/RISCV/rvv/vse1-rv32.ll
new file mode 100644
index 000000000000..d94125d7e6ad
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/vse1-rv32.ll
@@ -0,0 +1,94 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv32 -mattr=+experimental-v -verify-machineinstrs \
+; RUN:   --riscv-no-aliases < %s | FileCheck %s
+
+declare void @llvm.riscv.vse1.nxv1i1(<vscale x 1 x i1>, <vscale x 1 x i1>*, i32);
+
+define void @intrinsic_vse1_v_nxv1i1(<vscale x 1 x i1> %0, <vscale x 1 x i1>* %1, i32 %2) nounwind {
+; CHECK-LABEL: intrinsic_vse1_v_nxv1i1:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a1, a1, e8,mf8,ta,mu
+; CHECK-NEXT:    vse1.v v0, (a0)
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  call void @llvm.riscv.vse1.nxv1i1(<vscale x 1 x i1> %0, <vscale x 1 x i1>* %1, i32 %2)
+  ret void
+}
+
+declare void @llvm.riscv.vse1.nxv2i1(<vscale x 2 x i1>, <vscale x 2 x i1>*, i32);
+
+define void @intrinsic_vse1_v_nxv2i1(<vscale x 2 x i1> %0, <vscale x 2 x i1>* %1, i32 %2) nounwind {
+; CHECK-LABEL: intrinsic_vse1_v_nxv2i1:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a1, a1, e8,mf4,ta,mu
+; CHECK-NEXT:    vse1.v v0, (a0)
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  call void @llvm.riscv.vse1.nxv2i1(<vscale x 2 x i1> %0, <vscale x 2 x i1>* %1, i32 %2)
+  ret void
+}
+
+declare void @llvm.riscv.vse1.nxv4i1(<vscale x 4 x i1>, <vscale x 4 x i1>*, i32);
+
+define void @intrinsic_vse1_v_nxv4i1(<vscale x 4 x i1> %0, <vscale x 4 x i1>* %1, i32 %2) nounwind {
+; CHECK-LABEL: intrinsic_vse1_v_nxv4i1:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a1, a1, e8,mf2,ta,mu
+; CHECK-NEXT:    vse1.v v0, (a0)
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  call void @llvm.riscv.vse1.nxv4i1(<vscale x 4 x i1> %0, <vscale x 4 x i1>* %1, i32 %2)
+  ret void
+}
+
+declare void @llvm.riscv.vse1.nxv8i1(<vscale x 8 x i1>, <vscale x 8 x i1>*, i32);
+
+define void @intrinsic_vse1_v_nxv8i1(<vscale x 8 x i1> %0, <vscale x 8 x i1>* %1, i32 %2) nounwind {
+; CHECK-LABEL: intrinsic_vse1_v_nxv8i1:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a1, a1, e8,m1,ta,mu
+; CHECK-NEXT:    vse1.v v0, (a0)
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  call void @llvm.riscv.vse1.nxv8i1(<vscale x 8 x i1> %0, <vscale x 8 x i1>* %1, i32 %2)
+  ret void
+}
+
+declare void @llvm.riscv.vse1.nxv16i1(<vscale x 16 x i1>, <vscale x 16 x i1>*, i32);
+
+define void @intrinsic_vse1_v_nxv16i1(<vscale x 16 x i1> %0, <vscale x 16 x i1>* %1, i32 %2) nounwind {
+; CHECK-LABEL: intrinsic_vse1_v_nxv16i1:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a1, a1, e8,m2,ta,mu
+; CHECK-NEXT:    vse1.v v0, (a0)
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  call void @llvm.riscv.vse1.nxv16i1(<vscale x 16 x i1> %0, <vscale x 16 x i1>* %1, i32 %2)
+  ret void
+}
+
+declare void @llvm.riscv.vse1.nxv32i1(<vscale x 32 x i1>, <vscale x 32 x i1>*, i32);
+
+define void @intrinsic_vse1_v_nxv32i1(<vscale x 32 x i1> %0, <vscale x 32 x i1>* %1, i32 %2) nounwind {
+; CHECK-LABEL: intrinsic_vse1_v_nxv32i1:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a1, a1, e8,m4,ta,mu
+; CHECK-NEXT:    vse1.v v0, (a0)
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  call void @llvm.riscv.vse1.nxv32i1(<vscale x 32 x i1> %0, <vscale x 32 x i1>* %1, i32 %2)
+  ret void
+}
+
+declare void @llvm.riscv.vse1.nxv64i1(<vscale x 64 x i1>, <vscale x 64 x i1>*, i32);
+
+define void @intrinsic_vse1_v_nxv64i1(<vscale x 64 x i1> %0, <vscale x 64 x i1>* %1, i32 %2) nounwind {
+; CHECK-LABEL: intrinsic_vse1_v_nxv64i1:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a1, a1, e8,m8,ta,mu
+; CHECK-NEXT:    vse1.v v0, (a0)
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  call void @llvm.riscv.vse1.nxv64i1(<vscale x 64 x i1> %0, <vscale x 64 x i1>* %1, i32 %2)
+  ret void
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vse1-rv64.ll b/llvm/test/CodeGen/RISCV/rvv/vse1-rv64.ll
new file mode 100644
index 000000000000..48d4585c01cd
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/vse1-rv64.ll
@@ -0,0 +1,94 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv64 -mattr=+experimental-v -verify-machineinstrs \
+; RUN:   --riscv-no-aliases < %s | FileCheck %s
+
+declare void @llvm.riscv.vse1.nxv1i1(<vscale x 1 x i1>, <vscale x 1 x i1>*, i64);
+
+define void @intrinsic_vse1_v_nxv1i1(<vscale x 1 x i1> %0, <vscale x 1 x i1>* %1, i64 %2) nounwind {
+; CHECK-LABEL: intrinsic_vse1_v_nxv1i1:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a1, a1, e8,mf8,ta,mu
+; CHECK-NEXT:    vse1.v v0, (a0)
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  call void @llvm.riscv.vse1.nxv1i1(<vscale x 1 x i1> %0, <vscale x 1 x i1>* %1, i64 %2)
+  ret void
+}
+
+declare void @llvm.riscv.vse1.nxv2i1(<vscale x 2 x i1>, <vscale x 2 x i1>*, i64);
+
+define void @intrinsic_vse1_v_nxv2i1(<vscale x 2 x i1> %0, <vscale x 2 x i1>* %1, i64 %2) nounwind {
+; CHECK-LABEL: intrinsic_vse1_v_nxv2i1:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a1, a1, e8,mf4,ta,mu
+; CHECK-NEXT:    vse1.v v0, (a0)
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  call void @llvm.riscv.vse1.nxv2i1(<vscale x 2 x i1> %0, <vscale x 2 x i1>* %1, i64 %2)
+  ret void
+}
+
+declare void @llvm.riscv.vse1.nxv4i1(<vscale x 4 x i1>, <vscale x 4 x i1>*, i64);
+
+define void @intrinsic_vse1_v_nxv4i1(<vscale x 4 x i1> %0, <vscale x 4 x i1>* %1, i64 %2) nounwind {
+; CHECK-LABEL: intrinsic_vse1_v_nxv4i1:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a1, a1, e8,mf2,ta,mu
+; CHECK-NEXT:    vse1.v v0, (a0)
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  call void @llvm.riscv.vse1.nxv4i1(<vscale x 4 x i1> %0, <vscale x 4 x i1>* %1, i64 %2)
+  ret void
+}
+
+declare void @llvm.riscv.vse1.nxv8i1(<vscale x 8 x i1>, <vscale x 8 x i1>*, i64);
+
+define void @intrinsic_vse1_v_nxv8i1(<vscale x 8 x i1> %0, <vscale x 8 x i1>* %1, i64 %2) nounwind {
+; CHECK-LABEL: intrinsic_vse1_v_nxv8i1:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a1, a1, e8,m1,ta,mu
+; CHECK-NEXT:    vse1.v v0, (a0)
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  call void @llvm.riscv.vse1.nxv8i1(<vscale x 8 x i1> %0, <vscale x 8 x i1>* %1, i64 %2)
+  ret void
+}
+
+declare void @llvm.riscv.vse1.nxv16i1(<vscale x 16 x i1>, <vscale x 16 x i1>*, i64);
+
+define void @intrinsic_vse1_v_nxv16i1(<vscale x 16 x i1> %0, <vscale x 16 x i1>* %1, i64 %2) nounwind {
+; CHECK-LABEL: intrinsic_vse1_v_nxv16i1:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a1, a1, e8,m2,ta,mu
+; CHECK-NEXT:    vse1.v v0, (a0)
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  call void @llvm.riscv.vse1.nxv16i1(<vscale x 16 x i1> %0, <vscale x 16 x i1>* %1, i64 %2)
+  ret void
+}
+
+declare void @llvm.riscv.vse1.nxv32i1(<vscale x 32 x i1>, <vscale x 32 x i1>*, i64);
+
+define void @intrinsic_vse1_v_nxv32i1(<vscale x 32 x i1> %0, <vscale x 32 x i1>* %1, i64 %2) nounwind {
+; CHECK-LABEL: intrinsic_vse1_v_nxv32i1:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a1, a1, e8,m4,ta,mu
+; CHECK-NEXT:    vse1.v v0, (a0)
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  call void @llvm.riscv.vse1.nxv32i1(<vscale x 32 x i1> %0, <vscale x 32 x i1>* %1, i64 %2)
+  ret void
+}
+
+declare void @llvm.riscv.vse1.nxv64i1(<vscale x 64 x i1>, <vscale x 64 x i1>*, i64);
+
+define void @intrinsic_vse1_v_nxv64i1(<vscale x 64 x i1> %0, <vscale x 64 x i1>* %1, i64 %2) nounwind {
+; CHECK-LABEL: intrinsic_vse1_v_nxv64i1:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a1, a1, e8,m8,ta,mu
+; CHECK-NEXT:    vse1.v v0, (a0)
+; CHECK-NEXT:    jalr zero, 0(ra)
+entry:
+  call void @llvm.riscv.vse1.nxv64i1(<vscale x 64 x i1> %0, <vscale x 64 x i1>* %1, i64 %2)
+  ret void
+}
diff --git a/llvm/test/CodeGen/RISCV/vfrece7-rv32.ll b/llvm/test/CodeGen/RISCV/vfrece7-rv32.ll
deleted file mode 100644
index 7a810f10d47d..000000000000
--- a/llvm/test/CodeGen/RISCV/vfrece7-rv32.ll
+++ /dev/null
@@ -1,602 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+experimental-v,+d,+experimental-zfh -verify-machineinstrs \
-; RUN:   --riscv-no-aliases < %s | FileCheck %s
-declare <vscale x 1 x half> @llvm.riscv.vfrece7.nxv1f16(
-  <vscale x 1 x half>,
-  i32);
-
-define <vscale x 1 x half> @intrinsic_vfrece7_v_nxv1f16_nxv1f16(<vscale x 1 x half> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vfrece7_v_nxv1f16_nxv1f16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, a0, e16,mf4,ta,mu
-; CHECK-NEXT:    vfrece7.v v8, v8
-; CHECK-NEXT:    jalr zero, 0(ra)
-entry:
-  %a = call <vscale x 1 x half> @llvm.riscv.vfrece7.nxv1f16(
-    <vscale x 1 x half> %0,
-    i32 %1)
-
-  ret <vscale x 1 x half> %a
-}
-
-declare <vscale x 1 x half> @llvm.riscv.vfrece7.mask.nxv1f16(
-  <vscale x 1 x half>,
-  <vscale x 1 x half>,
-  <vscale x 1 x i1>,
-  i32);
-
-define <vscale x 1 x half> @intrinsic_vfrece7_mask_v_nxv1f16_nxv1f16(<vscale x 1 x i1> %0, <vscale x 1 x half> %1, <vscale x 1 x half> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vfrece7_mask_v_nxv1f16_nxv1f16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, a0, e16,mf4,tu,mu
-; CHECK-NEXT:    vfrece7.v v8, v9, v0.t
-; CHECK-NEXT:    jalr zero, 0(ra)
-entry:
-  %a = call <vscale x 1 x half> @llvm.riscv.vfrece7.mask.nxv1f16(
-    <vscale x 1 x half> %1,
-    <vscale x 1 x half> %2,
-    <vscale x 1 x i1> %0,
-    i32 %3)
-
-  ret <vscale x 1 x half> %a
-}
-
-declare <vscale x 2 x half> @llvm.riscv.vfrece7.nxv2f16(
-  <vscale x 2 x half>,
-  i32);
-
-define <vscale x 2 x half> @intrinsic_vfrece7_v_nxv2f16_nxv2f16(<vscale x 2 x half> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vfrece7_v_nxv2f16_nxv2f16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, a0, e16,mf2,ta,mu
-; CHECK-NEXT:    vfrece7.v v8, v8
-; CHECK-NEXT:    jalr zero, 0(ra)
-entry:
-  %a = call <vscale x 2 x half> @llvm.riscv.vfrece7.nxv2f16(
-    <vscale x 2 x half> %0,
-    i32 %1)
-
-  ret <vscale x 2 x half> %a
-}
-
-declare <vscale x 2 x half> @llvm.riscv.vfrece7.mask.nxv2f16(
-  <vscale x 2 x half>,
-  <vscale x 2 x half>,
-  <vscale x 2 x i1>,
-  i32);
-
-define <vscale x 2 x half> @intrinsic_vfrece7_mask_v_nxv2f16_nxv2f16(<vscale x 2 x i1> %0, <vscale x 2 x half> %1, <vscale x 2 x half> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vfrece7_mask_v_nxv2f16_nxv2f16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, a0, e16,mf2,tu,mu
-; CHECK-NEXT:    vfrece7.v v8, v9, v0.t
-; CHECK-NEXT:    jalr zero, 0(ra)
-entry:
-  %a = call <vscale x 2 x half> @llvm.riscv.vfrece7.mask.nxv2f16(
-    <vscale x 2 x half> %1,
-    <vscale x 2 x half> %2,
-    <vscale x 2 x i1> %0,
-    i32 %3)
-
-  ret <vscale x 2 x half> %a
-}
-
-declare <vscale x 4 x half> @llvm.riscv.vfrece7.nxv4f16(
-  <vscale x 4 x half>,
-  i32);
-
-define <vscale x 4 x half> @intrinsic_vfrece7_v_nxv4f16_nxv4f16(<vscale x 4 x half> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vfrece7_v_nxv4f16_nxv4f16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, a0, e16,m1,ta,mu
-; CHECK-NEXT:    vfrece7.v v8, v8
-; CHECK-NEXT:    jalr zero, 0(ra)
-entry:
-  %a = call <vscale x 4 x half> @llvm.riscv.vfrece7.nxv4f16(
-    <vscale x 4 x half> %0,
-    i32 %1)
-
-  ret <vscale x 4 x half> %a
-}
-
-declare <vscale x 4 x half> @llvm.riscv.vfrece7.mask.nxv4f16(
-  <vscale x 4 x half>,
-  <vscale x 4 x half>,
-  <vscale x 4 x i1>,
-  i32);
-
-define <vscale x 4 x half> @intrinsic_vfrece7_mask_v_nxv4f16_nxv4f16(<vscale x 4 x i1> %0, <vscale x 4 x half> %1, <vscale x 4 x half> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vfrece7_mask_v_nxv4f16_nxv4f16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, a0, e16,m1,tu,mu
-; CHECK-NEXT:    vfrece7.v v8, v9, v0.t
-; CHECK-NEXT:    jalr zero, 0(ra)
-entry:
-  %a = call <vscale x 4 x half> @llvm.riscv.vfrece7.mask.nxv4f16(
-    <vscale x 4 x half> %1,
-    <vscale x 4 x half> %2,
-    <vscale x 4 x i1> %0,
-    i32 %3)
-
-  ret <vscale x 4 x half> %a
-}
-
-declare <vscale x 8 x half> @llvm.riscv.vfrece7.nxv8f16(
-  <vscale x 8 x half>,
-  i32);
-
-define <vscale x 8 x half> @intrinsic_vfrece7_v_nxv8f16_nxv8f16(<vscale x 8 x half> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vfrece7_v_nxv8f16_nxv8f16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, a0, e16,m2,ta,mu
-; CHECK-NEXT:    vfrece7.v v8, v8
-; CHECK-NEXT:    jalr zero, 0(ra)
-entry:
-  %a = call <vscale x 8 x half> @llvm.riscv.vfrece7.nxv8f16(
-    <vscale x 8 x half> %0,
-    i32 %1)
-
-  ret <vscale x 8 x half> %a
-}
-
-declare <vscale x 8 x half> @llvm.riscv.vfrece7.mask.nxv8f16(
-  <vscale x 8 x half>,
-  <vscale x 8 x half>,
-  <vscale x 8 x i1>,
-  i32);
-
-define <vscale x 8 x half> @intrinsic_vfrece7_mask_v_nxv8f16_nxv8f16(<vscale x 8 x i1> %0, <vscale x 8 x half> %1, <vscale x 8 x half> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vfrece7_mask_v_nxv8f16_nxv8f16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, a0, e16,m2,tu,mu
-; CHECK-NEXT:    vfrece7.v v8, v10, v0.t
-; CHECK-NEXT:    jalr zero, 0(ra)
-entry:
-  %a = call <vscale x 8 x half> @llvm.riscv.vfrece7.mask.nxv8f16(
-    <vscale x 8 x half> %1,
-    <vscale x 8 x half> %2,
-    <vscale x 8 x i1> %0,
-    i32 %3)
-
-  ret <vscale x 8 x half> %a
-}
-
-declare <vscale x 16 x half> @llvm.riscv.vfrece7.nxv16f16(
-  <vscale x 16 x half>,
-  i32);
-
-define <vscale x 16 x half> @intrinsic_vfrece7_v_nxv16f16_nxv16f16(<vscale x 16 x half> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vfrece7_v_nxv16f16_nxv16f16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, a0, e16,m4,ta,mu
-; CHECK-NEXT:    vfrece7.v v8, v8
-; CHECK-NEXT:    jalr zero, 0(ra)
-entry:
-  %a = call <vscale x 16 x half> @llvm.riscv.vfrece7.nxv16f16(
-    <vscale x 16 x half> %0,
-    i32 %1)
-
-  ret <vscale x 16 x half> %a
-}
-
-declare <vscale x 16 x half> @llvm.riscv.vfrece7.mask.nxv16f16(
-  <vscale x 16 x half>,
-  <vscale x 16 x half>,
-  <vscale x 16 x i1>,
-  i32);
-
-define <vscale x 16 x half> @intrinsic_vfrece7_mask_v_nxv16f16_nxv16f16(<vscale x 16 x i1> %0, <vscale x 16 x half> %1, <vscale x 16 x half> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vfrece7_mask_v_nxv16f16_nxv16f16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, a0, e16,m4,tu,mu
-; CHECK-NEXT:    vfrece7.v v8, v12, v0.t
-; CHECK-NEXT:    jalr zero, 0(ra)
-entry:
-  %a = call <vscale x 16 x half> @llvm.riscv.vfrece7.mask.nxv16f16(
-    <vscale x 16 x half> %1,
-    <vscale x 16 x half> %2,
-    <vscale x 16 x i1> %0,
-    i32 %3)
-
-  ret <vscale x 16 x half> %a
-}
-
-declare <vscale x 32 x half> @llvm.riscv.vfrece7.nxv32f16(
-  <vscale x 32 x half>,
-  i32);
-
-define <vscale x 32 x half> @intrinsic_vfrece7_v_nxv32f16_nxv32f16(<vscale x 32 x half> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vfrece7_v_nxv32f16_nxv32f16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, a0, e16,m8,ta,mu
-; CHECK-NEXT:    vfrece7.v v8, v8
-; CHECK-NEXT:    jalr zero, 0(ra)
-entry:
-  %a = call <vscale x 32 x half> @llvm.riscv.vfrece7.nxv32f16(
-    <vscale x 32 x half> %0,
-    i32 %1)
-
-  ret <vscale x 32 x half> %a
-}
-
-declare <vscale x 32 x half> @llvm.riscv.vfrece7.mask.nxv32f16(
-  <vscale x 32 x half>,
-  <vscale x 32 x half>,
-  <vscale x 32 x i1>,
-  i32);
-
-define <vscale x 32 x half> @intrinsic_vfrece7_mask_v_nxv32f16_nxv32f16(<vscale x 32 x i1> %0, <vscale x 32 x half> %1, <vscale x 32 x half> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vfrece7_mask_v_nxv32f16_nxv32f16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, a0, e16,m8,tu,mu
-; CHECK-NEXT:    vfrece7.v v8, v16, v0.t
-; CHECK-NEXT:    jalr zero, 0(ra)
-entry:
-  %a = call <vscale x 32 x half> @llvm.riscv.vfrece7.mask.nxv32f16(
-    <vscale x 32 x half> %1,
-    <vscale x 32 x half> %2,
-    <vscale x 32 x i1> %0,
-    i32 %3)
-
-  ret <vscale x 32 x half> %a
-}
-
-declare <vscale x 1 x float> @llvm.riscv.vfrece7.nxv1f32(
-  <vscale x 1 x float>,
-  i32);
-
-define <vscale x 1 x float> @intrinsic_vfrece7_v_nxv1f32_nxv1f32(<vscale x 1 x float> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vfrece7_v_nxv1f32_nxv1f32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, a0, e32,mf2,ta,mu
-; CHECK-NEXT:    vfrece7.v v8, v8
-; CHECK-NEXT:    jalr zero, 0(ra)
-entry:
-  %a = call <vscale x 1 x float> @llvm.riscv.vfrece7.nxv1f32(
-    <vscale x 1 x float> %0,
-    i32 %1)
-
-  ret <vscale x 1 x float> %a
-}
-
-declare <vscale x 1 x float> @llvm.riscv.vfrece7.mask.nxv1f32(
-  <vscale x 1 x float>,
-  <vscale x 1 x float>,
-  <vscale x 1 x i1>,
-  i32);
-
-define <vscale x 1 x float> @intrinsic_vfrece7_mask_v_nxv1f32_nxv1f32(<vscale x 1 x i1> %0, <vscale x 1 x float> %1, <vscale x 1 x float> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vfrece7_mask_v_nxv1f32_nxv1f32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, a0, e32,mf2,tu,mu
-; CHECK-NEXT:    vfrece7.v v8, v9, v0.t
-; CHECK-NEXT:    jalr zero, 0(ra)
-entry:
-  %a = call <vscale x 1 x float> @llvm.riscv.vfrece7.mask.nxv1f32(
-    <vscale x 1 x float> %1,
-    <vscale x 1 x float> %2,
-    <vscale x 1 x i1> %0,
-    i32 %3)
-
-  ret <vscale x 1 x float> %a
-}
-
-declare <vscale x 2 x float> @llvm.riscv.vfrece7.nxv2f32(
-  <vscale x 2 x float>,
-  i32);
-
-define <vscale x 2 x float> @intrinsic_vfrece7_v_nxv2f32_nxv2f32(<vscale x 2 x float> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vfrece7_v_nxv2f32_nxv2f32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, a0, e32,m1,ta,mu
-; CHECK-NEXT:    vfrece7.v v8, v8
-; CHECK-NEXT:    jalr zero, 0(ra)
-entry:
-  %a = call <vscale x 2 x float> @llvm.riscv.vfrece7.nxv2f32(
-    <vscale x 2 x float> %0,
-    i32 %1)
-
-  ret <vscale x 2 x float> %a
-}
-
-declare <vscale x 2 x float> @llvm.riscv.vfrece7.mask.nxv2f32(
-  <vscale x 2 x float>,
-  <vscale x 2 x float>,
-  <vscale x 2 x i1>,
-  i32);
-
-define <vscale x 2 x float> @intrinsic_vfrece7_mask_v_nxv2f32_nxv2f32(<vscale x 2 x i1> %0, <vscale x 2 x float> %1, <vscale x 2 x float> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vfrece7_mask_v_nxv2f32_nxv2f32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, a0, e32,m1,tu,mu
-; CHECK-NEXT:    vfrece7.v v8, v9, v0.t
-; CHECK-NEXT:    jalr zero, 0(ra)
-entry:
-  %a = call <vscale x 2 x float> @llvm.riscv.vfrece7.mask.nxv2f32(
-    <vscale x 2 x float> %1,
-    <vscale x 2 x float> %2,
-    <vscale x 2 x i1> %0,
-    i32 %3)
-
-  ret <vscale x 2 x float> %a
-}
-
-declare <vscale x 4 x float> @llvm.riscv.vfrece7.nxv4f32(
-  <vscale x 4 x float>,
-  i32);
-
-define <vscale x 4 x float> @intrinsic_vfrece7_v_nxv4f32_nxv4f32(<vscale x 4 x float> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vfrece7_v_nxv4f32_nxv4f32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, a0, e32,m2,ta,mu
-; CHECK-NEXT:    vfrece7.v v8, v8
-; CHECK-NEXT:    jalr zero, 0(ra)
-entry:
-  %a = call <vscale x 4 x float> @llvm.riscv.vfrece7.nxv4f32(
-    <vscale x 4 x float> %0,
-    i32 %1)
-
-  ret <vscale x 4 x float> %a
-}
-
-declare <vscale x 4 x float> @llvm.riscv.vfrece7.mask.nxv4f32(
-  <vscale x 4 x float>,
-  <vscale x 4 x float>,
-  <vscale x 4 x i1>,
-  i32);
-
-define <vscale x 4 x float> @intrinsic_vfrece7_mask_v_nxv4f32_nxv4f32(<vscale x 4 x i1> %0, <vscale x 4 x float> %1, <vscale x 4 x float> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vfrece7_mask_v_nxv4f32_nxv4f32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, a0, e32,m2,tu,mu
-; CHECK-NEXT:    vfrece7.v v8, v10, v0.t
-; CHECK-NEXT:    jalr zero, 0(ra)
-entry:
-  %a = call <vscale x 4 x float> @llvm.riscv.vfrece7.mask.nxv4f32(
-    <vscale x 4 x float> %1,
-    <vscale x 4 x float> %2,
-    <vscale x 4 x i1> %0,
-    i32 %3)
-
-  ret <vscale x 4 x float> %a
-}
-
-declare <vscale x 8 x float> @llvm.riscv.vfrece7.nxv8f32(
-  <vscale x 8 x float>,
-  i32);
-
-define <vscale x 8 x float> @intrinsic_vfrece7_v_nxv8f32_nxv8f32(<vscale x 8 x float> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vfrece7_v_nxv8f32_nxv8f32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, a0, e32,m4,ta,mu
-; CHECK-NEXT:    vfrece7.v v8, v8
-; CHECK-NEXT:    jalr zero, 0(ra)
-entry:
-  %a = call <vscale x 8 x float> @llvm.riscv.vfrece7.nxv8f32(
-    <vscale x 8 x float> %0,
-    i32 %1)
-
-  ret <vscale x 8 x float> %a
-}
-
-declare <vscale x 8 x float> @llvm.riscv.vfrece7.mask.nxv8f32(
-  <vscale x 8 x float>,
-  <vscale x 8 x float>,
-  <vscale x 8 x i1>,
-  i32);
-
-define <vscale x 8 x float> @intrinsic_vfrece7_mask_v_nxv8f32_nxv8f32(<vscale x 8 x i1> %0, <vscale x 8 x float> %1, <vscale x 8 x float> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vfrece7_mask_v_nxv8f32_nxv8f32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, a0, e32,m4,tu,mu
-; CHECK-NEXT:    vfrece7.v v8, v12, v0.t
-; CHECK-NEXT:    jalr zero, 0(ra)
-entry:
-  %a = call <vscale x 8 x float> @llvm.riscv.vfrece7.mask.nxv8f32(
-    <vscale x 8 x float> %1,
-    <vscale x 8 x float> %2,
-    <vscale x 8 x i1> %0,
-    i32 %3)
-
-  ret <vscale x 8 x float> %a
-}
-
-declare <vscale x 16 x float> @llvm.riscv.vfrece7.nxv16f32(
-  <vscale x 16 x float>,
-  i32);
-
-define <vscale x 16 x float> @intrinsic_vfrece7_v_nxv16f32_nxv16f32(<vscale x 16 x float> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vfrece7_v_nxv16f32_nxv16f32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, a0, e32,m8,ta,mu
-; CHECK-NEXT:    vfrece7.v v8, v8
-; CHECK-NEXT:    jalr zero, 0(ra)
-entry:
-  %a = call <vscale x 16 x float> @llvm.riscv.vfrece7.nxv16f32(
-    <vscale x 16 x float> %0,
-    i32 %1)
-
-  ret <vscale x 16 x float> %a
-}
-
-declare <vscale x 16 x float> @llvm.riscv.vfrece7.mask.nxv16f32(
-  <vscale x 16 x float>,
-  <vscale x 16 x float>,
-  <vscale x 16 x i1>,
-  i32);
-
-define <vscale x 16 x float> @intrinsic_vfrece7_mask_v_nxv16f32_nxv16f32(<vscale x 16 x i1> %0, <vscale x 16 x float> %1, <vscale x 16 x float> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vfrece7_mask_v_nxv16f32_nxv16f32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, a0, e32,m8,tu,mu
-; CHECK-NEXT:    vfrece7.v v8, v16, v0.t
-; CHECK-NEXT:    jalr zero, 0(ra)
-entry:
-  %a = call <vscale x 16 x float> @llvm.riscv.vfrece7.mask.nxv16f32(
-    <vscale x 16 x float> %1,
-    <vscale x 16 x float> %2,
-    <vscale x 16 x i1> %0,
-    i32 %3)
-
-  ret <vscale x 16 x float> %a
-}
-
-declare <vscale x 1 x double> @llvm.riscv.vfrece7.nxv1f64(
-  <vscale x 1 x double>,
-  i32);
-
-define <vscale x 1 x double> @intrinsic_vfrece7_v_nxv1f64_nxv1f64(<vscale x 1 x double> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vfrece7_v_nxv1f64_nxv1f64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, a0, e64,m1,ta,mu
-; CHECK-NEXT:    vfrece7.v v8, v8
-; CHECK-NEXT:    jalr zero, 0(ra)
-entry:
-  %a = call <vscale x 1 x double> @llvm.riscv.vfrece7.nxv1f64(
-    <vscale x 1 x double> %0,
-    i32 %1)
-
-  ret <vscale x 1 x double> %a
-}
-
-declare <vscale x 1 x double> @llvm.riscv.vfrece7.mask.nxv1f64(
-  <vscale x 1 x double>,
-  <vscale x 1 x double>,
-  <vscale x 1 x i1>,
-  i32);
-
-define <vscale x 1 x double> @intrinsic_vfrece7_mask_v_nxv1f64_nxv1f64(<vscale x 1 x i1> %0, <vscale x 1 x double> %1, <vscale x 1 x double> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vfrece7_mask_v_nxv1f64_nxv1f64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, a0, e64,m1,tu,mu
-; CHECK-NEXT:    vfrece7.v v8, v9, v0.t
-; CHECK-NEXT:    jalr zero, 0(ra)
-entry:
-  %a = call <vscale x 1 x double> @llvm.riscv.vfrece7.mask.nxv1f64(
-    <vscale x 1 x double> %1,
-    <vscale x 1 x double> %2,
-    <vscale x 1 x i1> %0,
-    i32 %3)
-
-  ret <vscale x 1 x double> %a
-}
-
-declare <vscale x 2 x double> @llvm.riscv.vfrece7.nxv2f64(
-  <vscale x 2 x double>,
-  i32);
-
-define <vscale x 2 x double> @intrinsic_vfrece7_v_nxv2f64_nxv2f64(<vscale x 2 x double> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vfrece7_v_nxv2f64_nxv2f64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, a0, e64,m2,ta,mu
-; CHECK-NEXT:    vfrece7.v v8, v8
-; CHECK-NEXT:    jalr zero, 0(ra)
-entry:
-  %a = call <vscale x 2 x double> @llvm.riscv.vfrece7.nxv2f64(
-    <vscale x 2 x double> %0,
-    i32 %1)
-
-  ret <vscale x 2 x double> %a
-}
-
-declare <vscale x 2 x double> @llvm.riscv.vfrece7.mask.nxv2f64(
-  <vscale x 2 x double>,
-  <vscale x 2 x double>,
-  <vscale x 2 x i1>,
-  i32);
-
-define <vscale x 2 x double> @intrinsic_vfrece7_mask_v_nxv2f64_nxv2f64(<vscale x 2 x i1> %0, <vscale x 2 x double> %1, <vscale x 2 x double> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vfrece7_mask_v_nxv2f64_nxv2f64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, a0, e64,m2,tu,mu
-; CHECK-NEXT:    vfrece7.v v8, v10, v0.t
-; CHECK-NEXT:    jalr zero, 0(ra)
-entry:
-  %a = call <vscale x 2 x double> @llvm.riscv.vfrece7.mask.nxv2f64(
-    <vscale x 2 x double> %1,
-    <vscale x 2 x double> %2,
-    <vscale x 2 x i1> %0,
-    i32 %3)
-
-  ret <vscale x 2 x double> %a
-}
-
-declare <vscale x 4 x double> @llvm.riscv.vfrece7.nxv4f64(
-  <vscale x 4 x double>,
-  i32);
-
-define <vscale x 4 x double> @intrinsic_vfrece7_v_nxv4f64_nxv4f64(<vscale x 4 x double> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vfrece7_v_nxv4f64_nxv4f64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, a0, e64,m4,ta,mu
-; CHECK-NEXT:    vfrece7.v v8, v8
-; CHECK-NEXT:    jalr zero, 0(ra)
-entry:
-  %a = call <vscale x 4 x double> @llvm.riscv.vfrece7.nxv4f64(
-    <vscale x 4 x double> %0,
-    i32 %1)
-
-  ret <vscale x 4 x double> %a
-}
-
-declare <vscale x 4 x double> @llvm.riscv.vfrece7.mask.nxv4f64(
-  <vscale x 4 x double>,
-  <vscale x 4 x double>,
-  <vscale x 4 x i1>,
-  i32);
-
-define <vscale x 4 x double> @intrinsic_vfrece7_mask_v_nxv4f64_nxv4f64(<vscale x 4 x i1> %0, <vscale x 4 x double> %1, <vscale x 4 x double> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vfrece7_mask_v_nxv4f64_nxv4f64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, a0, e64,m4,tu,mu
-; CHECK-NEXT:    vfrece7.v v8, v12, v0.t
-; CHECK-NEXT:    jalr zero, 0(ra)
-entry:
-  %a = call <vscale x 4 x double> @llvm.riscv.vfrece7.mask.nxv4f64(
-    <vscale x 4 x double> %1,
-    <vscale x 4 x double> %2,
-    <vscale x 4 x i1> %0,
-    i32 %3)
-
-  ret <vscale x 4 x double> %a
-}
-
-declare <vscale x 8 x double> @llvm.riscv.vfrece7.nxv8f64(
-  <vscale x 8 x double>,
-  i32);
-
-define <vscale x 8 x double> @intrinsic_vfrece7_v_nxv8f64_nxv8f64(<vscale x 8 x double> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vfrece7_v_nxv8f64_nxv8f64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, a0, e64,m8,ta,mu
-; CHECK-NEXT:    vfrece7.v v8, v8
-; CHECK-NEXT:    jalr zero, 0(ra)
-entry:
-  %a = call <vscale x 8 x double> @llvm.riscv.vfrece7.nxv8f64(
-    <vscale x 8 x double> %0,
-    i32 %1)
-
-  ret <vscale x 8 x double> %a
-}
-
-declare <vscale x 8 x double> @llvm.riscv.vfrece7.mask.nxv8f64(
-  <vscale x 8 x double>,
-  <vscale x 8 x double>,
-  <vscale x 8 x i1>,
-  i32);
-
-define <vscale x 8 x double> @intrinsic_vfrece7_mask_v_nxv8f64_nxv8f64(<vscale x 8 x i1> %0, <vscale x 8 x double> %1, <vscale x 8 x double> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vfrece7_mask_v_nxv8f64_nxv8f64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, a0, e64,m8,tu,mu
-; CHECK-NEXT:    vfrece7.v v8, v16, v0.t
-; CHECK-NEXT:    jalr zero, 0(ra)
-entry:
-  %a = call <vscale x 8 x double> @llvm.riscv.vfrece7.mask.nxv8f64(
-    <vscale x 8 x double> %1,
-    <vscale x 8 x double> %2,
-    <vscale x 8 x i1> %0,
-    i32 %3)
-
-  ret <vscale x 8 x double> %a
-}
diff --git a/llvm/test/CodeGen/RISCV/vfrece7-rv64.ll b/llvm/test/CodeGen/RISCV/vfrece7-rv64.ll
deleted file mode 100644
index 3af3fe4078c5..000000000000
--- a/llvm/test/CodeGen/RISCV/vfrece7-rv64.ll
+++ /dev/null
@@ -1,602 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv64 -mattr=+experimental-v,+d,+experimental-zfh -verify-machineinstrs \
-; RUN:   --riscv-no-aliases < %s | FileCheck %s
-declare <vscale x 1 x half> @llvm.riscv.vfrece7.nxv1f16(
-  <vscale x 1 x half>,
-  i64);
-
-define <vscale x 1 x half> @intrinsic_vfrece7_v_nxv1f16_nxv1f16(<vscale x 1 x half> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vfrece7_v_nxv1f16_nxv1f16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, a0, e16,mf4,ta,mu
-; CHECK-NEXT:    vfrece7.v v8, v8
-; CHECK-NEXT:    jalr zero, 0(ra)
-entry:
-  %a = call <vscale x 1 x half> @llvm.riscv.vfrece7.nxv1f16(
-    <vscale x 1 x half> %0,
-    i64 %1)
-
-  ret <vscale x 1 x half> %a
-}
-
-declare <vscale x 1 x half> @llvm.riscv.vfrece7.mask.nxv1f16(
-  <vscale x 1 x half>,
-  <vscale x 1 x half>,
-  <vscale x 1 x i1>,
-  i64);
-
-define <vscale x 1 x half> @intrinsic_vfrece7_mask_v_nxv1f16_nxv1f16(<vscale x 1 x i1> %0, <vscale x 1 x half> %1, <vscale x 1 x half> %2, i64 %3) nounwind {
-; CHECK-LABEL: intrinsic_vfrece7_mask_v_nxv1f16_nxv1f16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, a0, e16,mf4,tu,mu
-; CHECK-NEXT:    vfrece7.v v8, v9, v0.t
-; CHECK-NEXT:    jalr zero, 0(ra)
-entry:
-  %a = call <vscale x 1 x half> @llvm.riscv.vfrece7.mask.nxv1f16(
-    <vscale x 1 x half> %1,
-    <vscale x 1 x half> %2,
-    <vscale x 1 x i1> %0,
-    i64 %3)
-
-  ret <vscale x 1 x half> %a
-}
-
-declare <vscale x 2 x half> @llvm.riscv.vfrece7.nxv2f16(
-  <vscale x 2 x half>,
-  i64);
-
-define <vscale x 2 x half> @intrinsic_vfrece7_v_nxv2f16_nxv2f16(<vscale x 2 x half> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vfrece7_v_nxv2f16_nxv2f16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, a0, e16,mf2,ta,mu
-; CHECK-NEXT:    vfrece7.v v8, v8
-; CHECK-NEXT:    jalr zero, 0(ra)
-entry:
-  %a = call <vscale x 2 x half> @llvm.riscv.vfrece7.nxv2f16(
-    <vscale x 2 x half> %0,
-    i64 %1)
-
-  ret <vscale x 2 x half> %a
-}
-
-declare <vscale x 2 x half> @llvm.riscv.vfrece7.mask.nxv2f16(
-  <vscale x 2 x half>,
-  <vscale x 2 x half>,
-  <vscale x 2 x i1>,
-  i64);
-
-define <vscale x 2 x half> @intrinsic_vfrece7_mask_v_nxv2f16_nxv2f16(<vscale x 2 x i1> %0, <vscale x 2 x half> %1, <vscale x 2 x half> %2, i64 %3) nounwind {
-; CHECK-LABEL: intrinsic_vfrece7_mask_v_nxv2f16_nxv2f16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, a0, e16,mf2,tu,mu
-; CHECK-NEXT:    vfrece7.v v8, v9, v0.t
-; CHECK-NEXT:    jalr zero, 0(ra)
-entry:
-  %a = call <vscale x 2 x half> @llvm.riscv.vfrece7.mask.nxv2f16(
-    <vscale x 2 x half> %1,
-    <vscale x 2 x half> %2,
-    <vscale x 2 x i1> %0,
-    i64 %3)
-
-  ret <vscale x 2 x half> %a
-}
-
-declare <vscale x 4 x half> @llvm.riscv.vfrece7.nxv4f16(
-  <vscale x 4 x half>,
-  i64);
-
-define <vscale x 4 x half> @intrinsic_vfrece7_v_nxv4f16_nxv4f16(<vscale x 4 x half> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vfrece7_v_nxv4f16_nxv4f16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, a0, e16,m1,ta,mu
-; CHECK-NEXT:    vfrece7.v v8, v8
-; CHECK-NEXT:    jalr zero, 0(ra)
-entry:
-  %a = call <vscale x 4 x half> @llvm.riscv.vfrece7.nxv4f16(
-    <vscale x 4 x half> %0,
-    i64 %1)
-
-  ret <vscale x 4 x half> %a
-}
-
-declare <vscale x 4 x half> @llvm.riscv.vfrece7.mask.nxv4f16(
-  <vscale x 4 x half>,
-  <vscale x 4 x half>,
-  <vscale x 4 x i1>,
-  i64);
-
-define <vscale x 4 x half> @intrinsic_vfrece7_mask_v_nxv4f16_nxv4f16(<vscale x 4 x i1> %0, <vscale x 4 x half> %1, <vscale x 4 x half> %2, i64 %3) nounwind {
-; CHECK-LABEL: intrinsic_vfrece7_mask_v_nxv4f16_nxv4f16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, a0, e16,m1,tu,mu
-; CHECK-NEXT:    vfrece7.v v8, v9, v0.t
-; CHECK-NEXT:    jalr zero, 0(ra)
-entry:
-  %a = call <vscale x 4 x half> @llvm.riscv.vfrece7.mask.nxv4f16(
-    <vscale x 4 x half> %1,
-    <vscale x 4 x half> %2,
-    <vscale x 4 x i1> %0,
-    i64 %3)
-
-  ret <vscale x 4 x half> %a
-}
-
-declare <vscale x 8 x half> @llvm.riscv.vfrece7.nxv8f16(
-  <vscale x 8 x half>,
-  i64);
-
-define <vscale x 8 x half> @intrinsic_vfrece7_v_nxv8f16_nxv8f16(<vscale x 8 x half> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vfrece7_v_nxv8f16_nxv8f16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, a0, e16,m2,ta,mu
-; CHECK-NEXT:    vfrece7.v v8, v8
-; CHECK-NEXT:    jalr zero, 0(ra)
-entry:
-  %a = call <vscale x 8 x half> @llvm.riscv.vfrece7.nxv8f16(
-    <vscale x 8 x half> %0,
-    i64 %1)
-
-  ret <vscale x 8 x half> %a
-}
-
-declare <vscale x 8 x half> @llvm.riscv.vfrece7.mask.nxv8f16(
-  <vscale x 8 x half>,
-  <vscale x 8 x half>,
-  <vscale x 8 x i1>,
-  i64);
-
-define <vscale x 8 x half> @intrinsic_vfrece7_mask_v_nxv8f16_nxv8f16(<vscale x 8 x i1> %0, <vscale x 8 x half> %1, <vscale x 8 x half> %2, i64 %3) nounwind {
-; CHECK-LABEL: intrinsic_vfrece7_mask_v_nxv8f16_nxv8f16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, a0, e16,m2,tu,mu
-; CHECK-NEXT:    vfrece7.v v8, v10, v0.t
-; CHECK-NEXT:    jalr zero, 0(ra)
-entry:
-  %a = call <vscale x 8 x half> @llvm.riscv.vfrece7.mask.nxv8f16(
-    <vscale x 8 x half> %1,
-    <vscale x 8 x half> %2,
-    <vscale x 8 x i1> %0,
-    i64 %3)
-
-  ret <vscale x 8 x half> %a
-}
-
-declare <vscale x 16 x half> @llvm.riscv.vfrece7.nxv16f16(
-  <vscale x 16 x half>,
-  i64);
-
-define <vscale x 16 x half> @intrinsic_vfrece7_v_nxv16f16_nxv16f16(<vscale x 16 x half> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vfrece7_v_nxv16f16_nxv16f16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, a0, e16,m4,ta,mu
-; CHECK-NEXT:    vfrece7.v v8, v8
-; CHECK-NEXT:    jalr zero, 0(ra)
-entry:
-  %a = call <vscale x 16 x half> @llvm.riscv.vfrece7.nxv16f16(
-    <vscale x 16 x half> %0,
-    i64 %1)
-
-  ret <vscale x 16 x half> %a
-}
-
-declare <vscale x 16 x half> @llvm.riscv.vfrece7.mask.nxv16f16(
-  <vscale x 16 x half>,
-  <vscale x 16 x half>,
-  <vscale x 16 x i1>,
-  i64);
-
-define <vscale x 16 x half> @intrinsic_vfrece7_mask_v_nxv16f16_nxv16f16(<vscale x 16 x i1> %0, <vscale x 16 x half> %1, <vscale x 16 x half> %2, i64 %3) nounwind {
-; CHECK-LABEL: intrinsic_vfrece7_mask_v_nxv16f16_nxv16f16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, a0, e16,m4,tu,mu
-; CHECK-NEXT:    vfrece7.v v8, v12, v0.t
-; CHECK-NEXT:    jalr zero, 0(ra)
-entry:
-  %a = call <vscale x 16 x half> @llvm.riscv.vfrece7.mask.nxv16f16(
-    <vscale x 16 x half> %1,
-    <vscale x 16 x half> %2,
-    <vscale x 16 x i1> %0,
-    i64 %3)
-
-  ret <vscale x 16 x half> %a
-}
-
-declare <vscale x 32 x half> @llvm.riscv.vfrece7.nxv32f16(
-  <vscale x 32 x half>,
-  i64);
-
-define <vscale x 32 x half> @intrinsic_vfrece7_v_nxv32f16_nxv32f16(<vscale x 32 x half> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vfrece7_v_nxv32f16_nxv32f16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, a0, e16,m8,ta,mu
-; CHECK-NEXT:    vfrece7.v v8, v8
-; CHECK-NEXT:    jalr zero, 0(ra)
-entry:
-  %a = call <vscale x 32 x half> @llvm.riscv.vfrece7.nxv32f16(
-    <vscale x 32 x half> %0,
-    i64 %1)
-
-  ret <vscale x 32 x half> %a
-}
-
-declare <vscale x 32 x half> @llvm.riscv.vfrece7.mask.nxv32f16(
-  <vscale x 32 x half>,
-  <vscale x 32 x half>,
-  <vscale x 32 x i1>,
-  i64);
-
-define <vscale x 32 x half> @intrinsic_vfrece7_mask_v_nxv32f16_nxv32f16(<vscale x 32 x i1> %0, <vscale x 32 x half> %1, <vscale x 32 x half> %2, i64 %3) nounwind {
-; CHECK-LABEL: intrinsic_vfrece7_mask_v_nxv32f16_nxv32f16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, a0, e16,m8,tu,mu
-; CHECK-NEXT:    vfrece7.v v8, v16, v0.t
-; CHECK-NEXT:    jalr zero, 0(ra)
-entry:
-  %a = call <vscale x 32 x half> @llvm.riscv.vfrece7.mask.nxv32f16(
-    <vscale x 32 x half> %1,
-    <vscale x 32 x half> %2,
-    <vscale x 32 x i1> %0,
-    i64 %3)
-
-  ret <vscale x 32 x half> %a
-}
-
-declare <vscale x 1 x float> @llvm.riscv.vfrece7.nxv1f32(
-  <vscale x 1 x float>,
-  i64);
-
-define <vscale x 1 x float> @intrinsic_vfrece7_v_nxv1f32_nxv1f32(<vscale x 1 x float> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vfrece7_v_nxv1f32_nxv1f32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, a0, e32,mf2,ta,mu
-; CHECK-NEXT:    vfrece7.v v8, v8
-; CHECK-NEXT:    jalr zero, 0(ra)
-entry:
-  %a = call <vscale x 1 x float> @llvm.riscv.vfrece7.nxv1f32(
-    <vscale x 1 x float> %0,
-    i64 %1)
-
-  ret <vscale x 1 x float> %a
-}
-
-declare <vscale x 1 x float> @llvm.riscv.vfrece7.mask.nxv1f32(
-  <vscale x 1 x float>,
-  <vscale x 1 x float>,
-  <vscale x 1 x i1>,
-  i64);
-
-define <vscale x 1 x float> @intrinsic_vfrece7_mask_v_nxv1f32_nxv1f32(<vscale x 1 x i1> %0, <vscale x 1 x float> %1, <vscale x 1 x float> %2, i64 %3) nounwind {
-; CHECK-LABEL: intrinsic_vfrece7_mask_v_nxv1f32_nxv1f32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, a0, e32,mf2,tu,mu
-; CHECK-NEXT:    vfrece7.v v8, v9, v0.t
-; CHECK-NEXT:    jalr zero, 0(ra)
-entry:
-  %a = call <vscale x 1 x float> @llvm.riscv.vfrece7.mask.nxv1f32(
-    <vscale x 1 x float> %1,
-    <vscale x 1 x float> %2,
-    <vscale x 1 x i1> %0,
-    i64 %3)
-
-  ret <vscale x 1 x float> %a
-}
-
-declare <vscale x 2 x float> @llvm.riscv.vfrece7.nxv2f32(
-  <vscale x 2 x float>,
-  i64);
-
-define <vscale x 2 x float> @intrinsic_vfrece7_v_nxv2f32_nxv2f32(<vscale x 2 x float> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vfrece7_v_nxv2f32_nxv2f32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, a0, e32,m1,ta,mu
-; CHECK-NEXT:    vfrece7.v v8, v8
-; CHECK-NEXT:    jalr zero, 0(ra)
-entry:
-  %a = call <vscale x 2 x float> @llvm.riscv.vfrece7.nxv2f32(
-    <vscale x 2 x float> %0,
-    i64 %1)
-
-  ret <vscale x 2 x float> %a
-}
-
-declare <vscale x 2 x float> @llvm.riscv.vfrece7.mask.nxv2f32(
-  <vscale x 2 x float>,
-  <vscale x 2 x float>,
-  <vscale x 2 x i1>,
-  i64);
-
-define <vscale x 2 x float> @intrinsic_vfrece7_mask_v_nxv2f32_nxv2f32(<vscale x 2 x i1> %0, <vscale x 2 x float> %1, <vscale x 2 x float> %2, i64 %3) nounwind {
-; CHECK-LABEL: intrinsic_vfrece7_mask_v_nxv2f32_nxv2f32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, a0, e32,m1,tu,mu
-; CHECK-NEXT:    vfrece7.v v8, v9, v0.t
-; CHECK-NEXT:    jalr zero, 0(ra)
-entry:
-  %a = call <vscale x 2 x float> @llvm.riscv.vfrece7.mask.nxv2f32(
-    <vscale x 2 x float> %1,
-    <vscale x 2 x float> %2,
-    <vscale x 2 x i1> %0,
-    i64 %3)
-
-  ret <vscale x 2 x float> %a
-}
-
-declare <vscale x 4 x float> @llvm.riscv.vfrece7.nxv4f32(
-  <vscale x 4 x float>,
-  i64);
-
-define <vscale x 4 x float> @intrinsic_vfrece7_v_nxv4f32_nxv4f32(<vscale x 4 x float> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vfrece7_v_nxv4f32_nxv4f32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, a0, e32,m2,ta,mu
-; CHECK-NEXT:    vfrece7.v v8, v8
-; CHECK-NEXT:    jalr zero, 0(ra)
-entry:
-  %a = call <vscale x 4 x float> @llvm.riscv.vfrece7.nxv4f32(
-    <vscale x 4 x float> %0,
-    i64 %1)
-
-  ret <vscale x 4 x float> %a
-}
-
-declare <vscale x 4 x float> @llvm.riscv.vfrece7.mask.nxv4f32(
-  <vscale x 4 x float>,
-  <vscale x 4 x float>,
-  <vscale x 4 x i1>,
-  i64);
-
-define <vscale x 4 x float> @intrinsic_vfrece7_mask_v_nxv4f32_nxv4f32(<vscale x 4 x i1> %0, <vscale x 4 x float> %1, <vscale x 4 x float> %2, i64 %3) nounwind {
-; CHECK-LABEL: intrinsic_vfrece7_mask_v_nxv4f32_nxv4f32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, a0, e32,m2,tu,mu
-; CHECK-NEXT:    vfrece7.v v8, v10, v0.t
-; CHECK-NEXT:    jalr zero, 0(ra)
-entry:
-  %a = call <vscale x 4 x float> @llvm.riscv.vfrece7.mask.nxv4f32(
-    <vscale x 4 x float> %1,
-    <vscale x 4 x float> %2,
-    <vscale x 4 x i1> %0,
-    i64 %3)
-
-  ret <vscale x 4 x float> %a
-}
-
-declare <vscale x 8 x float> @llvm.riscv.vfrece7.nxv8f32(
-  <vscale x 8 x float>,
-  i64);
-
-define <vscale x 8 x float> @intrinsic_vfrece7_v_nxv8f32_nxv8f32(<vscale x 8 x float> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vfrece7_v_nxv8f32_nxv8f32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, a0, e32,m4,ta,mu
-; CHECK-NEXT:    vfrece7.v v8, v8
-; CHECK-NEXT:    jalr zero, 0(ra)
-entry:
-  %a = call <vscale x 8 x float> @llvm.riscv.vfrece7.nxv8f32(
-    <vscale x 8 x float> %0,
-    i64 %1)
-
-  ret <vscale x 8 x float> %a
-}
-
-declare <vscale x 8 x float> @llvm.riscv.vfrece7.mask.nxv8f32(
-  <vscale x 8 x float>,
-  <vscale x 8 x float>,
-  <vscale x 8 x i1>,
-  i64);
-
-define <vscale x 8 x float> @intrinsic_vfrece7_mask_v_nxv8f32_nxv8f32(<vscale x 8 x i1> %0, <vscale x 8 x float> %1, <vscale x 8 x float> %2, i64 %3) nounwind {
-; CHECK-LABEL: intrinsic_vfrece7_mask_v_nxv8f32_nxv8f32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, a0, e32,m4,tu,mu
-; CHECK-NEXT:    vfrece7.v v8, v12, v0.t
-; CHECK-NEXT:    jalr zero, 0(ra)
-entry:
-  %a = call <vscale x 8 x float> @llvm.riscv.vfrece7.mask.nxv8f32(
-    <vscale x 8 x float> %1,
-    <vscale x 8 x float> %2,
-    <vscale x 8 x i1> %0,
-    i64 %3)
-
-  ret <vscale x 8 x float> %a
-}
-
-declare <vscale x 16 x float> @llvm.riscv.vfrece7.nxv16f32(
-  <vscale x 16 x float>,
-  i64);
-
-define <vscale x 16 x float> @intrinsic_vfrece7_v_nxv16f32_nxv16f32(<vscale x 16 x float> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vfrece7_v_nxv16f32_nxv16f32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, a0, e32,m8,ta,mu
-; CHECK-NEXT:    vfrece7.v v8, v8
-; CHECK-NEXT:    jalr zero, 0(ra)
-entry:
-  %a = call <vscale x 16 x float> @llvm.riscv.vfrece7.nxv16f32(
-    <vscale x 16 x float> %0,
-    i64 %1)
-
-  ret <vscale x 16 x float> %a
-}
-
-declare <vscale x 16 x float> @llvm.riscv.vfrece7.mask.nxv16f32(
-  <vscale x 16 x float>,
-  <vscale x 16 x float>,
-  <vscale x 16 x i1>,
-  i64);
-
-define <vscale x 16 x float> @intrinsic_vfrece7_mask_v_nxv16f32_nxv16f32(<vscale x 16 x i1> %0, <vscale x 16 x float> %1, <vscale x 16 x float> %2, i64 %3) nounwind {
-; CHECK-LABEL: intrinsic_vfrece7_mask_v_nxv16f32_nxv16f32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, a0, e32,m8,tu,mu
-; CHECK-NEXT:    vfrece7.v v8, v16, v0.t
-; CHECK-NEXT:    jalr zero, 0(ra)
-entry:
-  %a = call <vscale x 16 x float> @llvm.riscv.vfrece7.mask.nxv16f32(
-    <vscale x 16 x float> %1,
-    <vscale x 16 x float> %2,
-    <vscale x 16 x i1> %0,
-    i64 %3)
-
-  ret <vscale x 16 x float> %a
-}
-
-declare <vscale x 1 x double> @llvm.riscv.vfrece7.nxv1f64(
-  <vscale x 1 x double>,
-  i64);
-
-define <vscale x 1 x double> @intrinsic_vfrece7_v_nxv1f64_nxv1f64(<vscale x 1 x double> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vfrece7_v_nxv1f64_nxv1f64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, a0, e64,m1,ta,mu
-; CHECK-NEXT:    vfrece7.v v8, v8
-; CHECK-NEXT:    jalr zero, 0(ra)
-entry:
-  %a = call <vscale x 1 x double> @llvm.riscv.vfrece7.nxv1f64(
-    <vscale x 1 x double> %0,
-    i64 %1)
-
-  ret <vscale x 1 x double> %a
-}
-
-declare <vscale x 1 x double> @llvm.riscv.vfrece7.mask.nxv1f64(
-  <vscale x 1 x double>,
-  <vscale x 1 x double>,
-  <vscale x 1 x i1>,
-  i64);
-
-define <vscale x 1 x double> @intrinsic_vfrece7_mask_v_nxv1f64_nxv1f64(<vscale x 1 x i1> %0, <vscale x 1 x double> %1, <vscale x 1 x double> %2, i64 %3) nounwind {
-; CHECK-LABEL: intrinsic_vfrece7_mask_v_nxv1f64_nxv1f64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, a0, e64,m1,tu,mu
-; CHECK-NEXT:    vfrece7.v v8, v9, v0.t
-; CHECK-NEXT:    jalr zero, 0(ra)
-entry:
-  %a = call <vscale x 1 x double> @llvm.riscv.vfrece7.mask.nxv1f64(
-    <vscale x 1 x double> %1,
-    <vscale x 1 x double> %2,
-    <vscale x 1 x i1> %0,
-    i64 %3)
-
-  ret <vscale x 1 x double> %a
-}
-
-declare <vscale x 2 x double> @llvm.riscv.vfrece7.nxv2f64(
-  <vscale x 2 x double>,
-  i64);
-
-define <vscale x 2 x double> @intrinsic_vfrece7_v_nxv2f64_nxv2f64(<vscale x 2 x double> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vfrece7_v_nxv2f64_nxv2f64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, a0, e64,m2,ta,mu
-; CHECK-NEXT:    vfrece7.v v8, v8
-; CHECK-NEXT:    jalr zero, 0(ra)
-entry:
-  %a = call <vscale x 2 x double> @llvm.riscv.vfrece7.nxv2f64(
-    <vscale x 2 x double> %0,
-    i64 %1)
-
-  ret <vscale x 2 x double> %a
-}
-
-declare <vscale x 2 x double> @llvm.riscv.vfrece7.mask.nxv2f64(
-  <vscale x 2 x double>,
-  <vscale x 2 x double>,
-  <vscale x 2 x i1>,
-  i64);
-
-define <vscale x 2 x double> @intrinsic_vfrece7_mask_v_nxv2f64_nxv2f64(<vscale x 2 x i1> %0, <vscale x 2 x double> %1, <vscale x 2 x double> %2, i64 %3) nounwind {
-; CHECK-LABEL: intrinsic_vfrece7_mask_v_nxv2f64_nxv2f64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, a0, e64,m2,tu,mu
-; CHECK-NEXT:    vfrece7.v v8, v10, v0.t
-; CHECK-NEXT:    jalr zero, 0(ra)
-entry:
-  %a = call <vscale x 2 x double> @llvm.riscv.vfrece7.mask.nxv2f64(
-    <vscale x 2 x double> %1,
-    <vscale x 2 x double> %2,
-    <vscale x 2 x i1> %0,
-    i64 %3)
-
-  ret <vscale x 2 x double> %a
-}
-
-declare <vscale x 4 x double> @llvm.riscv.vfrece7.nxv4f64(
-  <vscale x 4 x double>,
-  i64);
-
-define <vscale x 4 x double> @intrinsic_vfrece7_v_nxv4f64_nxv4f64(<vscale x 4 x double> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vfrece7_v_nxv4f64_nxv4f64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, a0, e64,m4,ta,mu
-; CHECK-NEXT:    vfrece7.v v8, v8
-; CHECK-NEXT:    jalr zero, 0(ra)
-entry:
-  %a = call <vscale x 4 x double> @llvm.riscv.vfrece7.nxv4f64(
-    <vscale x 4 x double> %0,
-    i64 %1)
-
-  ret <vscale x 4 x double> %a
-}
-
-declare <vscale x 4 x double> @llvm.riscv.vfrece7.mask.nxv4f64(
-  <vscale x 4 x double>,
-  <vscale x 4 x double>,
-  <vscale x 4 x i1>,
-  i64);
-
-define <vscale x 4 x double> @intrinsic_vfrece7_mask_v_nxv4f64_nxv4f64(<vscale x 4 x i1> %0, <vscale x 4 x double> %1, <vscale x 4 x double> %2, i64 %3) nounwind {
-; CHECK-LABEL: intrinsic_vfrece7_mask_v_nxv4f64_nxv4f64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, a0, e64,m4,tu,mu
-; CHECK-NEXT:    vfrece7.v v8, v12, v0.t
-; CHECK-NEXT:    jalr zero, 0(ra)
-entry:
-  %a = call <vscale x 4 x double> @llvm.riscv.vfrece7.mask.nxv4f64(
-    <vscale x 4 x double> %1,
-    <vscale x 4 x double> %2,
-    <vscale x 4 x i1> %0,
-    i64 %3)
-
-  ret <vscale x 4 x double> %a
-}
-
-declare <vscale x 8 x double> @llvm.riscv.vfrece7.nxv8f64(
-  <vscale x 8 x double>,
-  i64);
-
-define <vscale x 8 x double> @intrinsic_vfrece7_v_nxv8f64_nxv8f64(<vscale x 8 x double> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vfrece7_v_nxv8f64_nxv8f64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, a0, e64,m8,ta,mu
-; CHECK-NEXT:    vfrece7.v v8, v8
-; CHECK-NEXT:    jalr zero, 0(ra)
-entry:
-  %a = call <vscale x 8 x double> @llvm.riscv.vfrece7.nxv8f64(
-    <vscale x 8 x double> %0,
-    i64 %1)
-
-  ret <vscale x 8 x double> %a
-}
-
-declare <vscale x 8 x double> @llvm.riscv.vfrece7.mask.nxv8f64(
-  <vscale x 8 x double>,
-  <vscale x 8 x double>,
-  <vscale x 8 x i1>,
-  i64);
-
-define <vscale x 8 x double> @intrinsic_vfrece7_mask_v_nxv8f64_nxv8f64(<vscale x 8 x i1> %0, <vscale x 8 x double> %1, <vscale x 8 x double> %2, i64 %3) nounwind {
-; CHECK-LABEL: intrinsic_vfrece7_mask_v_nxv8f64_nxv8f64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, a0, e64,m8,tu,mu
-; CHECK-NEXT:    vfrece7.v v8, v16, v0.t
-; CHECK-NEXT:    jalr zero, 0(ra)
-entry:
-  %a = call <vscale x 8 x double> @llvm.riscv.vfrece7.mask.nxv8f64(
-    <vscale x 8 x double> %1,
-    <vscale x 8 x double> %2,
-    <vscale x 8 x i1> %0,
-    i64 %3)
-
-  ret <vscale x 8 x double> %a
-}
diff --git a/llvm/test/CodeGen/RISCV/vfrsqrte7-rv32.ll b/llvm/test/CodeGen/RISCV/vfrsqrte7-rv32.ll
deleted file mode 100644
index 083b411ef3c8..000000000000
--- a/llvm/test/CodeGen/RISCV/vfrsqrte7-rv32.ll
+++ /dev/null
@@ -1,602 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+experimental-v,+d,+experimental-zfh -verify-machineinstrs \
-; RUN:   --riscv-no-aliases < %s | FileCheck %s
-declare <vscale x 1 x half> @llvm.riscv.vfrsqrte7.nxv1f16(
-  <vscale x 1 x half>,
-  i32);
-
-define <vscale x 1 x half> @intrinsic_vfrsqrte7_v_nxv1f16_nxv1f16(<vscale x 1 x half> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vfrsqrte7_v_nxv1f16_nxv1f16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, a0, e16,mf4,ta,mu
-; CHECK-NEXT:    vfrsqrte7.v v8, v8
-; CHECK-NEXT:    jalr zero, 0(ra)
-entry:
-  %a = call <vscale x 1 x half> @llvm.riscv.vfrsqrte7.nxv1f16(
-    <vscale x 1 x half> %0,
-    i32 %1)
-
-  ret <vscale x 1 x half> %a
-}
-
-declare <vscale x 1 x half> @llvm.riscv.vfrsqrte7.mask.nxv1f16(
-  <vscale x 1 x half>,
-  <vscale x 1 x half>,
-  <vscale x 1 x i1>,
-  i32);
-
-define <vscale x 1 x half> @intrinsic_vfrsqrte7_mask_v_nxv1f16_nxv1f16(<vscale x 1 x i1> %0, <vscale x 1 x half> %1, <vscale x 1 x half> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vfrsqrte7_mask_v_nxv1f16_nxv1f16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, a0, e16,mf4,tu,mu
-; CHECK-NEXT:    vfrsqrte7.v v8, v9, v0.t
-; CHECK-NEXT:    jalr zero, 0(ra)
-entry:
-  %a = call <vscale x 1 x half> @llvm.riscv.vfrsqrte7.mask.nxv1f16(
-    <vscale x 1 x half> %1,
-    <vscale x 1 x half> %2,
-    <vscale x 1 x i1> %0,
-    i32 %3)
-
-  ret <vscale x 1 x half> %a
-}
-
-declare <vscale x 2 x half> @llvm.riscv.vfrsqrte7.nxv2f16(
-  <vscale x 2 x half>,
-  i32);
-
-define <vscale x 2 x half> @intrinsic_vfrsqrte7_v_nxv2f16_nxv2f16(<vscale x 2 x half> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vfrsqrte7_v_nxv2f16_nxv2f16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, a0, e16,mf2,ta,mu
-; CHECK-NEXT:    vfrsqrte7.v v8, v8
-; CHECK-NEXT:    jalr zero, 0(ra)
-entry:
-  %a = call <vscale x 2 x half> @llvm.riscv.vfrsqrte7.nxv2f16(
-    <vscale x 2 x half> %0,
-    i32 %1)
-
-  ret <vscale x 2 x half> %a
-}
-
-declare <vscale x 2 x half> @llvm.riscv.vfrsqrte7.mask.nxv2f16(
-  <vscale x 2 x half>,
-  <vscale x 2 x half>,
-  <vscale x 2 x i1>,
-  i32);
-
-define <vscale x 2 x half> @intrinsic_vfrsqrte7_mask_v_nxv2f16_nxv2f16(<vscale x 2 x i1> %0, <vscale x 2 x half> %1, <vscale x 2 x half> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vfrsqrte7_mask_v_nxv2f16_nxv2f16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, a0, e16,mf2,tu,mu
-; CHECK-NEXT:    vfrsqrte7.v v8, v9, v0.t
-; CHECK-NEXT:    jalr zero, 0(ra)
-entry:
-  %a = call <vscale x 2 x half> @llvm.riscv.vfrsqrte7.mask.nxv2f16(
-    <vscale x 2 x half> %1,
-    <vscale x 2 x half> %2,
-    <vscale x 2 x i1> %0,
-    i32 %3)
-
-  ret <vscale x 2 x half> %a
-}
-
-declare <vscale x 4 x half> @llvm.riscv.vfrsqrte7.nxv4f16(
-  <vscale x 4 x half>,
-  i32);
-
-define <vscale x 4 x half> @intrinsic_vfrsqrte7_v_nxv4f16_nxv4f16(<vscale x 4 x half> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vfrsqrte7_v_nxv4f16_nxv4f16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, a0, e16,m1,ta,mu
-; CHECK-NEXT:    vfrsqrte7.v v8, v8
-; CHECK-NEXT:    jalr zero, 0(ra)
-entry:
-  %a = call <vscale x 4 x half> @llvm.riscv.vfrsqrte7.nxv4f16(
-    <vscale x 4 x half> %0,
-    i32 %1)
-
-  ret <vscale x 4 x half> %a
-}
-
-declare <vscale x 4 x half> @llvm.riscv.vfrsqrte7.mask.nxv4f16(
-  <vscale x 4 x half>,
-  <vscale x 4 x half>,
-  <vscale x 4 x i1>,
-  i32);
-
-define <vscale x 4 x half> @intrinsic_vfrsqrte7_mask_v_nxv4f16_nxv4f16(<vscale x 4 x i1> %0, <vscale x 4 x half> %1, <vscale x 4 x half> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vfrsqrte7_mask_v_nxv4f16_nxv4f16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, a0, e16,m1,tu,mu
-; CHECK-NEXT:    vfrsqrte7.v v8, v9, v0.t
-; CHECK-NEXT:    jalr zero, 0(ra)
-entry:
-  %a = call <vscale x 4 x half> @llvm.riscv.vfrsqrte7.mask.nxv4f16(
-    <vscale x 4 x half> %1,
-    <vscale x 4 x half> %2,
-    <vscale x 4 x i1> %0,
-    i32 %3)
-
-  ret <vscale x 4 x half> %a
-}
-
-declare <vscale x 8 x half> @llvm.riscv.vfrsqrte7.nxv8f16(
-  <vscale x 8 x half>,
-  i32);
-
-define <vscale x 8 x half> @intrinsic_vfrsqrte7_v_nxv8f16_nxv8f16(<vscale x 8 x half> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vfrsqrte7_v_nxv8f16_nxv8f16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, a0, e16,m2,ta,mu
-; CHECK-NEXT:    vfrsqrte7.v v8, v8
-; CHECK-NEXT:    jalr zero, 0(ra)
-entry:
-  %a = call <vscale x 8 x half> @llvm.riscv.vfrsqrte7.nxv8f16(
-    <vscale x 8 x half> %0,
-    i32 %1)
-
-  ret <vscale x 8 x half> %a
-}
-
-declare <vscale x 8 x half> @llvm.riscv.vfrsqrte7.mask.nxv8f16(
-  <vscale x 8 x half>,
-  <vscale x 8 x half>,
-  <vscale x 8 x i1>,
-  i32);
-
-define <vscale x 8 x half> @intrinsic_vfrsqrte7_mask_v_nxv8f16_nxv8f16(<vscale x 8 x i1> %0, <vscale x 8 x half> %1, <vscale x 8 x half> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vfrsqrte7_mask_v_nxv8f16_nxv8f16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, a0, e16,m2,tu,mu
-; CHECK-NEXT:    vfrsqrte7.v v8, v10, v0.t
-; CHECK-NEXT:    jalr zero, 0(ra)
-entry:
-  %a = call <vscale x 8 x half> @llvm.riscv.vfrsqrte7.mask.nxv8f16(
-    <vscale x 8 x half> %1,
-    <vscale x 8 x half> %2,
-    <vscale x 8 x i1> %0,
-    i32 %3)
-
-  ret <vscale x 8 x half> %a
-}
-
-declare <vscale x 16 x half> @llvm.riscv.vfrsqrte7.nxv16f16(
-  <vscale x 16 x half>,
-  i32);
-
-define <vscale x 16 x half> @intrinsic_vfrsqrte7_v_nxv16f16_nxv16f16(<vscale x 16 x half> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vfrsqrte7_v_nxv16f16_nxv16f16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, a0, e16,m4,ta,mu
-; CHECK-NEXT:    vfrsqrte7.v v8, v8
-; CHECK-NEXT:    jalr zero, 0(ra)
-entry:
-  %a = call <vscale x 16 x half> @llvm.riscv.vfrsqrte7.nxv16f16(
-    <vscale x 16 x half> %0,
-    i32 %1)
-
-  ret <vscale x 16 x half> %a
-}
-
-declare <vscale x 16 x half> @llvm.riscv.vfrsqrte7.mask.nxv16f16(
-  <vscale x 16 x half>,
-  <vscale x 16 x half>,
-  <vscale x 16 x i1>,
-  i32);
-
-define <vscale x 16 x half> @intrinsic_vfrsqrte7_mask_v_nxv16f16_nxv16f16(<vscale x 16 x i1> %0, <vscale x 16 x half> %1, <vscale x 16 x half> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vfrsqrte7_mask_v_nxv16f16_nxv16f16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, a0, e16,m4,tu,mu
-; CHECK-NEXT:    vfrsqrte7.v v8, v12, v0.t
-; CHECK-NEXT:    jalr zero, 0(ra)
-entry:
-  %a = call <vscale x 16 x half> @llvm.riscv.vfrsqrte7.mask.nxv16f16(
-    <vscale x 16 x half> %1,
-    <vscale x 16 x half> %2,
-    <vscale x 16 x i1> %0,
-    i32 %3)
-
-  ret <vscale x 16 x half> %a
-}
-
-declare <vscale x 32 x half> @llvm.riscv.vfrsqrte7.nxv32f16(
-  <vscale x 32 x half>,
-  i32);
-
-define <vscale x 32 x half> @intrinsic_vfrsqrte7_v_nxv32f16_nxv32f16(<vscale x 32 x half> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vfrsqrte7_v_nxv32f16_nxv32f16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, a0, e16,m8,ta,mu
-; CHECK-NEXT:    vfrsqrte7.v v8, v8
-; CHECK-NEXT:    jalr zero, 0(ra)
-entry:
-  %a = call <vscale x 32 x half> @llvm.riscv.vfrsqrte7.nxv32f16(
-    <vscale x 32 x half> %0,
-    i32 %1)
-
-  ret <vscale x 32 x half> %a
-}
-
-declare <vscale x 32 x half> @llvm.riscv.vfrsqrte7.mask.nxv32f16(
-  <vscale x 32 x half>,
-  <vscale x 32 x half>,
-  <vscale x 32 x i1>,
-  i32);
-
-define <vscale x 32 x half> @intrinsic_vfrsqrte7_mask_v_nxv32f16_nxv32f16(<vscale x 32 x i1> %0, <vscale x 32 x half> %1, <vscale x 32 x half> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vfrsqrte7_mask_v_nxv32f16_nxv32f16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, a0, e16,m8,tu,mu
-; CHECK-NEXT:    vfrsqrte7.v v8, v16, v0.t
-; CHECK-NEXT:    jalr zero, 0(ra)
-entry:
-  %a = call <vscale x 32 x half> @llvm.riscv.vfrsqrte7.mask.nxv32f16(
-    <vscale x 32 x half> %1,
-    <vscale x 32 x half> %2,
-    <vscale x 32 x i1> %0,
-    i32 %3)
-
-  ret <vscale x 32 x half> %a
-}
-
-declare <vscale x 1 x float> @llvm.riscv.vfrsqrte7.nxv1f32(
-  <vscale x 1 x float>,
-  i32);
-
-define <vscale x 1 x float> @intrinsic_vfrsqrte7_v_nxv1f32_nxv1f32(<vscale x 1 x float> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vfrsqrte7_v_nxv1f32_nxv1f32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, a0, e32,mf2,ta,mu
-; CHECK-NEXT:    vfrsqrte7.v v8, v8
-; CHECK-NEXT:    jalr zero, 0(ra)
-entry:
-  %a = call <vscale x 1 x float> @llvm.riscv.vfrsqrte7.nxv1f32(
-    <vscale x 1 x float> %0,
-    i32 %1)
-
-  ret <vscale x 1 x float> %a
-}
-
-declare <vscale x 1 x float> @llvm.riscv.vfrsqrte7.mask.nxv1f32(
-  <vscale x 1 x float>,
-  <vscale x 1 x float>,
-  <vscale x 1 x i1>,
-  i32);
-
-define <vscale x 1 x float> @intrinsic_vfrsqrte7_mask_v_nxv1f32_nxv1f32(<vscale x 1 x i1> %0, <vscale x 1 x float> %1, <vscale x 1 x float> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vfrsqrte7_mask_v_nxv1f32_nxv1f32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, a0, e32,mf2,tu,mu
-; CHECK-NEXT:    vfrsqrte7.v v8, v9, v0.t
-; CHECK-NEXT:    jalr zero, 0(ra)
-entry:
-  %a = call <vscale x 1 x float> @llvm.riscv.vfrsqrte7.mask.nxv1f32(
-    <vscale x 1 x float> %1,
-    <vscale x 1 x float> %2,
-    <vscale x 1 x i1> %0,
-    i32 %3)
-
-  ret <vscale x 1 x float> %a
-}
-
-declare <vscale x 2 x float> @llvm.riscv.vfrsqrte7.nxv2f32(
-  <vscale x 2 x float>,
-  i32);
-
-define <vscale x 2 x float> @intrinsic_vfrsqrte7_v_nxv2f32_nxv2f32(<vscale x 2 x float> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vfrsqrte7_v_nxv2f32_nxv2f32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, a0, e32,m1,ta,mu
-; CHECK-NEXT:    vfrsqrte7.v v8, v8
-; CHECK-NEXT:    jalr zero, 0(ra)
-entry:
-  %a = call <vscale x 2 x float> @llvm.riscv.vfrsqrte7.nxv2f32(
-    <vscale x 2 x float> %0,
-    i32 %1)
-
-  ret <vscale x 2 x float> %a
-}
-
-declare <vscale x 2 x float> @llvm.riscv.vfrsqrte7.mask.nxv2f32(
-  <vscale x 2 x float>,
-  <vscale x 2 x float>,
-  <vscale x 2 x i1>,
-  i32);
-
-define <vscale x 2 x float> @intrinsic_vfrsqrte7_mask_v_nxv2f32_nxv2f32(<vscale x 2 x i1> %0, <vscale x 2 x float> %1, <vscale x 2 x float> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vfrsqrte7_mask_v_nxv2f32_nxv2f32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, a0, e32,m1,tu,mu
-; CHECK-NEXT:    vfrsqrte7.v v8, v9, v0.t
-; CHECK-NEXT:    jalr zero, 0(ra)
-entry:
-  %a = call <vscale x 2 x float> @llvm.riscv.vfrsqrte7.mask.nxv2f32(
-    <vscale x 2 x float> %1,
-    <vscale x 2 x float> %2,
-    <vscale x 2 x i1> %0,
-    i32 %3)
-
-  ret <vscale x 2 x float> %a
-}
-
-declare <vscale x 4 x float> @llvm.riscv.vfrsqrte7.nxv4f32(
-  <vscale x 4 x float>,
-  i32);
-
-define <vscale x 4 x float> @intrinsic_vfrsqrte7_v_nxv4f32_nxv4f32(<vscale x 4 x float> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vfrsqrte7_v_nxv4f32_nxv4f32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, a0, e32,m2,ta,mu
-; CHECK-NEXT:    vfrsqrte7.v v8, v8
-; CHECK-NEXT:    jalr zero, 0(ra)
-entry:
-  %a = call <vscale x 4 x float> @llvm.riscv.vfrsqrte7.nxv4f32(
-    <vscale x 4 x float> %0,
-    i32 %1)
-
-  ret <vscale x 4 x float> %a
-}
-
-declare <vscale x 4 x float> @llvm.riscv.vfrsqrte7.mask.nxv4f32(
-  <vscale x 4 x float>,
-  <vscale x 4 x float>,
-  <vscale x 4 x i1>,
-  i32);
-
-define <vscale x 4 x float> @intrinsic_vfrsqrte7_mask_v_nxv4f32_nxv4f32(<vscale x 4 x i1> %0, <vscale x 4 x float> %1, <vscale x 4 x float> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vfrsqrte7_mask_v_nxv4f32_nxv4f32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, a0, e32,m2,tu,mu
-; CHECK-NEXT:    vfrsqrte7.v v8, v10, v0.t
-; CHECK-NEXT:    jalr zero, 0(ra)
-entry:
-  %a = call <vscale x 4 x float> @llvm.riscv.vfrsqrte7.mask.nxv4f32(
-    <vscale x 4 x float> %1,
-    <vscale x 4 x float> %2,
-    <vscale x 4 x i1> %0,
-    i32 %3)
-
-  ret <vscale x 4 x float> %a
-}
-
-declare <vscale x 8 x float> @llvm.riscv.vfrsqrte7.nxv8f32(
-  <vscale x 8 x float>,
-  i32);
-
-define <vscale x 8 x float> @intrinsic_vfrsqrte7_v_nxv8f32_nxv8f32(<vscale x 8 x float> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vfrsqrte7_v_nxv8f32_nxv8f32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, a0, e32,m4,ta,mu
-; CHECK-NEXT:    vfrsqrte7.v v8, v8
-; CHECK-NEXT:    jalr zero, 0(ra)
-entry:
-  %a = call <vscale x 8 x float> @llvm.riscv.vfrsqrte7.nxv8f32(
-    <vscale x 8 x float> %0,
-    i32 %1)
-
-  ret <vscale x 8 x float> %a
-}
-
-declare <vscale x 8 x float> @llvm.riscv.vfrsqrte7.mask.nxv8f32(
-  <vscale x 8 x float>,
-  <vscale x 8 x float>,
-  <vscale x 8 x i1>,
-  i32);
-
-define <vscale x 8 x float> @intrinsic_vfrsqrte7_mask_v_nxv8f32_nxv8f32(<vscale x 8 x i1> %0, <vscale x 8 x float> %1, <vscale x 8 x float> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vfrsqrte7_mask_v_nxv8f32_nxv8f32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, a0, e32,m4,tu,mu
-; CHECK-NEXT:    vfrsqrte7.v v8, v12, v0.t
-; CHECK-NEXT:    jalr zero, 0(ra)
-entry:
-  %a = call <vscale x 8 x float> @llvm.riscv.vfrsqrte7.mask.nxv8f32(
-    <vscale x 8 x float> %1,
-    <vscale x 8 x float> %2,
-    <vscale x 8 x i1> %0,
-    i32 %3)
-
-  ret <vscale x 8 x float> %a
-}
-
-declare <vscale x 16 x float> @llvm.riscv.vfrsqrte7.nxv16f32(
-  <vscale x 16 x float>,
-  i32);
-
-define <vscale x 16 x float> @intrinsic_vfrsqrte7_v_nxv16f32_nxv16f32(<vscale x 16 x float> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vfrsqrte7_v_nxv16f32_nxv16f32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, a0, e32,m8,ta,mu
-; CHECK-NEXT:    vfrsqrte7.v v8, v8
-; CHECK-NEXT:    jalr zero, 0(ra)
-entry:
-  %a = call <vscale x 16 x float> @llvm.riscv.vfrsqrte7.nxv16f32(
-    <vscale x 16 x float> %0,
-    i32 %1)
-
-  ret <vscale x 16 x float> %a
-}
-
-declare <vscale x 16 x float> @llvm.riscv.vfrsqrte7.mask.nxv16f32(
-  <vscale x 16 x float>,
-  <vscale x 16 x float>,
-  <vscale x 16 x i1>,
-  i32);
-
-define <vscale x 16 x float> @intrinsic_vfrsqrte7_mask_v_nxv16f32_nxv16f32(<vscale x 16 x i1> %0, <vscale x 16 x float> %1, <vscale x 16 x float> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vfrsqrte7_mask_v_nxv16f32_nxv16f32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, a0, e32,m8,tu,mu
-; CHECK-NEXT:    vfrsqrte7.v v8, v16, v0.t
-; CHECK-NEXT:    jalr zero, 0(ra)
-entry:
-  %a = call <vscale x 16 x float> @llvm.riscv.vfrsqrte7.mask.nxv16f32(
-    <vscale x 16 x float> %1,
-    <vscale x 16 x float> %2,
-    <vscale x 16 x i1> %0,
-    i32 %3)
-
-  ret <vscale x 16 x float> %a
-}
-
-declare <vscale x 1 x double> @llvm.riscv.vfrsqrte7.nxv1f64(
-  <vscale x 1 x double>,
-  i32);
-
-define <vscale x 1 x double> @intrinsic_vfrsqrte7_v_nxv1f64_nxv1f64(<vscale x 1 x double> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vfrsqrte7_v_nxv1f64_nxv1f64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, a0, e64,m1,ta,mu
-; CHECK-NEXT:    vfrsqrte7.v v8, v8
-; CHECK-NEXT:    jalr zero, 0(ra)
-entry:
-  %a = call <vscale x 1 x double> @llvm.riscv.vfrsqrte7.nxv1f64(
-    <vscale x 1 x double> %0,
-    i32 %1)
-
-  ret <vscale x 1 x double> %a
-}
-
-declare <vscale x 1 x double> @llvm.riscv.vfrsqrte7.mask.nxv1f64(
-  <vscale x 1 x double>,
-  <vscale x 1 x double>,
-  <vscale x 1 x i1>,
-  i32);
-
-define <vscale x 1 x double> @intrinsic_vfrsqrte7_mask_v_nxv1f64_nxv1f64(<vscale x 1 x i1> %0, <vscale x 1 x double> %1, <vscale x 1 x double> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vfrsqrte7_mask_v_nxv1f64_nxv1f64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, a0, e64,m1,tu,mu
-; CHECK-NEXT:    vfrsqrte7.v v8, v9, v0.t
-; CHECK-NEXT:    jalr zero, 0(ra)
-entry:
-  %a = call <vscale x 1 x double> @llvm.riscv.vfrsqrte7.mask.nxv1f64(
-    <vscale x 1 x double> %1,
-    <vscale x 1 x double> %2,
-    <vscale x 1 x i1> %0,
-    i32 %3)
-
-  ret <vscale x 1 x double> %a
-}
-
-declare <vscale x 2 x double> @llvm.riscv.vfrsqrte7.nxv2f64(
-  <vscale x 2 x double>,
-  i32);
-
-define <vscale x 2 x double> @intrinsic_vfrsqrte7_v_nxv2f64_nxv2f64(<vscale x 2 x double> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vfrsqrte7_v_nxv2f64_nxv2f64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, a0, e64,m2,ta,mu
-; CHECK-NEXT:    vfrsqrte7.v v8, v8
-; CHECK-NEXT:    jalr zero, 0(ra)
-entry:
-  %a = call <vscale x 2 x double> @llvm.riscv.vfrsqrte7.nxv2f64(
-    <vscale x 2 x double> %0,
-    i32 %1)
-
-  ret <vscale x 2 x double> %a
-}
-
-declare <vscale x 2 x double> @llvm.riscv.vfrsqrte7.mask.nxv2f64(
-  <vscale x 2 x double>,
-  <vscale x 2 x double>,
-  <vscale x 2 x i1>,
-  i32);
-
-define <vscale x 2 x double> @intrinsic_vfrsqrte7_mask_v_nxv2f64_nxv2f64(<vscale x 2 x i1> %0, <vscale x 2 x double> %1, <vscale x 2 x double> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vfrsqrte7_mask_v_nxv2f64_nxv2f64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, a0, e64,m2,tu,mu
-; CHECK-NEXT:    vfrsqrte7.v v8, v10, v0.t
-; CHECK-NEXT:    jalr zero, 0(ra)
-entry:
-  %a = call <vscale x 2 x double> @llvm.riscv.vfrsqrte7.mask.nxv2f64(
-    <vscale x 2 x double> %1,
-    <vscale x 2 x double> %2,
-    <vscale x 2 x i1> %0,
-    i32 %3)
-
-  ret <vscale x 2 x double> %a
-}
-
-declare <vscale x 4 x double> @llvm.riscv.vfrsqrte7.nxv4f64(
-  <vscale x 4 x double>,
-  i32);
-
-define <vscale x 4 x double> @intrinsic_vfrsqrte7_v_nxv4f64_nxv4f64(<vscale x 4 x double> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vfrsqrte7_v_nxv4f64_nxv4f64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, a0, e64,m4,ta,mu
-; CHECK-NEXT:    vfrsqrte7.v v8, v8
-; CHECK-NEXT:    jalr zero, 0(ra)
-entry:
-  %a = call <vscale x 4 x double> @llvm.riscv.vfrsqrte7.nxv4f64(
-    <vscale x 4 x double> %0,
-    i32 %1)
-
-  ret <vscale x 4 x double> %a
-}
-
-declare <vscale x 4 x double> @llvm.riscv.vfrsqrte7.mask.nxv4f64(
-  <vscale x 4 x double>,
-  <vscale x 4 x double>,
-  <vscale x 4 x i1>,
-  i32);
-
-define <vscale x 4 x double> @intrinsic_vfrsqrte7_mask_v_nxv4f64_nxv4f64(<vscale x 4 x i1> %0, <vscale x 4 x double> %1, <vscale x 4 x double> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vfrsqrte7_mask_v_nxv4f64_nxv4f64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, a0, e64,m4,tu,mu
-; CHECK-NEXT:    vfrsqrte7.v v8, v12, v0.t
-; CHECK-NEXT:    jalr zero, 0(ra)
-entry:
-  %a = call <vscale x 4 x double> @llvm.riscv.vfrsqrte7.mask.nxv4f64(
-    <vscale x 4 x double> %1,
-    <vscale x 4 x double> %2,
-    <vscale x 4 x i1> %0,
-    i32 %3)
-
-  ret <vscale x 4 x double> %a
-}
-
-declare <vscale x 8 x double> @llvm.riscv.vfrsqrte7.nxv8f64(
-  <vscale x 8 x double>,
-  i32);
-
-define <vscale x 8 x double> @intrinsic_vfrsqrte7_v_nxv8f64_nxv8f64(<vscale x 8 x double> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vfrsqrte7_v_nxv8f64_nxv8f64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, a0, e64,m8,ta,mu
-; CHECK-NEXT:    vfrsqrte7.v v8, v8
-; CHECK-NEXT:    jalr zero, 0(ra)
-entry:
-  %a = call <vscale x 8 x double> @llvm.riscv.vfrsqrte7.nxv8f64(
-    <vscale x 8 x double> %0,
-    i32 %1)
-
-  ret <vscale x 8 x double> %a
-}
-
-declare <vscale x 8 x double> @llvm.riscv.vfrsqrte7.mask.nxv8f64(
-  <vscale x 8 x double>,
-  <vscale x 8 x double>,
-  <vscale x 8 x i1>,
-  i32);
-
-define <vscale x 8 x double> @intrinsic_vfrsqrte7_mask_v_nxv8f64_nxv8f64(<vscale x 8 x i1> %0, <vscale x 8 x double> %1, <vscale x 8 x double> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vfrsqrte7_mask_v_nxv8f64_nxv8f64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, a0, e64,m8,tu,mu
-; CHECK-NEXT:    vfrsqrte7.v v8, v16, v0.t
-; CHECK-NEXT:    jalr zero, 0(ra)
-entry:
-  %a = call <vscale x 8 x double> @llvm.riscv.vfrsqrte7.mask.nxv8f64(
-    <vscale x 8 x double> %1,
-    <vscale x 8 x double> %2,
-    <vscale x 8 x i1> %0,
-    i32 %3)
-
-  ret <vscale x 8 x double> %a
-}
diff --git a/llvm/test/CodeGen/RISCV/vfrsqrte7-rv64.ll b/llvm/test/CodeGen/RISCV/vfrsqrte7-rv64.ll
deleted file mode 100644
index d0f4c9c4ac58..000000000000
--- a/llvm/test/CodeGen/RISCV/vfrsqrte7-rv64.ll
+++ /dev/null
@@ -1,602 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv64 -mattr=+experimental-v,+d,+experimental-zfh -verify-machineinstrs \
-; RUN:   --riscv-no-aliases < %s | FileCheck %s
-declare <vscale x 1 x half> @llvm.riscv.vfrsqrte7.nxv1f16(
-  <vscale x 1 x half>,
-  i64);
-
-define <vscale x 1 x half> @intrinsic_vfrsqrte7_v_nxv1f16_nxv1f16(<vscale x 1 x half> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vfrsqrte7_v_nxv1f16_nxv1f16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, a0, e16,mf4,ta,mu
-; CHECK-NEXT:    vfrsqrte7.v v8, v8
-; CHECK-NEXT:    jalr zero, 0(ra)
-entry:
-  %a = call <vscale x 1 x half> @llvm.riscv.vfrsqrte7.nxv1f16(
-    <vscale x 1 x half> %0,
-    i64 %1)
-
-  ret <vscale x 1 x half> %a
-}
-
-declare <vscale x 1 x half> @llvm.riscv.vfrsqrte7.mask.nxv1f16(
-  <vscale x 1 x half>,
-  <vscale x 1 x half>,
-  <vscale x 1 x i1>,
-  i64);
-
-define <vscale x 1 x half> @intrinsic_vfrsqrte7_mask_v_nxv1f16_nxv1f16(<vscale x 1 x i1> %0, <vscale x 1 x half> %1, <vscale x 1 x half> %2, i64 %3) nounwind {
-; CHECK-LABEL: intrinsic_vfrsqrte7_mask_v_nxv1f16_nxv1f16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, a0, e16,mf4,tu,mu
-; CHECK-NEXT:    vfrsqrte7.v v8, v9, v0.t
-; CHECK-NEXT:    jalr zero, 0(ra)
-entry:
-  %a = call <vscale x 1 x half> @llvm.riscv.vfrsqrte7.mask.nxv1f16(
-    <vscale x 1 x half> %1,
-    <vscale x 1 x half> %2,
-    <vscale x 1 x i1> %0,
-    i64 %3)
-
-  ret <vscale x 1 x half> %a
-}
-
-declare <vscale x 2 x half> @llvm.riscv.vfrsqrte7.nxv2f16(
-  <vscale x 2 x half>,
-  i64);
-
-define <vscale x 2 x half> @intrinsic_vfrsqrte7_v_nxv2f16_nxv2f16(<vscale x 2 x half> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vfrsqrte7_v_nxv2f16_nxv2f16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, a0, e16,mf2,ta,mu
-; CHECK-NEXT:    vfrsqrte7.v v8, v8
-; CHECK-NEXT:    jalr zero, 0(ra)
-entry:
-  %a = call <vscale x 2 x half> @llvm.riscv.vfrsqrte7.nxv2f16(
-    <vscale x 2 x half> %0,
-    i64 %1)
-
-  ret <vscale x 2 x half> %a
-}
-
-declare <vscale x 2 x half> @llvm.riscv.vfrsqrte7.mask.nxv2f16(
-  <vscale x 2 x half>,
-  <vscale x 2 x half>,
-  <vscale x 2 x i1>,
-  i64);
-
-define <vscale x 2 x half> @intrinsic_vfrsqrte7_mask_v_nxv2f16_nxv2f16(<vscale x 2 x i1> %0, <vscale x 2 x half> %1, <vscale x 2 x half> %2, i64 %3) nounwind {
-; CHECK-LABEL: intrinsic_vfrsqrte7_mask_v_nxv2f16_nxv2f16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, a0, e16,mf2,tu,mu
-; CHECK-NEXT:    vfrsqrte7.v v8, v9, v0.t
-; CHECK-NEXT:    jalr zero, 0(ra)
-entry:
-  %a = call <vscale x 2 x half> @llvm.riscv.vfrsqrte7.mask.nxv2f16(
-    <vscale x 2 x half> %1,
-    <vscale x 2 x half> %2,
-    <vscale x 2 x i1> %0,
-    i64 %3)
-
-  ret <vscale x 2 x half> %a
-}
-
-declare <vscale x 4 x half> @llvm.riscv.vfrsqrte7.nxv4f16(
-  <vscale x 4 x half>,
-  i64);
-
-define <vscale x 4 x half> @intrinsic_vfrsqrte7_v_nxv4f16_nxv4f16(<vscale x 4 x half> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vfrsqrte7_v_nxv4f16_nxv4f16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, a0, e16,m1,ta,mu
-; CHECK-NEXT:    vfrsqrte7.v v8, v8
-; CHECK-NEXT:    jalr zero, 0(ra)
-entry:
-  %a = call <vscale x 4 x half> @llvm.riscv.vfrsqrte7.nxv4f16(
-    <vscale x 4 x half> %0,
-    i64 %1)
-
-  ret <vscale x 4 x half> %a
-}
-
-declare <vscale x 4 x half> @llvm.riscv.vfrsqrte7.mask.nxv4f16(
-  <vscale x 4 x half>,
-  <vscale x 4 x half>,
-  <vscale x 4 x i1>,
-  i64);
-
-define <vscale x 4 x half> @intrinsic_vfrsqrte7_mask_v_nxv4f16_nxv4f16(<vscale x 4 x i1> %0, <vscale x 4 x half> %1, <vscale x 4 x half> %2, i64 %3) nounwind {
-; CHECK-LABEL: intrinsic_vfrsqrte7_mask_v_nxv4f16_nxv4f16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, a0, e16,m1,tu,mu
-; CHECK-NEXT:    vfrsqrte7.v v8, v9, v0.t
-; CHECK-NEXT:    jalr zero, 0(ra)
-entry:
-  %a = call <vscale x 4 x half> @llvm.riscv.vfrsqrte7.mask.nxv4f16(
-    <vscale x 4 x half> %1,
-    <vscale x 4 x half> %2,
-    <vscale x 4 x i1> %0,
-    i64 %3)
-
-  ret <vscale x 4 x half> %a
-}
-
-declare <vscale x 8 x half> @llvm.riscv.vfrsqrte7.nxv8f16(
-  <vscale x 8 x half>,
-  i64);
-
-define <vscale x 8 x half> @intrinsic_vfrsqrte7_v_nxv8f16_nxv8f16(<vscale x 8 x half> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vfrsqrte7_v_nxv8f16_nxv8f16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, a0, e16,m2,ta,mu
-; CHECK-NEXT:    vfrsqrte7.v v8, v8
-; CHECK-NEXT:    jalr zero, 0(ra)
-entry:
-  %a = call <vscale x 8 x half> @llvm.riscv.vfrsqrte7.nxv8f16(
-    <vscale x 8 x half> %0,
-    i64 %1)
-
-  ret <vscale x 8 x half> %a
-}
-
-declare <vscale x 8 x half> @llvm.riscv.vfrsqrte7.mask.nxv8f16(
-  <vscale x 8 x half>,
-  <vscale x 8 x half>,
-  <vscale x 8 x i1>,
-  i64);
-
-define <vscale x 8 x half> @intrinsic_vfrsqrte7_mask_v_nxv8f16_nxv8f16(<vscale x 8 x i1> %0, <vscale x 8 x half> %1, <vscale x 8 x half> %2, i64 %3) nounwind {
-; CHECK-LABEL: intrinsic_vfrsqrte7_mask_v_nxv8f16_nxv8f16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, a0, e16,m2,tu,mu
-; CHECK-NEXT:    vfrsqrte7.v v8, v10, v0.t
-; CHECK-NEXT:    jalr zero, 0(ra)
-entry:
-  %a = call <vscale x 8 x half> @llvm.riscv.vfrsqrte7.mask.nxv8f16(
-    <vscale x 8 x half> %1,
-    <vscale x 8 x half> %2,
-    <vscale x 8 x i1> %0,
-    i64 %3)
-
-  ret <vscale x 8 x half> %a
-}
-
-declare <vscale x 16 x half> @llvm.riscv.vfrsqrte7.nxv16f16(
-  <vscale x 16 x half>,
-  i64);
-
-define <vscale x 16 x half> @intrinsic_vfrsqrte7_v_nxv16f16_nxv16f16(<vscale x 16 x half> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vfrsqrte7_v_nxv16f16_nxv16f16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, a0, e16,m4,ta,mu
-; CHECK-NEXT:    vfrsqrte7.v v8, v8
-; CHECK-NEXT:    jalr zero, 0(ra)
-entry:
-  %a = call <vscale x 16 x half> @llvm.riscv.vfrsqrte7.nxv16f16(
-    <vscale x 16 x half> %0,
-    i64 %1)
-
-  ret <vscale x 16 x half> %a
-}
-
-declare <vscale x 16 x half> @llvm.riscv.vfrsqrte7.mask.nxv16f16(
-  <vscale x 16 x half>,
-  <vscale x 16 x half>,
-  <vscale x 16 x i1>,
-  i64);
-
-define <vscale x 16 x half> @intrinsic_vfrsqrte7_mask_v_nxv16f16_nxv16f16(<vscale x 16 x i1> %0, <vscale x 16 x half> %1, <vscale x 16 x half> %2, i64 %3) nounwind {
-; CHECK-LABEL: intrinsic_vfrsqrte7_mask_v_nxv16f16_nxv16f16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, a0, e16,m4,tu,mu
-; CHECK-NEXT:    vfrsqrte7.v v8, v12, v0.t
-; CHECK-NEXT:    jalr zero, 0(ra)
-entry:
-  %a = call <vscale x 16 x half> @llvm.riscv.vfrsqrte7.mask.nxv16f16(
-    <vscale x 16 x half> %1,
-    <vscale x 16 x half> %2,
-    <vscale x 16 x i1> %0,
-    i64 %3)
-
-  ret <vscale x 16 x half> %a
-}
-
-declare <vscale x 32 x half> @llvm.riscv.vfrsqrte7.nxv32f16(
-  <vscale x 32 x half>,
-  i64);
-
-define <vscale x 32 x half> @intrinsic_vfrsqrte7_v_nxv32f16_nxv32f16(<vscale x 32 x half> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vfrsqrte7_v_nxv32f16_nxv32f16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, a0, e16,m8,ta,mu
-; CHECK-NEXT:    vfrsqrte7.v v8, v8
-; CHECK-NEXT:    jalr zero, 0(ra)
-entry:
-  %a = call <vscale x 32 x half> @llvm.riscv.vfrsqrte7.nxv32f16(
-    <vscale x 32 x half> %0,
-    i64 %1)
-
-  ret <vscale x 32 x half> %a
-}
-
-declare <vscale x 32 x half> @llvm.riscv.vfrsqrte7.mask.nxv32f16(
-  <vscale x 32 x half>,
-  <vscale x 32 x half>,
-  <vscale x 32 x i1>,
-  i64);
-
-define <vscale x 32 x half> @intrinsic_vfrsqrte7_mask_v_nxv32f16_nxv32f16(<vscale x 32 x i1> %0, <vscale x 32 x half> %1, <vscale x 32 x half> %2, i64 %3) nounwind {
-; CHECK-LABEL: intrinsic_vfrsqrte7_mask_v_nxv32f16_nxv32f16:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, a0, e16,m8,tu,mu
-; CHECK-NEXT:    vfrsqrte7.v v8, v16, v0.t
-; CHECK-NEXT:    jalr zero, 0(ra)
-entry:
-  %a = call <vscale x 32 x half> @llvm.riscv.vfrsqrte7.mask.nxv32f16(
-    <vscale x 32 x half> %1,
-    <vscale x 32 x half> %2,
-    <vscale x 32 x i1> %0,
-    i64 %3)
-
-  ret <vscale x 32 x half> %a
-}
-
-declare <vscale x 1 x float> @llvm.riscv.vfrsqrte7.nxv1f32(
-  <vscale x 1 x float>,
-  i64);
-
-define <vscale x 1 x float> @intrinsic_vfrsqrte7_v_nxv1f32_nxv1f32(<vscale x 1 x float> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vfrsqrte7_v_nxv1f32_nxv1f32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, a0, e32,mf2,ta,mu
-; CHECK-NEXT:    vfrsqrte7.v v8, v8
-; CHECK-NEXT:    jalr zero, 0(ra)
-entry:
-  %a = call <vscale x 1 x float> @llvm.riscv.vfrsqrte7.nxv1f32(
-    <vscale x 1 x float> %0,
-    i64 %1)
-
-  ret <vscale x 1 x float> %a
-}
-
-declare <vscale x 1 x float> @llvm.riscv.vfrsqrte7.mask.nxv1f32(
-  <vscale x 1 x float>,
-  <vscale x 1 x float>,
-  <vscale x 1 x i1>,
-  i64);
-
-define <vscale x 1 x float> @intrinsic_vfrsqrte7_mask_v_nxv1f32_nxv1f32(<vscale x 1 x i1> %0, <vscale x 1 x float> %1, <vscale x 1 x float> %2, i64 %3) nounwind {
-; CHECK-LABEL: intrinsic_vfrsqrte7_mask_v_nxv1f32_nxv1f32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, a0, e32,mf2,tu,mu
-; CHECK-NEXT:    vfrsqrte7.v v8, v9, v0.t
-; CHECK-NEXT:    jalr zero, 0(ra)
-entry:
-  %a = call <vscale x 1 x float> @llvm.riscv.vfrsqrte7.mask.nxv1f32(
-    <vscale x 1 x float> %1,
-    <vscale x 1 x float> %2,
-    <vscale x 1 x i1> %0,
-    i64 %3)
-
-  ret <vscale x 1 x float> %a
-}
-
-declare <vscale x 2 x float> @llvm.riscv.vfrsqrte7.nxv2f32(
-  <vscale x 2 x float>,
-  i64);
-
-define <vscale x 2 x float> @intrinsic_vfrsqrte7_v_nxv2f32_nxv2f32(<vscale x 2 x float> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vfrsqrte7_v_nxv2f32_nxv2f32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, a0, e32,m1,ta,mu
-; CHECK-NEXT:    vfrsqrte7.v v8, v8
-; CHECK-NEXT:    jalr zero, 0(ra)
-entry:
-  %a = call <vscale x 2 x float> @llvm.riscv.vfrsqrte7.nxv2f32(
-    <vscale x 2 x float> %0,
-    i64 %1)
-
-  ret <vscale x 2 x float> %a
-}
-
-declare <vscale x 2 x float> @llvm.riscv.vfrsqrte7.mask.nxv2f32(
-  <vscale x 2 x float>,
-  <vscale x 2 x float>,
-  <vscale x 2 x i1>,
-  i64);
-
-define <vscale x 2 x float> @intrinsic_vfrsqrte7_mask_v_nxv2f32_nxv2f32(<vscale x 2 x i1> %0, <vscale x 2 x float> %1, <vscale x 2 x float> %2, i64 %3) nounwind {
-; CHECK-LABEL: intrinsic_vfrsqrte7_mask_v_nxv2f32_nxv2f32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, a0, e32,m1,tu,mu
-; CHECK-NEXT:    vfrsqrte7.v v8, v9, v0.t
-; CHECK-NEXT:    jalr zero, 0(ra)
-entry:
-  %a = call <vscale x 2 x float> @llvm.riscv.vfrsqrte7.mask.nxv2f32(
-    <vscale x 2 x float> %1,
-    <vscale x 2 x float> %2,
-    <vscale x 2 x i1> %0,
-    i64 %3)
-
-  ret <vscale x 2 x float> %a
-}
-
-declare <vscale x 4 x float> @llvm.riscv.vfrsqrte7.nxv4f32(
-  <vscale x 4 x float>,
-  i64);
-
-define <vscale x 4 x float> @intrinsic_vfrsqrte7_v_nxv4f32_nxv4f32(<vscale x 4 x float> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vfrsqrte7_v_nxv4f32_nxv4f32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, a0, e32,m2,ta,mu
-; CHECK-NEXT:    vfrsqrte7.v v8, v8
-; CHECK-NEXT:    jalr zero, 0(ra)
-entry:
-  %a = call <vscale x 4 x float> @llvm.riscv.vfrsqrte7.nxv4f32(
-    <vscale x 4 x float> %0,
-    i64 %1)
-
-  ret <vscale x 4 x float> %a
-}
-
-declare <vscale x 4 x float> @llvm.riscv.vfrsqrte7.mask.nxv4f32(
-  <vscale x 4 x float>,
-  <vscale x 4 x float>,
-  <vscale x 4 x i1>,
-  i64);
-
-define <vscale x 4 x float> @intrinsic_vfrsqrte7_mask_v_nxv4f32_nxv4f32(<vscale x 4 x i1> %0, <vscale x 4 x float> %1, <vscale x 4 x float> %2, i64 %3) nounwind {
-; CHECK-LABEL: intrinsic_vfrsqrte7_mask_v_nxv4f32_nxv4f32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, a0, e32,m2,tu,mu
-; CHECK-NEXT:    vfrsqrte7.v v8, v10, v0.t
-; CHECK-NEXT:    jalr zero, 0(ra)
-entry:
-  %a = call <vscale x 4 x float> @llvm.riscv.vfrsqrte7.mask.nxv4f32(
-    <vscale x 4 x float> %1,
-    <vscale x 4 x float> %2,
-    <vscale x 4 x i1> %0,
-    i64 %3)
-
-  ret <vscale x 4 x float> %a
-}
-
-declare <vscale x 8 x float> @llvm.riscv.vfrsqrte7.nxv8f32(
-  <vscale x 8 x float>,
-  i64);
-
-define <vscale x 8 x float> @intrinsic_vfrsqrte7_v_nxv8f32_nxv8f32(<vscale x 8 x float> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vfrsqrte7_v_nxv8f32_nxv8f32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, a0, e32,m4,ta,mu
-; CHECK-NEXT:    vfrsqrte7.v v8, v8
-; CHECK-NEXT:    jalr zero, 0(ra)
-entry:
-  %a = call <vscale x 8 x float> @llvm.riscv.vfrsqrte7.nxv8f32(
-    <vscale x 8 x float> %0,
-    i64 %1)
-
-  ret <vscale x 8 x float> %a
-}
-
-declare <vscale x 8 x float> @llvm.riscv.vfrsqrte7.mask.nxv8f32(
-  <vscale x 8 x float>,
-  <vscale x 8 x float>,
-  <vscale x 8 x i1>,
-  i64);
-
-define <vscale x 8 x float> @intrinsic_vfrsqrte7_mask_v_nxv8f32_nxv8f32(<vscale x 8 x i1> %0, <vscale x 8 x float> %1, <vscale x 8 x float> %2, i64 %3) nounwind {
-; CHECK-LABEL: intrinsic_vfrsqrte7_mask_v_nxv8f32_nxv8f32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, a0, e32,m4,tu,mu
-; CHECK-NEXT:    vfrsqrte7.v v8, v12, v0.t
-; CHECK-NEXT:    jalr zero, 0(ra)
-entry:
-  %a = call <vscale x 8 x float> @llvm.riscv.vfrsqrte7.mask.nxv8f32(
-    <vscale x 8 x float> %1,
-    <vscale x 8 x float> %2,
-    <vscale x 8 x i1> %0,
-    i64 %3)
-
-  ret <vscale x 8 x float> %a
-}
-
-declare <vscale x 16 x float> @llvm.riscv.vfrsqrte7.nxv16f32(
-  <vscale x 16 x float>,
-  i64);
-
-define <vscale x 16 x float> @intrinsic_vfrsqrte7_v_nxv16f32_nxv16f32(<vscale x 16 x float> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vfrsqrte7_v_nxv16f32_nxv16f32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, a0, e32,m8,ta,mu
-; CHECK-NEXT:    vfrsqrte7.v v8, v8
-; CHECK-NEXT:    jalr zero, 0(ra)
-entry:
-  %a = call <vscale x 16 x float> @llvm.riscv.vfrsqrte7.nxv16f32(
-    <vscale x 16 x float> %0,
-    i64 %1)
-
-  ret <vscale x 16 x float> %a
-}
-
-declare <vscale x 16 x float> @llvm.riscv.vfrsqrte7.mask.nxv16f32(
-  <vscale x 16 x float>,
-  <vscale x 16 x float>,
-  <vscale x 16 x i1>,
-  i64);
-
-define <vscale x 16 x float> @intrinsic_vfrsqrte7_mask_v_nxv16f32_nxv16f32(<vscale x 16 x i1> %0, <vscale x 16 x float> %1, <vscale x 16 x float> %2, i64 %3) nounwind {
-; CHECK-LABEL: intrinsic_vfrsqrte7_mask_v_nxv16f32_nxv16f32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, a0, e32,m8,tu,mu
-; CHECK-NEXT:    vfrsqrte7.v v8, v16, v0.t
-; CHECK-NEXT:    jalr zero, 0(ra)
-entry:
-  %a = call <vscale x 16 x float> @llvm.riscv.vfrsqrte7.mask.nxv16f32(
-    <vscale x 16 x float> %1,
-    <vscale x 16 x float> %2,
-    <vscale x 16 x i1> %0,
-    i64 %3)
-
-  ret <vscale x 16 x float> %a
-}
-
-declare <vscale x 1 x double> @llvm.riscv.vfrsqrte7.nxv1f64(
-  <vscale x 1 x double>,
-  i64);
-
-define <vscale x 1 x double> @intrinsic_vfrsqrte7_v_nxv1f64_nxv1f64(<vscale x 1 x double> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vfrsqrte7_v_nxv1f64_nxv1f64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, a0, e64,m1,ta,mu
-; CHECK-NEXT:    vfrsqrte7.v v8, v8
-; CHECK-NEXT:    jalr zero, 0(ra)
-entry:
-  %a = call <vscale x 1 x double> @llvm.riscv.vfrsqrte7.nxv1f64(
-    <vscale x 1 x double> %0,
-    i64 %1)
-
-  ret <vscale x 1 x double> %a
-}
-
-declare <vscale x 1 x double> @llvm.riscv.vfrsqrte7.mask.nxv1f64(
-  <vscale x 1 x double>,
-  <vscale x 1 x double>,
-  <vscale x 1 x i1>,
-  i64);
-
-define <vscale x 1 x double> @intrinsic_vfrsqrte7_mask_v_nxv1f64_nxv1f64(<vscale x 1 x i1> %0, <vscale x 1 x double> %1, <vscale x 1 x double> %2, i64 %3) nounwind {
-; CHECK-LABEL: intrinsic_vfrsqrte7_mask_v_nxv1f64_nxv1f64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, a0, e64,m1,tu,mu
-; CHECK-NEXT:    vfrsqrte7.v v8, v9, v0.t
-; CHECK-NEXT:    jalr zero, 0(ra)
-entry:
-  %a = call <vscale x 1 x double> @llvm.riscv.vfrsqrte7.mask.nxv1f64(
-    <vscale x 1 x double> %1,
-    <vscale x 1 x double> %2,
-    <vscale x 1 x i1> %0,
-    i64 %3)
-
-  ret <vscale x 1 x double> %a
-}
-
-declare <vscale x 2 x double> @llvm.riscv.vfrsqrte7.nxv2f64(
-  <vscale x 2 x double>,
-  i64);
-
-define <vscale x 2 x double> @intrinsic_vfrsqrte7_v_nxv2f64_nxv2f64(<vscale x 2 x double> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vfrsqrte7_v_nxv2f64_nxv2f64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, a0, e64,m2,ta,mu
-; CHECK-NEXT:    vfrsqrte7.v v8, v8
-; CHECK-NEXT:    jalr zero, 0(ra)
-entry:
-  %a = call <vscale x 2 x double> @llvm.riscv.vfrsqrte7.nxv2f64(
-    <vscale x 2 x double> %0,
-    i64 %1)
-
-  ret <vscale x 2 x double> %a
-}
-
-declare <vscale x 2 x double> @llvm.riscv.vfrsqrte7.mask.nxv2f64(
-  <vscale x 2 x double>,
-  <vscale x 2 x double>,
-  <vscale x 2 x i1>,
-  i64);
-
-define <vscale x 2 x double> @intrinsic_vfrsqrte7_mask_v_nxv2f64_nxv2f64(<vscale x 2 x i1> %0, <vscale x 2 x double> %1, <vscale x 2 x double> %2, i64 %3) nounwind {
-; CHECK-LABEL: intrinsic_vfrsqrte7_mask_v_nxv2f64_nxv2f64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, a0, e64,m2,tu,mu
-; CHECK-NEXT:    vfrsqrte7.v v8, v10, v0.t
-; CHECK-NEXT:    jalr zero, 0(ra)
-entry:
-  %a = call <vscale x 2 x double> @llvm.riscv.vfrsqrte7.mask.nxv2f64(
-    <vscale x 2 x double> %1,
-    <vscale x 2 x double> %2,
-    <vscale x 2 x i1> %0,
-    i64 %3)
-
-  ret <vscale x 2 x double> %a
-}
-
-declare <vscale x 4 x double> @llvm.riscv.vfrsqrte7.nxv4f64(
-  <vscale x 4 x double>,
-  i64);
-
-define <vscale x 4 x double> @intrinsic_vfrsqrte7_v_nxv4f64_nxv4f64(<vscale x 4 x double> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vfrsqrte7_v_nxv4f64_nxv4f64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, a0, e64,m4,ta,mu
-; CHECK-NEXT:    vfrsqrte7.v v8, v8
-; CHECK-NEXT:    jalr zero, 0(ra)
-entry:
-  %a = call <vscale x 4 x double> @llvm.riscv.vfrsqrte7.nxv4f64(
-    <vscale x 4 x double> %0,
-    i64 %1)
-
-  ret <vscale x 4 x double> %a
-}
-
-declare <vscale x 4 x double> @llvm.riscv.vfrsqrte7.mask.nxv4f64(
-  <vscale x 4 x double>,
-  <vscale x 4 x double>,
-  <vscale x 4 x i1>,
-  i64);
-
-define <vscale x 4 x double> @intrinsic_vfrsqrte7_mask_v_nxv4f64_nxv4f64(<vscale x 4 x i1> %0, <vscale x 4 x double> %1, <vscale x 4 x double> %2, i64 %3) nounwind {
-; CHECK-LABEL: intrinsic_vfrsqrte7_mask_v_nxv4f64_nxv4f64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, a0, e64,m4,tu,mu
-; CHECK-NEXT:    vfrsqrte7.v v8, v12, v0.t
-; CHECK-NEXT:    jalr zero, 0(ra)
-entry:
-  %a = call <vscale x 4 x double> @llvm.riscv.vfrsqrte7.mask.nxv4f64(
-    <vscale x 4 x double> %1,
-    <vscale x 4 x double> %2,
-    <vscale x 4 x i1> %0,
-    i64 %3)
-
-  ret <vscale x 4 x double> %a
-}
-
-declare <vscale x 8 x double> @llvm.riscv.vfrsqrte7.nxv8f64(
-  <vscale x 8 x double>,
-  i64);
-
-define <vscale x 8 x double> @intrinsic_vfrsqrte7_v_nxv8f64_nxv8f64(<vscale x 8 x double> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vfrsqrte7_v_nxv8f64_nxv8f64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, a0, e64,m8,ta,mu
-; CHECK-NEXT:    vfrsqrte7.v v8, v8
-; CHECK-NEXT:    jalr zero, 0(ra)
-entry:
-  %a = call <vscale x 8 x double> @llvm.riscv.vfrsqrte7.nxv8f64(
-    <vscale x 8 x double> %0,
-    i64 %1)
-
-  ret <vscale x 8 x double> %a
-}
-
-declare <vscale x 8 x double> @llvm.riscv.vfrsqrte7.mask.nxv8f64(
-  <vscale x 8 x double>,
-  <vscale x 8 x double>,
-  <vscale x 8 x i1>,
-  i64);
-
-define <vscale x 8 x double> @intrinsic_vfrsqrte7_mask_v_nxv8f64_nxv8f64(<vscale x 8 x i1> %0, <vscale x 8 x double> %1, <vscale x 8 x double> %2, i64 %3) nounwind {
-; CHECK-LABEL: intrinsic_vfrsqrte7_mask_v_nxv8f64_nxv8f64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli a0, a0, e64,m8,tu,mu
-; CHECK-NEXT:    vfrsqrte7.v v8, v16, v0.t
-; CHECK-NEXT:    jalr zero, 0(ra)
-entry:
-  %a = call <vscale x 8 x double> @llvm.riscv.vfrsqrte7.mask.nxv8f64(
-    <vscale x 8 x double> %1,
-    <vscale x 8 x double> %2,
-    <vscale x 8 x i1> %0,
-    i64 %3)
-
-  ret <vscale x 8 x double> %a
-}
diff --git a/llvm/test/MC/RISCV/rvv/fothers.s b/llvm/test/MC/RISCV/rvv/fothers.s
index b54d66016fc4..f894cbc5fc23 100644
--- a/llvm/test/MC/RISCV/rvv/fothers.s
+++ b/llvm/test/MC/RISCV/rvv/fothers.s
@@ -22,26 +22,26 @@ vfsqrt.v v8, v4
 # CHECK-ERROR: instruction requires the following: 'F'{{.*}}'V'
 # CHECK-UNKNOWN: 57 14 40 4e <unknown>
 
-vfrsqrte7.v v8, v4, v0.t
-# CHECK-INST: vfrsqrte7.v v8, v4, v0.t
+vfrsqrt7.v v8, v4, v0.t
+# CHECK-INST: vfrsqrt7.v v8, v4, v0.t
 # CHECK-ENCODING: [0x57,0x14,0x42,0x4c]
 # CHECK-ERROR: instruction requires the following: 'F'{{.*}}'V'
 # CHECK-UNKNOWN: 57 14 42 4c <unknown>
 
-vfrsqrte7.v v8, v4
-# CHECK-INST: vfrsqrte7.v v8, v4
+vfrsqrt7.v v8, v4
+# CHECK-INST: vfrsqrt7.v v8, v4
 # CHECK-ENCODING: [0x57,0x14,0x42,0x4e]
 # CHECK-ERROR: instruction requires the following: 'F'{{.*}}'V'
 # CHECK-UNKNOWN: 57 14 42 4e <unknown>
 
-vfrece7.v v8, v4, v0.t
-# CHECK-INST: vfrece7.v v8, v4, v0.t
+vfrec7.v v8, v4, v0.t
+# CHECK-INST: vfrec7.v v8, v4, v0.t
 # CHECK-ENCODING: [0x57,0x94,0x42,0x4c]
 # CHECK-ERROR: instruction requires the following: 'F'{{.*}}'V'
 # CHECK-UNKNOWN: 57 94 42 4c <unknown>
 
-vfrece7.v v8, v4
-# CHECK-INST: vfrece7.v v8, v4
+vfrec7.v v8, v4
+# CHECK-INST: vfrec7.v v8, v4
 # CHECK-ENCODING: [0x57,0x94,0x42,0x4e]
 # CHECK-ERROR: instruction requires the following: 'F'{{.*}}'V'
 # CHECK-UNKNOWN: 57 94 42 4e <unknown>
diff --git a/llvm/test/MC/RISCV/rvv/invalid.s b/llvm/test/MC/RISCV/rvv/invalid.s
index 9763dc07d12b..d298e2572636 100644
--- a/llvm/test/MC/RISCV/rvv/invalid.s
+++ b/llvm/test/MC/RISCV/rvv/invalid.s
@@ -1,6 +1,15 @@
 # RUN: not llvm-mc -triple=riscv64 --mattr=+experimental-v --mattr=+f %s 2>&1 \
 # RUN:        | FileCheck %s --check-prefix=CHECK-ERROR
 
+vsetivli a2, 32, e8,m1
+# CHECK-ERROR: unknown operand
+
+vsetivli a2, zero, e8,m1
+# CHECK-ERROR: unknown operand
+
+vsetivli a2, 5, e31
+# CHECK-ERROR: operand must be e[8|16|32|64|128|256|512|1024],m[1|2|4|8|f2|f4|f8],[ta|tu],[ma|mu]
+
 vsetvli a2, a0, e31
 # CHECK-ERROR: operand must be e[8|16|32|64|128|256|512|1024],m[1|2|4|8|f2|f4|f8],[ta|tu],[ma|mu]
 
diff --git a/llvm/test/MC/RISCV/rvv/load.s b/llvm/test/MC/RISCV/rvv/load.s
index 4841f5757e76..3d0dbb15c36e 100644
--- a/llvm/test/MC/RISCV/rvv/load.s
+++ b/llvm/test/MC/RISCV/rvv/load.s
@@ -8,6 +8,12 @@
 # RUN: llvm-mc -triple=riscv64 -filetype=obj --mattr=+experimental-v %s \
 # RUN:   | llvm-objdump -d - | FileCheck %s --check-prefix=CHECK-UNKNOWN
 
+vle1.v v8, (a0)
+# CHECK-INST: vle1.v v8, (a0)
+# CHECK-ENCODING: [0x07,0x04,0xb5,0x00]
+# CHECK-ERROR: instruction requires the following: 'V' (Vector Instructions)
+# CHECK-UNKNOWN: 07 04 b5 00 <unknown>
+
 vle8.v v8, (a0), v0.t
 # CHECK-INST: vle8.v v8, (a0), v0.t
 # CHECK-ENCODING: [0x07,0x04,0x05,0x00]
@@ -56,54 +62,6 @@ vle64.v v8, (a0)
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Instructions)
 # CHECK-UNKNOWN: 07 74 05 02 <unknown>
 
-vle128.v v8, (a0), v0.t
-# CHECK-INST: vle128.v v8, (a0), v0.t
-# CHECK-ENCODING: [0x07,0x04,0x05,0x10]
-# CHECK-ERROR: instruction requires the following: 'V' (Vector Instructions)
-# CHECK-UNKNOWN: 07 04 05 10 <unknown>
-
-vle128.v v8, (a0)
-# CHECK-INST: vle128.v v8, (a0)
-# CHECK-ENCODING: [0x07,0x04,0x05,0x12]
-# CHECK-ERROR: instruction requires the following: 'V' (Vector Instructions)
-# CHECK-UNKNOWN: 07 04 05 12 <unknown>
-
-vle256.v v8, (a0), v0.t
-# CHECK-INST: vle256.v v8, (a0), v0.t
-# CHECK-ENCODING: [0x07,0x54,0x05,0x10]
-# CHECK-ERROR: instruction requires the following: 'V' (Vector Instructions)
-# CHECK-UNKNOWN: 07 54 05 10 <unknown>
-
-vle256.v v8, (a0)
-# CHECK-INST: vle256.v v8, (a0)
-# CHECK-ENCODING: [0x07,0x54,0x05,0x12]
-# CHECK-ERROR: instruction requires the following: 'V' (Vector Instructions)
-# CHECK-UNKNOWN: 07 54 05 12 <unknown>
-
-vle512.v v8, (a0), v0.t
-# CHECK-INST: vle512.v v8, (a0), v0.t
-# CHECK-ENCODING: [0x07,0x64,0x05,0x10]
-# CHECK-ERROR: instruction requires the following: 'V' (Vector Instructions)
-# CHECK-UNKNOWN: 07 64 05 10 <unknown>
-
-vle512.v v8, (a0)
-# CHECK-INST: vle512.v v8, (a0)
-# CHECK-ENCODING: [0x07,0x64,0x05,0x12]
-# CHECK-ERROR: instruction requires the following: 'V' (Vector Instructions)
-# CHECK-UNKNOWN: 07 64 05 12 <unknown>
-
-vle1024.v v8, (a0), v0.t
-# CHECK-INST: vle1024.v v8, (a0), v0.t
-# CHECK-ENCODING: [0x07,0x74,0x05,0x10]
-# CHECK-ERROR: instruction requires the following: 'V' (Vector Instructions)
-# CHECK-UNKNOWN: 07 74 05 10 <unknown>
-
-vle1024.v v8, (a0)
-# CHECK-INST: vle1024.v v8, (a0)
-# CHECK-ENCODING: [0x07,0x74,0x05,0x12]
-# CHECK-ERROR: instruction requires the following: 'V' (Vector Instructions)
-# CHECK-UNKNOWN: 07 74 05 12 <unknown>
-
 vle8ff.v v8, (a0), v0.t
 # CHECK-INST: vle8ff.v v8, (a0), v0.t
 # CHECK-ENCODING: [0x07,0x04,0x05,0x01]
@@ -152,54 +110,6 @@ vle64ff.v v8, (a0)
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Instructions)
 # CHECK-UNKNOWN: 07 74 05 03 <unknown>
 
-vle128ff.v v8, (a0), v0.t
-# CHECK-INST: vle128ff.v v8, (a0), v0.t
-# CHECK-ENCODING: [0x07,0x04,0x05,0x11]
-# CHECK-ERROR: instruction requires the following: 'V' (Vector Instructions)
-# CHECK-UNKNOWN: 07 04 05 11 <unknown>
-
-vle128ff.v v8, (a0)
-# CHECK-INST: vle128ff.v v8, (a0)
-# CHECK-ENCODING: [0x07,0x04,0x05,0x13]
-# CHECK-ERROR: instruction requires the following: 'V' (Vector Instructions)
-# CHECK-UNKNOWN: 07 04 05 13 <unknown>
-
-vle256ff.v v8, (a0), v0.t
-# CHECK-INST: vle256ff.v v8, (a0), v0.t
-# CHECK-ENCODING: [0x07,0x54,0x05,0x11]
-# CHECK-ERROR: instruction requires the following: 'V' (Vector Instructions)
-# CHECK-UNKNOWN: 07 54 05 11 <unknown>
-
-vle256ff.v v8, (a0)
-# CHECK-INST: vle256ff.v v8, (a0)
-# CHECK-ENCODING: [0x07,0x54,0x05,0x13]
-# CHECK-ERROR: instruction requires the following: 'V' (Vector Instructions)
-# CHECK-UNKNOWN: 07 54 05 13 <unknown>
-
-vle512ff.v v8, (a0), v0.t
-# CHECK-INST: vle512ff.v v8, (a0), v0.t
-# CHECK-ENCODING: [0x07,0x64,0x05,0x11]
-# CHECK-ERROR: instruction requires the following: 'V' (Vector Instructions)
-# CHECK-UNKNOWN: 07 64 05 11 <unknown>
-
-vle512ff.v v8, (a0)
-# CHECK-INST: vle512ff.v v8, (a0)
-# CHECK-ENCODING: [0x07,0x64,0x05,0x13]
-# CHECK-ERROR: instruction requires the following: 'V' (Vector Instructions)
-# CHECK-UNKNOWN: 07 64 05 13 <unknown>
-
-vle1024ff.v v8, (a0), v0.t
-# CHECK-INST: vle1024ff.v v8, (a0), v0.t
-# CHECK-ENCODING: [0x07,0x74,0x05,0x11]
-# CHECK-ERROR: instruction requires the following: 'V' (Vector Instructions)
-# CHECK-UNKNOWN: 07 74 05 11 <unknown>
-
-vle1024ff.v v8, (a0)
-# CHECK-INST: vle1024ff.v v8, (a0)
-# CHECK-ENCODING: [0x07,0x74,0x05,0x13]
-# CHECK-ERROR: instruction requires the following: 'V' (Vector Instructions)
-# CHECK-UNKNOWN: 07 74 05 13 <unknown>
-
 vlse8.v v8, (a0), a1, v0.t
 # CHECK-INST: vlse8.v v8, (a0), a1, v0.t
 # CHECK-ENCODING: [0x07,0x04,0xb5,0x08]
@@ -248,54 +158,6 @@ vlse64.v v8, (a0), a1
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Instructions)
 # CHECK-UNKNOWN: 07 74 b5 0a <unknown>
 
-vlse128.v v8, (a0), a1, v0.t
-# CHECK-INST: vlse128.v v8, (a0), a1, v0.t
-# CHECK-ENCODING: [0x07,0x04,0xb5,0x18]
-# CHECK-ERROR: instruction requires the following: 'V' (Vector Instructions)
-# CHECK-UNKNOWN: 07 04 b5 18 <unknown>
-
-vlse128.v v8, (a0), a1
-# CHECK-INST: vlse128.v v8, (a0), a1
-# CHECK-ENCODING: [0x07,0x04,0xb5,0x1a]
-# CHECK-ERROR: instruction requires the following: 'V' (Vector Instructions)
-# CHECK-UNKNOWN: 07 04 b5 1a <unknown>
-
-vlse256.v v8, (a0), a1, v0.t
-# CHECK-INST: vlse256.v v8, (a0), a1, v0.t
-# CHECK-ENCODING: [0x07,0x54,0xb5,0x18]
-# CHECK-ERROR: instruction requires the following: 'V' (Vector Instructions)
-# CHECK-UNKNOWN: 07 54 b5 18 <unknown>
-
-vlse256.v v8, (a0), a1
-# CHECK-INST: vlse256.v v8, (a0), a1
-# CHECK-ENCODING: [0x07,0x54,0xb5,0x1a]
-# CHECK-ERROR: instruction requires the following: 'V' (Vector Instructions)
-# CHECK-UNKNOWN: 07 54 b5 1a <unknown>
-
-vlse512.v v8, (a0), a1, v0.t
-# CHECK-INST: vlse512.v v8, (a0), a1, v0.t
-# CHECK-ENCODING: [0x07,0x64,0xb5,0x18]
-# CHECK-ERROR: instruction requires the following: 'V' (Vector Instructions)
-# CHECK-UNKNOWN: 07 64 b5 18 <unknown>
-
-vlse512.v v8, (a0), a1
-# CHECK-INST: vlse512.v v8, (a0), a1
-# CHECK-ENCODING: [0x07,0x64,0xb5,0x1a]
-# CHECK-ERROR: instruction requires the following: 'V' (Vector Instructions)
-# CHECK-UNKNOWN: 07 64 b5 1a <unknown>
-
-vlse1024.v v8, (a0), a1, v0.t
-# CHECK-INST: vlse1024.v v8, (a0), a1, v0.t
-# CHECK-ENCODING: [0x07,0x74,0xb5,0x18]
-# CHECK-ERROR: instruction requires the following: 'V' (Vector Instructions)
-# CHECK-UNKNOWN: 07 74 b5 18 <unknown>
-
-vlse1024.v v8, (a0), a1
-# CHECK-INST: vlse1024.v v8, (a0), a1
-# CHECK-ENCODING: [0x07,0x74,0xb5,0x1a]
-# CHECK-ERROR: instruction requires the following: 'V' (Vector Instructions)
-# CHECK-UNKNOWN: 07 74 b5 1a <unknown>
-
 vluxei8.v v8, (a0), v4, v0.t
 # CHECK-INST: vluxei8.v v8, (a0), v4, v0.t
 # CHECK-ENCODING: [0x07,0x04,0x45,0x04]
@@ -416,30 +278,6 @@ vl1re64.v v8, (a0)
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Instructions)
 # CHECK-UNKNOWN: 07 74 85 22 <unknown>
 
-vl1re128.v v8, (a0)
-# CHECK-INST: vl1re128.v v8, (a0)
-# CHECK-ENCODING: [0x07,0x04,0x85,0x32]
-# CHECK-ERROR: instruction requires the following: 'V' (Vector Instructions)
-# CHECK-UNKNOWN: 07 04 85 32 <unknown>
-
-vl1re256.v v8, (a0)
-# CHECK-INST: vl1re256.v v8, (a0)
-# CHECK-ENCODING: [0x07,0x54,0x85,0x32]
-# CHECK-ERROR: instruction requires the following: 'V' (Vector Instructions)
-# CHECK-UNKNOWN: 07 54 85 32 <unknown>
-
-vl1re512.v v8, (a0)
-# CHECK-INST: vl1re512.v v8, (a0)
-# CHECK-ENCODING: [0x07,0x64,0x85,0x32]
-# CHECK-ERROR: instruction requires the following: 'V' (Vector Instructions)
-# CHECK-UNKNOWN: 07 64 85 32 <unknown>
-
-vl1re1024.v v8, (a0)
-# CHECK-INST: vl1re1024.v v8, (a0)
-# CHECK-ENCODING: [0x07,0x74,0x85,0x32]
-# CHECK-ERROR: instruction requires the following: 'V' (Vector Instructions)
-# CHECK-UNKNOWN: 07 74 85 32 <unknown>
-
 vl2re8.v v8, (a0)
 # CHECK-INST: vl2re8.v v8, (a0)
 # CHECK-ENCODING: [0x07,0x04,0x85,0x42]
@@ -464,30 +302,6 @@ vl2re64.v v8, (a0)
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Instructions)
 # CHECK-UNKNOWN: 07 74 85 42 <unknown>
 
-vl2re128.v v8, (a0)
-# CHECK-INST: vl2re128.v v8, (a0)
-# CHECK-ENCODING: [0x07,0x04,0x85,0x52]
-# CHECK-ERROR: instruction requires the following: 'V' (Vector Instructions)
-# CHECK-UNKNOWN: 07 04 85 52 <unknown>
-
-vl2re256.v v8, (a0)
-# CHECK-INST: vl2re256.v v8, (a0)
-# CHECK-ENCODING: [0x07,0x54,0x85,0x52]
-# CHECK-ERROR: instruction requires the following: 'V' (Vector Instructions)
-# CHECK-UNKNOWN: 07 54 85 52 <unknown>
-
-vl2re512.v v8, (a0)
-# CHECK-INST: vl2re512.v v8, (a0)
-# CHECK-ENCODING: [0x07,0x64,0x85,0x52]
-# CHECK-ERROR: instruction requires the following: 'V' (Vector Instructions)
-# CHECK-UNKNOWN: 07 64 85 52 <unknown>
-
-vl2re1024.v v8, (a0)
-# CHECK-INST: vl2re1024.v v8, (a0)
-# CHECK-ENCODING: [0x07,0x74,0x85,0x52]
-# CHECK-ERROR: instruction requires the following: 'V' (Vector Instructions)
-# CHECK-UNKNOWN: 07 74 85 52 <unknown>
-
 vl4re8.v v8, (a0)
 # CHECK-INST: vl4re8.v v8, (a0)
 # CHECK-ENCODING: [0x07,0x04,0x85,0x82]
@@ -512,30 +326,6 @@ vl4re64.v v8, (a0)
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Instructions)
 # CHECK-UNKNOWN: 07 74 85 82 <unknown>
 
-vl4re128.v v8, (a0)
-# CHECK-INST: vl4re128.v v8, (a0)
-# CHECK-ENCODING: [0x07,0x04,0x85,0x92]
-# CHECK-ERROR: instruction requires the following: 'V' (Vector Instructions)
-# CHECK-UNKNOWN: 07 04 85 92 <unknown>
-
-vl4re256.v v8, (a0)
-# CHECK-INST: vl4re256.v v8, (a0)
-# CHECK-ENCODING: [0x07,0x54,0x85,0x92]
-# CHECK-ERROR: instruction requires the following: 'V' (Vector Instructions)
-# CHECK-UNKNOWN: 07 54 85 92 <unknown>
-
-vl4re512.v v8, (a0)
-# CHECK-INST: vl4re512.v v8, (a0)
-# CHECK-ENCODING: [0x07,0x64,0x85,0x92]
-# CHECK-ERROR: instruction requires the following: 'V' (Vector Instructions)
-# CHECK-UNKNOWN: 07 64 85 92 <unknown>
-
-vl4re1024.v v8, (a0)
-# CHECK-INST: vl4re1024.v v8, (a0)
-# CHECK-ENCODING: [0x07,0x74,0x85,0x92]
-# CHECK-ERROR: instruction requires the following: 'V' (Vector Instructions)
-# CHECK-UNKNOWN: 07 74 85 92 <unknown>
-
 vl8re8.v v8, (a0)
 # CHECK-INST: vl8re8.v v8, (a0)
 # CHECK-ENCODING: [0x07,0x04,0x85,0x02]
@@ -559,27 +349,3 @@ vl8re64.v v8, (a0)
 # CHECK-ENCODING: [0x07,0x74,0x85,0x02]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Instructions)
 # CHECK-UNKNOWN: 07 74 85 02 <unknown>
-
-vl8re128.v v8, (a0)
-# CHECK-INST: vl8re128.v v8, (a0)
-# CHECK-ENCODING: [0x07,0x04,0x85,0x12]
-# CHECK-ERROR: instruction requires the following: 'V' (Vector Instructions)
-# CHECK-UNKNOWN: 07 04 85 12 <unknown>
-
-vl8re256.v v8, (a0)
-# CHECK-INST: vl8re256.v v8, (a0)
-# CHECK-ENCODING: [0x07,0x54,0x85,0x12]
-# CHECK-ERROR: instruction requires the following: 'V' (Vector Instructions)
-# CHECK-UNKNOWN: 07 54 85 12 <unknown>
-
-vl8re512.v v8, (a0)
-# CHECK-INST: vl8re512.v v8, (a0)
-# CHECK-ENCODING: [0x07,0x64,0x85,0x12]
-# CHECK-ERROR: instruction requires the following: 'V' (Vector Instructions)
-# CHECK-UNKNOWN: 07 64 85 12 <unknown>
-
-vl8re1024.v v8, (a0)
-# CHECK-INST: vl8re1024.v v8, (a0)
-# CHECK-ENCODING: [0x07,0x74,0x85,0x12]
-# CHECK-ERROR: instruction requires the following: 'V' (Vector Instructions)
-# CHECK-UNKNOWN: 07 74 85 12 <unknown>
diff --git a/llvm/test/MC/RISCV/rvv/store.s b/llvm/test/MC/RISCV/rvv/store.s
index 8437bf7f9030..e4795aa1c2c9 100644
--- a/llvm/test/MC/RISCV/rvv/store.s
+++ b/llvm/test/MC/RISCV/rvv/store.s
@@ -8,6 +8,12 @@
 # RUN: llvm-mc -triple=riscv64 -filetype=obj --mattr=+experimental-v %s \
 # RUN:   | llvm-objdump -d - | FileCheck %s --check-prefix=CHECK-UNKNOWN
 
+vse1.v v24, (a0)
+# CHECK-INST: vse1.v v24, (a0)
+# CHECK-ENCODING: [0x27,0x0c,0xb5,0x00]
+# CHECK-ERROR: instruction requires the following: 'V' (Vector Instructions)
+# CHECK-UNKNOWN: 27 0c b5 00 <unknown>
+
 vse8.v v24, (a0), v0.t
 # CHECK-INST: vse8.v v24, (a0), v0.t
 # CHECK-ENCODING: [0x27,0x0c,0x05,0x00]
@@ -56,54 +62,6 @@ vse64.v v24, (a0)
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Instructions)
 # CHECK-UNKNOWN: 27 7c 05 02 <unknown>
 
-vse128.v v24, (a0), v0.t
-# CHECK-INST: vse128.v v24, (a0), v0.t
-# CHECK-ENCODING: [0x27,0x0c,0x05,0x10]
-# CHECK-ERROR: instruction requires the following: 'V' (Vector Instructions)
-# CHECK-UNKNOWN: 27 0c 05 10 <unknown>
-
-vse128.v v24, (a0)
-# CHECK-INST: vse128.v v24, (a0)
-# CHECK-ENCODING: [0x27,0x0c,0x05,0x12]
-# CHECK-ERROR: instruction requires the following: 'V' (Vector Instructions)
-# CHECK-UNKNOWN: 27 0c 05 12 <unknown>
-
-vse256.v v24, (a0), v0.t
-# CHECK-INST: vse256.v v24, (a0), v0.t
-# CHECK-ENCODING: [0x27,0x5c,0x05,0x10]
-# CHECK-ERROR: instruction requires the following: 'V' (Vector Instructions)
-# CHECK-UNKNOWN: 27 5c 05 10 <unknown>
-
-vse256.v v24, (a0)
-# CHECK-INST: vse256.v v24, (a0)
-# CHECK-ENCODING: [0x27,0x5c,0x05,0x12]
-# CHECK-ERROR: instruction requires the following: 'V' (Vector Instructions)
-# CHECK-UNKNOWN: 27 5c 05 12 <unknown>
-
-vse512.v v24, (a0), v0.t
-# CHECK-INST: vse512.v v24, (a0), v0.t
-# CHECK-ENCODING: [0x27,0x6c,0x05,0x10]
-# CHECK-ERROR: instruction requires the following: 'V' (Vector Instructions)
-# CHECK-UNKNOWN: 27 6c 05 10 <unknown>
-
-vse512.v v24, (a0)
-# CHECK-INST: vse512.v v24, (a0)
-# CHECK-ENCODING: [0x27,0x6c,0x05,0x12]
-# CHECK-ERROR: instruction requires the following: 'V' (Vector Instructions)
-# CHECK-UNKNOWN: 27 6c 05 12 <unknown>
-
-vse1024.v v24, (a0), v0.t
-# CHECK-INST: vse1024.v v24, (a0), v0.t
-# CHECK-ENCODING: [0x27,0x7c,0x05,0x10]
-# CHECK-ERROR: instruction requires the following: 'V' (Vector Instructions)
-# CHECK-UNKNOWN: 27 7c 05 10 <unknown>
-
-vse1024.v v24, (a0)
-# CHECK-INST: vse1024.v v24, (a0)
-# CHECK-ENCODING: [0x27,0x7c,0x05,0x12]
-# CHECK-ERROR: instruction requires the following: 'V' (Vector Instructions)
-# CHECK-UNKNOWN: 27 7c 05 12 <unknown>
-
 vsse8.v v24, (a0), a1, v0.t
 # CHECK-INST: vsse8.v v24, (a0), a1, v0.t
 # CHECK-ENCODING: [0x27,0x0c,0xb5,0x08]
@@ -152,54 +110,6 @@ vsse64.v v24, (a0), a1
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Instructions)
 # CHECK-UNKNOWN: 27 7c b5 0a <unknown>
 
-vsse128.v v24, (a0), a1, v0.t
-# CHECK-INST: vsse128.v v24, (a0), a1, v0.t
-# CHECK-ENCODING: [0x27,0x0c,0xb5,0x18]
-# CHECK-ERROR: instruction requires the following: 'V' (Vector Instructions)
-# CHECK-UNKNOWN: 27 0c b5 18 <unknown>
-
-vsse128.v v24, (a0), a1
-# CHECK-INST: vsse128.v v24, (a0), a1
-# CHECK-ENCODING: [0x27,0x0c,0xb5,0x1a]
-# CHECK-ERROR: instruction requires the following: 'V' (Vector Instructions)
-# CHECK-UNKNOWN: 27 0c b5 1a <unknown>
-
-vsse256.v v24, (a0), a1, v0.t
-# CHECK-INST: vsse256.v v24, (a0), a1, v0.t
-# CHECK-ENCODING: [0x27,0x5c,0xb5,0x18]
-# CHECK-ERROR: instruction requires the following: 'V' (Vector Instructions)
-# CHECK-UNKNOWN: 27 5c b5 18 <unknown>
-
-vsse256.v v24, (a0), a1
-# CHECK-INST: vsse256.v v24, (a0), a1
-# CHECK-ENCODING: [0x27,0x5c,0xb5,0x1a]
-# CHECK-ERROR: instruction requires the following: 'V' (Vector Instructions)
-# CHECK-UNKNOWN: 27 5c b5 1a <unknown>
-
-vsse512.v v24, (a0), a1, v0.t
-# CHECK-INST: vsse512.v v24, (a0), a1, v0.t
-# CHECK-ENCODING: [0x27,0x6c,0xb5,0x18]
-# CHECK-ERROR: instruction requires the following: 'V' (Vector Instructions)
-# CHECK-UNKNOWN: 27 6c b5 18 <unknown>
-
-vsse512.v v24, (a0), a1
-# CHECK-INST: vsse512.v v24, (a0), a1
-# CHECK-ENCODING: [0x27,0x6c,0xb5,0x1a]
-# CHECK-ERROR: instruction requires the following: 'V' (Vector Instructions)
-# CHECK-UNKNOWN: 27 6c b5 1a <unknown>
-
-vsse1024.v v24, (a0), a1, v0.t
-# CHECK-INST: vsse1024.v v24, (a0), a1, v0.t
-# CHECK-ENCODING: [0x27,0x7c,0xb5,0x18]
-# CHECK-ERROR: instruction requires the following: 'V' (Vector Instructions)
-# CHECK-UNKNOWN: 27 7c b5 18 <unknown>
-
-vsse1024.v v24, (a0), a1
-# CHECK-INST: vsse1024.v v24, (a0), a1
-# CHECK-ENCODING: [0x27,0x7c,0xb5,0x1a]
-# CHECK-ERROR: instruction requires the following: 'V' (Vector Instructions)
-# CHECK-UNKNOWN: 27 7c b5 1a <unknown>
-
 vsuxei8.v v24, (a0), v4, v0.t
 # CHECK-INST: vsuxei8.v v24, (a0), v4, v0.t
 # CHECK-ENCODING: [0x27,0x0c,0x45,0x04]
diff --git a/llvm/test/MC/RISCV/rvv/vsetvl.s b/llvm/test/MC/RISCV/rvv/vsetvl.s
index c32126b0e24f..d792908627a6 100644
--- a/llvm/test/MC/RISCV/rvv/vsetvl.s
+++ b/llvm/test/MC/RISCV/rvv/vsetvl.s
@@ -79,3 +79,21 @@ vsetvl a2, a0, a1
 # CHECK-ENCODING: [0x57,0x76,0xb5,0x80]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Instructions)
 # CHECK-UNKNOWN: 57 76 b5 80 <unknown>
+
+vsetivli a2, 0, e32,m1,ta,ma
+# CHECK-INST: vsetivli a2, 0, e32,m1,ta,ma
+# CHECK-ENCODING: [0x57,0x76,0x00,0xcd]
+# CHECK-ERROR: instruction requires the following: 'V' (Vector Instructions)
+# CHECK-UNKNOWN: 57 76 00 cd <unknown>
+
+vsetivli a2, 15, e32,m1,ta,ma
+# CHECK-INST: vsetivli a2, 15, e32,m1,ta,ma
+# CHECK-ENCODING: [0x57,0xf6,0x07,0xcd]
+# CHECK-ERROR: instruction requires the following: 'V' (Vector Instructions)
+# CHECK-UNKNOWN: 57 f6 07 cd <unknown>
+
+vsetivli a2, 31, e32,m1,ta,ma
+# CHECK-INST: vsetivli a2, 31, e32,m1,ta,ma
+# CHECK-ENCODING: [0x57,0xf6,0x0f,0xcd]
+# CHECK-ERROR: instruction requires the following: 'V' (Vector Instructions)
+# CHECK-UNKNOWN: 57 f6 0f cd <unknown>
diff --git a/llvm/test/MC/RISCV/rvv/zvlsseg.s b/llvm/test/MC/RISCV/rvv/zvlsseg.s
index b41b6984bcb2..98c7a59eb31e 100644
--- a/llvm/test/MC/RISCV/rvv/zvlsseg.s
+++ b/llvm/test/MC/RISCV/rvv/zvlsseg.s
@@ -60,54 +60,6 @@ vlseg2e64.v v8, (a0)
 # CHECK-ERROR: instruction requires the following: 'Zvlsseg'
 # CHECK-UNKNOWN: 07 74 05 22 <unknown>
 
-vlseg2e128.v v8, (a0), v0.t
-# CHECK-INST: vlseg2e128.v v8, (a0), v0.t
-# CHECK-ENCODING: [0x07,0x04,0x05,0x30]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 04 05 30 <unknown>
-
-vlseg2e128.v v8, (a0)
-# CHECK-INST: vlseg2e128.v v8, (a0)
-# CHECK-ENCODING: [0x07,0x04,0x05,0x32]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 04 05 32 <unknown>
-
-vlseg2e256.v v8, (a0), v0.t
-# CHECK-INST: vlseg2e256.v v8, (a0), v0.t
-# CHECK-ENCODING: [0x07,0x54,0x05,0x30]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 54 05 30 <unknown>
-
-vlseg2e256.v v8, (a0)
-# CHECK-INST: vlseg2e256.v v8, (a0)
-# CHECK-ENCODING: [0x07,0x54,0x05,0x32]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 54 05 32 <unknown>
-
-vlseg2e512.v v8, (a0), v0.t
-# CHECK-INST: vlseg2e512.v v8, (a0), v0.t
-# CHECK-ENCODING: [0x07,0x64,0x05,0x30]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 64 05 30 <unknown>
-
-vlseg2e512.v v8, (a0)
-# CHECK-INST: vlseg2e512.v v8, (a0)
-# CHECK-ENCODING: [0x07,0x64,0x05,0x32]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 64 05 32 <unknown>
-
-vlseg2e1024.v v8, (a0), v0.t
-# CHECK-INST: vlseg2e1024.v v8, (a0), v0.t
-# CHECK-ENCODING: [0x07,0x74,0x05,0x30]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 74 05 30 <unknown>
-
-vlseg2e1024.v v8, (a0)
-# CHECK-INST: vlseg2e1024.v v8, (a0)
-# CHECK-ENCODING: [0x07,0x74,0x05,0x32]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 74 05 32 <unknown>
-
 vlseg2e8ff.v v8, (a0), v0.t
 # CHECK-INST: vlseg2e8ff.v v8, (a0), v0.t
 # CHECK-ENCODING: [0x07,0x04,0x05,0x21]
@@ -156,54 +108,6 @@ vlseg2e64ff.v v8, (a0)
 # CHECK-ERROR: instruction requires the following: 'Zvlsseg'
 # CHECK-UNKNOWN: 07 74 05 23 <unknown>
 
-vlseg2e128ff.v v8, (a0), v0.t
-# CHECK-INST: vlseg2e128ff.v v8, (a0), v0.t
-# CHECK-ENCODING: [0x07,0x04,0x05,0x31]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 04 05 31 <unknown>
-
-vlseg2e128ff.v v8, (a0)
-# CHECK-INST: vlseg2e128ff.v v8, (a0)
-# CHECK-ENCODING: [0x07,0x04,0x05,0x33]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 04 05 33 <unknown>
-
-vlseg2e256ff.v v8, (a0), v0.t
-# CHECK-INST: vlseg2e256ff.v v8, (a0), v0.t
-# CHECK-ENCODING: [0x07,0x54,0x05,0x31]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 54 05 31 <unknown>
-
-vlseg2e256ff.v v8, (a0)
-# CHECK-INST: vlseg2e256ff.v v8, (a0)
-# CHECK-ENCODING: [0x07,0x54,0x05,0x33]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 54 05 33 <unknown>
-
-vlseg2e512ff.v v8, (a0), v0.t
-# CHECK-INST: vlseg2e512ff.v v8, (a0), v0.t
-# CHECK-ENCODING: [0x07,0x64,0x05,0x31]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 64 05 31 <unknown>
-
-vlseg2e512ff.v v8, (a0)
-# CHECK-INST: vlseg2e512ff.v v8, (a0)
-# CHECK-ENCODING: [0x07,0x64,0x05,0x33]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 64 05 33 <unknown>
-
-vlseg2e1024ff.v v8, (a0), v0.t
-# CHECK-INST: vlseg2e1024ff.v v8, (a0), v0.t
-# CHECK-ENCODING: [0x07,0x74,0x05,0x31]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 74 05 31 <unknown>
-
-vlseg2e1024ff.v v8, (a0)
-# CHECK-INST: vlseg2e1024ff.v v8, (a0)
-# CHECK-ENCODING: [0x07,0x74,0x05,0x33]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 74 05 33 <unknown>
-
 vlsseg2e8.v v8, (a0), a1, v0.t
 # CHECK-INST: vlsseg2e8.v v8, (a0), a1, v0.t
 # CHECK-ENCODING: [0x07,0x04,0xb5,0x28]
@@ -252,54 +156,6 @@ vlsseg2e64.v v8, (a0), a1
 # CHECK-ERROR: instruction requires the following: 'Zvlsseg'
 # CHECK-UNKNOWN: 07 74 b5 2a <unknown>
 
-vlsseg2e128.v v8, (a0), a1, v0.t
-# CHECK-INST: vlsseg2e128.v v8, (a0), a1, v0.t
-# CHECK-ENCODING: [0x07,0x04,0xb5,0x38]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 04 b5 38 <unknown>
-
-vlsseg2e128.v v8, (a0), a1
-# CHECK-INST: vlsseg2e128.v v8, (a0), a1
-# CHECK-ENCODING: [0x07,0x04,0xb5,0x3a]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 04 b5 3a <unknown>
-
-vlsseg2e256.v v8, (a0), a1, v0.t
-# CHECK-INST: vlsseg2e256.v v8, (a0), a1, v0.t
-# CHECK-ENCODING: [0x07,0x54,0xb5,0x38]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 54 b5 38 <unknown>
-
-vlsseg2e256.v v8, (a0), a1
-# CHECK-INST: vlsseg2e256.v v8, (a0), a1
-# CHECK-ENCODING: [0x07,0x54,0xb5,0x3a]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 54 b5 3a <unknown>
-
-vlsseg2e512.v v8, (a0), a1, v0.t
-# CHECK-INST: vlsseg2e512.v v8, (a0), a1, v0.t
-# CHECK-ENCODING: [0x07,0x64,0xb5,0x38]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 64 b5 38 <unknown>
-
-vlsseg2e512.v v8, (a0), a1
-# CHECK-INST: vlsseg2e512.v v8, (a0), a1
-# CHECK-ENCODING: [0x07,0x64,0xb5,0x3a]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 64 b5 3a <unknown>
-
-vlsseg2e1024.v v8, (a0), a1, v0.t
-# CHECK-INST: vlsseg2e1024.v v8, (a0), a1, v0.t
-# CHECK-ENCODING: [0x07,0x74,0xb5,0x38]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 74 b5 38 <unknown>
-
-vlsseg2e1024.v v8, (a0), a1
-# CHECK-INST: vlsseg2e1024.v v8, (a0), a1
-# CHECK-ENCODING: [0x07,0x74,0xb5,0x3a]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 74 b5 3a <unknown>
-
 vluxseg2ei8.v v8, (a0), v4, v0.t
 # CHECK-INST: vluxseg2ei8.v v8, (a0), v4, v0.t
 # CHECK-ENCODING: [0x07,0x04,0x45,0x24]
@@ -444,54 +300,6 @@ vlseg3e64.v v8, (a0)
 # CHECK-ERROR: instruction requires the following: 'Zvlsseg'
 # CHECK-UNKNOWN: 07 74 05 42 <unknown>
 
-vlseg3e128.v v8, (a0), v0.t
-# CHECK-INST: vlseg3e128.v v8, (a0), v0.t
-# CHECK-ENCODING: [0x07,0x04,0x05,0x50]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 04 05 50 <unknown>
-
-vlseg3e128.v v8, (a0)
-# CHECK-INST: vlseg3e128.v v8, (a0)
-# CHECK-ENCODING: [0x07,0x04,0x05,0x52]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 04 05 52 <unknown>
-
-vlseg3e256.v v8, (a0), v0.t
-# CHECK-INST: vlseg3e256.v v8, (a0), v0.t
-# CHECK-ENCODING: [0x07,0x54,0x05,0x50]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 54 05 50 <unknown>
-
-vlseg3e256.v v8, (a0)
-# CHECK-INST: vlseg3e256.v v8, (a0)
-# CHECK-ENCODING: [0x07,0x54,0x05,0x52]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 54 05 52 <unknown>
-
-vlseg3e512.v v8, (a0), v0.t
-# CHECK-INST: vlseg3e512.v v8, (a0), v0.t
-# CHECK-ENCODING: [0x07,0x64,0x05,0x50]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 64 05 50 <unknown>
-
-vlseg3e512.v v8, (a0)
-# CHECK-INST: vlseg3e512.v v8, (a0)
-# CHECK-ENCODING: [0x07,0x64,0x05,0x52]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 64 05 52 <unknown>
-
-vlseg3e1024.v v8, (a0), v0.t
-# CHECK-INST: vlseg3e1024.v v8, (a0), v0.t
-# CHECK-ENCODING: [0x07,0x74,0x05,0x50]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 74 05 50 <unknown>
-
-vlseg3e1024.v v8, (a0)
-# CHECK-INST: vlseg3e1024.v v8, (a0)
-# CHECK-ENCODING: [0x07,0x74,0x05,0x52]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 74 05 52 <unknown>
-
 vlseg3e8ff.v v8, (a0), v0.t
 # CHECK-INST: vlseg3e8ff.v v8, (a0), v0.t
 # CHECK-ENCODING: [0x07,0x04,0x05,0x41]
@@ -540,54 +348,6 @@ vlseg3e64ff.v v8, (a0)
 # CHECK-ERROR: instruction requires the following: 'Zvlsseg'
 # CHECK-UNKNOWN: 07 74 05 43 <unknown>
 
-vlseg3e128ff.v v8, (a0), v0.t
-# CHECK-INST: vlseg3e128ff.v v8, (a0), v0.t
-# CHECK-ENCODING: [0x07,0x04,0x05,0x51]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 04 05 51 <unknown>
-
-vlseg3e128ff.v v8, (a0)
-# CHECK-INST: vlseg3e128ff.v v8, (a0)
-# CHECK-ENCODING: [0x07,0x04,0x05,0x53]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 04 05 53 <unknown>
-
-vlseg3e256ff.v v8, (a0), v0.t
-# CHECK-INST: vlseg3e256ff.v v8, (a0), v0.t
-# CHECK-ENCODING: [0x07,0x54,0x05,0x51]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 54 05 51 <unknown>
-
-vlseg3e256ff.v v8, (a0)
-# CHECK-INST: vlseg3e256ff.v v8, (a0)
-# CHECK-ENCODING: [0x07,0x54,0x05,0x53]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 54 05 53 <unknown>
-
-vlseg3e512ff.v v8, (a0), v0.t
-# CHECK-INST: vlseg3e512ff.v v8, (a0), v0.t
-# CHECK-ENCODING: [0x07,0x64,0x05,0x51]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 64 05 51 <unknown>
-
-vlseg3e512ff.v v8, (a0)
-# CHECK-INST: vlseg3e512ff.v v8, (a0)
-# CHECK-ENCODING: [0x07,0x64,0x05,0x53]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 64 05 53 <unknown>
-
-vlseg3e1024ff.v v8, (a0), v0.t
-# CHECK-INST: vlseg3e1024ff.v v8, (a0), v0.t
-# CHECK-ENCODING: [0x07,0x74,0x05,0x51]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 74 05 51 <unknown>
-
-vlseg3e1024ff.v v8, (a0)
-# CHECK-INST: vlseg3e1024ff.v v8, (a0)
-# CHECK-ENCODING: [0x07,0x74,0x05,0x53]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 74 05 53 <unknown>
-
 vlsseg3e8.v v8, (a0), a1, v0.t
 # CHECK-INST: vlsseg3e8.v v8, (a0), a1, v0.t
 # CHECK-ENCODING: [0x07,0x04,0xb5,0x48]
@@ -636,54 +396,6 @@ vlsseg3e64.v v8, (a0), a1
 # CHECK-ERROR: instruction requires the following: 'Zvlsseg'
 # CHECK-UNKNOWN: 07 74 b5 4a <unknown>
 
-vlsseg3e128.v v8, (a0), a1, v0.t
-# CHECK-INST: vlsseg3e128.v v8, (a0), a1, v0.t
-# CHECK-ENCODING: [0x07,0x04,0xb5,0x58]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 04 b5 58 <unknown>
-
-vlsseg3e128.v v8, (a0), a1
-# CHECK-INST: vlsseg3e128.v v8, (a0), a1
-# CHECK-ENCODING: [0x07,0x04,0xb5,0x5a]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 04 b5 5a <unknown>
-
-vlsseg3e256.v v8, (a0), a1, v0.t
-# CHECK-INST: vlsseg3e256.v v8, (a0), a1, v0.t
-# CHECK-ENCODING: [0x07,0x54,0xb5,0x58]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 54 b5 58 <unknown>
-
-vlsseg3e256.v v8, (a0), a1
-# CHECK-INST: vlsseg3e256.v v8, (a0), a1
-# CHECK-ENCODING: [0x07,0x54,0xb5,0x5a]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 54 b5 5a <unknown>
-
-vlsseg3e512.v v8, (a0), a1, v0.t
-# CHECK-INST: vlsseg3e512.v v8, (a0), a1, v0.t
-# CHECK-ENCODING: [0x07,0x64,0xb5,0x58]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 64 b5 58 <unknown>
-
-vlsseg3e512.v v8, (a0), a1
-# CHECK-INST: vlsseg3e512.v v8, (a0), a1
-# CHECK-ENCODING: [0x07,0x64,0xb5,0x5a]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 64 b5 5a <unknown>
-
-vlsseg3e1024.v v8, (a0), a1, v0.t
-# CHECK-INST: vlsseg3e1024.v v8, (a0), a1, v0.t
-# CHECK-ENCODING: [0x07,0x74,0xb5,0x58]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 74 b5 58 <unknown>
-
-vlsseg3e1024.v v8, (a0), a1
-# CHECK-INST: vlsseg3e1024.v v8, (a0), a1
-# CHECK-ENCODING: [0x07,0x74,0xb5,0x5a]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 74 b5 5a <unknown>
-
 vluxseg3ei8.v v8, (a0), v4, v0.t
 # CHECK-INST: vluxseg3ei8.v v8, (a0), v4, v0.t
 # CHECK-ENCODING: [0x07,0x04,0x45,0x44]
@@ -828,54 +540,6 @@ vlseg4e64.v v8, (a0)
 # CHECK-ERROR: instruction requires the following: 'Zvlsseg'
 # CHECK-UNKNOWN: 07 74 05 62 <unknown>
 
-vlseg4e128.v v8, (a0), v0.t
-# CHECK-INST: vlseg4e128.v v8, (a0), v0.t
-# CHECK-ENCODING: [0x07,0x04,0x05,0x70]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 04 05 70 <unknown>
-
-vlseg4e128.v v8, (a0)
-# CHECK-INST: vlseg4e128.v v8, (a0)
-# CHECK-ENCODING: [0x07,0x04,0x05,0x72]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 04 05 72 <unknown>
-
-vlseg4e256.v v8, (a0), v0.t
-# CHECK-INST: vlseg4e256.v v8, (a0), v0.t
-# CHECK-ENCODING: [0x07,0x54,0x05,0x70]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 54 05 70 <unknown>
-
-vlseg4e256.v v8, (a0)
-# CHECK-INST: vlseg4e256.v v8, (a0)
-# CHECK-ENCODING: [0x07,0x54,0x05,0x72]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 54 05 72 <unknown>
-
-vlseg4e512.v v8, (a0), v0.t
-# CHECK-INST: vlseg4e512.v v8, (a0), v0.t
-# CHECK-ENCODING: [0x07,0x64,0x05,0x70]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 64 05 70 <unknown>
-
-vlseg4e512.v v8, (a0)
-# CHECK-INST: vlseg4e512.v v8, (a0)
-# CHECK-ENCODING: [0x07,0x64,0x05,0x72]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 64 05 72 <unknown>
-
-vlseg4e1024.v v8, (a0), v0.t
-# CHECK-INST: vlseg4e1024.v v8, (a0), v0.t
-# CHECK-ENCODING: [0x07,0x74,0x05,0x70]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 74 05 70 <unknown>
-
-vlseg4e1024.v v8, (a0)
-# CHECK-INST: vlseg4e1024.v v8, (a0)
-# CHECK-ENCODING: [0x07,0x74,0x05,0x72]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 74 05 72 <unknown>
-
 vlseg4e8ff.v v8, (a0), v0.t
 # CHECK-INST: vlseg4e8ff.v v8, (a0), v0.t
 # CHECK-ENCODING: [0x07,0x04,0x05,0x61]
@@ -924,54 +588,6 @@ vlseg4e64ff.v v8, (a0)
 # CHECK-ERROR: instruction requires the following: 'Zvlsseg'
 # CHECK-UNKNOWN: 07 74 05 63 <unknown>
 
-vlseg4e128ff.v v8, (a0), v0.t
-# CHECK-INST: vlseg4e128ff.v v8, (a0), v0.t
-# CHECK-ENCODING: [0x07,0x04,0x05,0x71]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 04 05 71 <unknown>
-
-vlseg4e128ff.v v8, (a0)
-# CHECK-INST: vlseg4e128ff.v v8, (a0)
-# CHECK-ENCODING: [0x07,0x04,0x05,0x73]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 04 05 73 <unknown>
-
-vlseg4e256ff.v v8, (a0), v0.t
-# CHECK-INST: vlseg4e256ff.v v8, (a0), v0.t
-# CHECK-ENCODING: [0x07,0x54,0x05,0x71]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 54 05 71 <unknown>
-
-vlseg4e256ff.v v8, (a0)
-# CHECK-INST: vlseg4e256ff.v v8, (a0)
-# CHECK-ENCODING: [0x07,0x54,0x05,0x73]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 54 05 73 <unknown>
-
-vlseg4e512ff.v v8, (a0), v0.t
-# CHECK-INST: vlseg4e512ff.v v8, (a0), v0.t
-# CHECK-ENCODING: [0x07,0x64,0x05,0x71]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 64 05 71 <unknown>
-
-vlseg4e512ff.v v8, (a0)
-# CHECK-INST: vlseg4e512ff.v v8, (a0)
-# CHECK-ENCODING: [0x07,0x64,0x05,0x73]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 64 05 73 <unknown>
-
-vlseg4e1024ff.v v8, (a0), v0.t
-# CHECK-INST: vlseg4e1024ff.v v8, (a0), v0.t
-# CHECK-ENCODING: [0x07,0x74,0x05,0x71]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 74 05 71 <unknown>
-
-vlseg4e1024ff.v v8, (a0)
-# CHECK-INST: vlseg4e1024ff.v v8, (a0)
-# CHECK-ENCODING: [0x07,0x74,0x05,0x73]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 74 05 73 <unknown>
-
 vlsseg4e8.v v8, (a0), a1, v0.t
 # CHECK-INST: vlsseg4e8.v v8, (a0), a1, v0.t
 # CHECK-ENCODING: [0x07,0x04,0xb5,0x68]
@@ -1020,54 +636,6 @@ vlsseg4e64.v v8, (a0), a1
 # CHECK-ERROR: instruction requires the following: 'Zvlsseg'
 # CHECK-UNKNOWN: 07 74 b5 6a <unknown>
 
-vlsseg4e128.v v8, (a0), a1, v0.t
-# CHECK-INST: vlsseg4e128.v v8, (a0), a1, v0.t
-# CHECK-ENCODING: [0x07,0x04,0xb5,0x78]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 04 b5 78 <unknown>
-
-vlsseg4e128.v v8, (a0), a1
-# CHECK-INST: vlsseg4e128.v v8, (a0), a1
-# CHECK-ENCODING: [0x07,0x04,0xb5,0x7a]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 04 b5 7a <unknown>
-
-vlsseg4e256.v v8, (a0), a1, v0.t
-# CHECK-INST: vlsseg4e256.v v8, (a0), a1, v0.t
-# CHECK-ENCODING: [0x07,0x54,0xb5,0x78]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 54 b5 78 <unknown>
-
-vlsseg4e256.v v8, (a0), a1
-# CHECK-INST: vlsseg4e256.v v8, (a0), a1
-# CHECK-ENCODING: [0x07,0x54,0xb5,0x7a]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 54 b5 7a <unknown>
-
-vlsseg4e512.v v8, (a0), a1, v0.t
-# CHECK-INST: vlsseg4e512.v v8, (a0), a1, v0.t
-# CHECK-ENCODING: [0x07,0x64,0xb5,0x78]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 64 b5 78 <unknown>
-
-vlsseg4e512.v v8, (a0), a1
-# CHECK-INST: vlsseg4e512.v v8, (a0), a1
-# CHECK-ENCODING: [0x07,0x64,0xb5,0x7a]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 64 b5 7a <unknown>
-
-vlsseg4e1024.v v8, (a0), a1, v0.t
-# CHECK-INST: vlsseg4e1024.v v8, (a0), a1, v0.t
-# CHECK-ENCODING: [0x07,0x74,0xb5,0x78]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 74 b5 78 <unknown>
-
-vlsseg4e1024.v v8, (a0), a1
-# CHECK-INST: vlsseg4e1024.v v8, (a0), a1
-# CHECK-ENCODING: [0x07,0x74,0xb5,0x7a]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 74 b5 7a <unknown>
-
 vluxseg4ei8.v v8, (a0), v4, v0.t
 # CHECK-INST: vluxseg4ei8.v v8, (a0), v4, v0.t
 # CHECK-ENCODING: [0x07,0x04,0x45,0x64]
@@ -1212,54 +780,6 @@ vlseg5e64.v v8, (a0)
 # CHECK-ERROR: instruction requires the following: 'Zvlsseg'
 # CHECK-UNKNOWN: 07 74 05 82 <unknown>
 
-vlseg5e128.v v8, (a0), v0.t
-# CHECK-INST: vlseg5e128.v v8, (a0), v0.t
-# CHECK-ENCODING: [0x07,0x04,0x05,0x90]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 04 05 90 <unknown>
-
-vlseg5e128.v v8, (a0)
-# CHECK-INST: vlseg5e128.v v8, (a0)
-# CHECK-ENCODING: [0x07,0x04,0x05,0x92]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 04 05 92 <unknown>
-
-vlseg5e256.v v8, (a0), v0.t
-# CHECK-INST: vlseg5e256.v v8, (a0), v0.t
-# CHECK-ENCODING: [0x07,0x54,0x05,0x90]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 54 05 90 <unknown>
-
-vlseg5e256.v v8, (a0)
-# CHECK-INST: vlseg5e256.v v8, (a0)
-# CHECK-ENCODING: [0x07,0x54,0x05,0x92]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 54 05 92 <unknown>
-
-vlseg5e512.v v8, (a0), v0.t
-# CHECK-INST: vlseg5e512.v v8, (a0), v0.t
-# CHECK-ENCODING: [0x07,0x64,0x05,0x90]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 64 05 90 <unknown>
-
-vlseg5e512.v v8, (a0)
-# CHECK-INST: vlseg5e512.v v8, (a0)
-# CHECK-ENCODING: [0x07,0x64,0x05,0x92]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 64 05 92 <unknown>
-
-vlseg5e1024.v v8, (a0), v0.t
-# CHECK-INST: vlseg5e1024.v v8, (a0), v0.t
-# CHECK-ENCODING: [0x07,0x74,0x05,0x90]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 74 05 90 <unknown>
-
-vlseg5e1024.v v8, (a0)
-# CHECK-INST: vlseg5e1024.v v8, (a0)
-# CHECK-ENCODING: [0x07,0x74,0x05,0x92]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 74 05 92 <unknown>
-
 vlseg5e8ff.v v8, (a0), v0.t
 # CHECK-INST: vlseg5e8ff.v v8, (a0), v0.t
 # CHECK-ENCODING: [0x07,0x04,0x05,0x81]
@@ -1308,54 +828,6 @@ vlseg5e64ff.v v8, (a0)
 # CHECK-ERROR: instruction requires the following: 'Zvlsseg'
 # CHECK-UNKNOWN: 07 74 05 83 <unknown>
 
-vlseg5e128ff.v v8, (a0), v0.t
-# CHECK-INST: vlseg5e128ff.v v8, (a0), v0.t
-# CHECK-ENCODING: [0x07,0x04,0x05,0x91]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 04 05 91 <unknown>
-
-vlseg5e128ff.v v8, (a0)
-# CHECK-INST: vlseg5e128ff.v v8, (a0)
-# CHECK-ENCODING: [0x07,0x04,0x05,0x93]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 04 05 93 <unknown>
-
-vlseg5e256ff.v v8, (a0), v0.t
-# CHECK-INST: vlseg5e256ff.v v8, (a0), v0.t
-# CHECK-ENCODING: [0x07,0x54,0x05,0x91]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 54 05 91 <unknown>
-
-vlseg5e256ff.v v8, (a0)
-# CHECK-INST: vlseg5e256ff.v v8, (a0)
-# CHECK-ENCODING: [0x07,0x54,0x05,0x93]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 54 05 93 <unknown>
-
-vlseg5e512ff.v v8, (a0), v0.t
-# CHECK-INST: vlseg5e512ff.v v8, (a0), v0.t
-# CHECK-ENCODING: [0x07,0x64,0x05,0x91]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 64 05 91 <unknown>
-
-vlseg5e512ff.v v8, (a0)
-# CHECK-INST: vlseg5e512ff.v v8, (a0)
-# CHECK-ENCODING: [0x07,0x64,0x05,0x93]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 64 05 93 <unknown>
-
-vlseg5e1024ff.v v8, (a0), v0.t
-# CHECK-INST: vlseg5e1024ff.v v8, (a0), v0.t
-# CHECK-ENCODING: [0x07,0x74,0x05,0x91]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 74 05 91 <unknown>
-
-vlseg5e1024ff.v v8, (a0)
-# CHECK-INST: vlseg5e1024ff.v v8, (a0)
-# CHECK-ENCODING: [0x07,0x74,0x05,0x93]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 74 05 93 <unknown>
-
 vlsseg5e8.v v8, (a0), a1, v0.t
 # CHECK-INST: vlsseg5e8.v v8, (a0), a1, v0.t
 # CHECK-ENCODING: [0x07,0x04,0xb5,0x88]
@@ -1404,54 +876,6 @@ vlsseg5e64.v v8, (a0), a1
 # CHECK-ERROR: instruction requires the following: 'Zvlsseg'
 # CHECK-UNKNOWN: 07 74 b5 8a <unknown>
 
-vlsseg5e128.v v8, (a0), a1, v0.t
-# CHECK-INST: vlsseg5e128.v v8, (a0), a1, v0.t
-# CHECK-ENCODING: [0x07,0x04,0xb5,0x98]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 04 b5 98 <unknown>
-
-vlsseg5e128.v v8, (a0), a1
-# CHECK-INST: vlsseg5e128.v v8, (a0), a1
-# CHECK-ENCODING: [0x07,0x04,0xb5,0x9a]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 04 b5 9a <unknown>
-
-vlsseg5e256.v v8, (a0), a1, v0.t
-# CHECK-INST: vlsseg5e256.v v8, (a0), a1, v0.t
-# CHECK-ENCODING: [0x07,0x54,0xb5,0x98]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 54 b5 98 <unknown>
-
-vlsseg5e256.v v8, (a0), a1
-# CHECK-INST: vlsseg5e256.v v8, (a0), a1
-# CHECK-ENCODING: [0x07,0x54,0xb5,0x9a]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 54 b5 9a <unknown>
-
-vlsseg5e512.v v8, (a0), a1, v0.t
-# CHECK-INST: vlsseg5e512.v v8, (a0), a1, v0.t
-# CHECK-ENCODING: [0x07,0x64,0xb5,0x98]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 64 b5 98 <unknown>
-
-vlsseg5e512.v v8, (a0), a1
-# CHECK-INST: vlsseg5e512.v v8, (a0), a1
-# CHECK-ENCODING: [0x07,0x64,0xb5,0x9a]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 64 b5 9a <unknown>
-
-vlsseg5e1024.v v8, (a0), a1, v0.t
-# CHECK-INST: vlsseg5e1024.v v8, (a0), a1, v0.t
-# CHECK-ENCODING: [0x07,0x74,0xb5,0x98]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 74 b5 98 <unknown>
-
-vlsseg5e1024.v v8, (a0), a1
-# CHECK-INST: vlsseg5e1024.v v8, (a0), a1
-# CHECK-ENCODING: [0x07,0x74,0xb5,0x9a]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 74 b5 9a <unknown>
-
 vluxseg5ei8.v v8, (a0), v4, v0.t
 # CHECK-INST: vluxseg5ei8.v v8, (a0), v4, v0.t
 # CHECK-ENCODING: [0x07,0x04,0x45,0x84]
@@ -1596,54 +1020,6 @@ vlseg6e64.v v8, (a0)
 # CHECK-ERROR: instruction requires the following: 'Zvlsseg'
 # CHECK-UNKNOWN: 07 74 05 a2 <unknown>
 
-vlseg6e128.v v8, (a0), v0.t
-# CHECK-INST: vlseg6e128.v v8, (a0), v0.t
-# CHECK-ENCODING: [0x07,0x04,0x05,0xb0]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 04 05 b0 <unknown>
-
-vlseg6e128.v v8, (a0)
-# CHECK-INST: vlseg6e128.v v8, (a0)
-# CHECK-ENCODING: [0x07,0x04,0x05,0xb2]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 04 05 b2 <unknown>
-
-vlseg6e256.v v8, (a0), v0.t
-# CHECK-INST: vlseg6e256.v v8, (a0), v0.t
-# CHECK-ENCODING: [0x07,0x54,0x05,0xb0]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 54 05 b0 <unknown>
-
-vlseg6e256.v v8, (a0)
-# CHECK-INST: vlseg6e256.v v8, (a0)
-# CHECK-ENCODING: [0x07,0x54,0x05,0xb2]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 54 05 b2 <unknown>
-
-vlseg6e512.v v8, (a0), v0.t
-# CHECK-INST: vlseg6e512.v v8, (a0), v0.t
-# CHECK-ENCODING: [0x07,0x64,0x05,0xb0]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 64 05 b0 <unknown>
-
-vlseg6e512.v v8, (a0)
-# CHECK-INST: vlseg6e512.v v8, (a0)
-# CHECK-ENCODING: [0x07,0x64,0x05,0xb2]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 64 05 b2 <unknown>
-
-vlseg6e1024.v v8, (a0), v0.t
-# CHECK-INST: vlseg6e1024.v v8, (a0), v0.t
-# CHECK-ENCODING: [0x07,0x74,0x05,0xb0]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 74 05 b0 <unknown>
-
-vlseg6e1024.v v8, (a0)
-# CHECK-INST: vlseg6e1024.v v8, (a0)
-# CHECK-ENCODING: [0x07,0x74,0x05,0xb2]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 74 05 b2 <unknown>
-
 vlseg6e8ff.v v8, (a0), v0.t
 # CHECK-INST: vlseg6e8ff.v v8, (a0), v0.t
 # CHECK-ENCODING: [0x07,0x04,0x05,0xa1]
@@ -1692,54 +1068,6 @@ vlseg6e64ff.v v8, (a0)
 # CHECK-ERROR: instruction requires the following: 'Zvlsseg'
 # CHECK-UNKNOWN: 07 74 05 a3 <unknown>
 
-vlseg6e128ff.v v8, (a0), v0.t
-# CHECK-INST: vlseg6e128ff.v v8, (a0), v0.t
-# CHECK-ENCODING: [0x07,0x04,0x05,0xb1]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 04 05 b1 <unknown>
-
-vlseg6e128ff.v v8, (a0)
-# CHECK-INST: vlseg6e128ff.v v8, (a0)
-# CHECK-ENCODING: [0x07,0x04,0x05,0xb3]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 04 05 b3 <unknown>
-
-vlseg6e256ff.v v8, (a0), v0.t
-# CHECK-INST: vlseg6e256ff.v v8, (a0), v0.t
-# CHECK-ENCODING: [0x07,0x54,0x05,0xb1]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 54 05 b1 <unknown>
-
-vlseg6e256ff.v v8, (a0)
-# CHECK-INST: vlseg6e256ff.v v8, (a0)
-# CHECK-ENCODING: [0x07,0x54,0x05,0xb3]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 54 05 b3 <unknown>
-
-vlseg6e512ff.v v8, (a0), v0.t
-# CHECK-INST: vlseg6e512ff.v v8, (a0), v0.t
-# CHECK-ENCODING: [0x07,0x64,0x05,0xb1]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 64 05 b1 <unknown>
-
-vlseg6e512ff.v v8, (a0)
-# CHECK-INST: vlseg6e512ff.v v8, (a0)
-# CHECK-ENCODING: [0x07,0x64,0x05,0xb3]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 64 05 b3 <unknown>
-
-vlseg6e1024ff.v v8, (a0), v0.t
-# CHECK-INST: vlseg6e1024ff.v v8, (a0), v0.t
-# CHECK-ENCODING: [0x07,0x74,0x05,0xb1]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 74 05 b1 <unknown>
-
-vlseg6e1024ff.v v8, (a0)
-# CHECK-INST: vlseg6e1024ff.v v8, (a0)
-# CHECK-ENCODING: [0x07,0x74,0x05,0xb3]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 74 05 b3 <unknown>
-
 vlsseg6e8.v v8, (a0), a1, v0.t
 # CHECK-INST: vlsseg6e8.v v8, (a0), a1, v0.t
 # CHECK-ENCODING: [0x07,0x04,0xb5,0xa8]
@@ -1788,54 +1116,6 @@ vlsseg6e64.v v8, (a0), a1
 # CHECK-ERROR: instruction requires the following: 'Zvlsseg'
 # CHECK-UNKNOWN: 07 74 b5 aa <unknown>
 
-vlsseg6e128.v v8, (a0), a1, v0.t
-# CHECK-INST: vlsseg6e128.v v8, (a0), a1, v0.t
-# CHECK-ENCODING: [0x07,0x04,0xb5,0xb8]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 04 b5 b8 <unknown>
-
-vlsseg6e128.v v8, (a0), a1
-# CHECK-INST: vlsseg6e128.v v8, (a0), a1
-# CHECK-ENCODING: [0x07,0x04,0xb5,0xba]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 04 b5 ba <unknown>
-
-vlsseg6e256.v v8, (a0), a1, v0.t
-# CHECK-INST: vlsseg6e256.v v8, (a0), a1, v0.t
-# CHECK-ENCODING: [0x07,0x54,0xb5,0xb8]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 54 b5 b8 <unknown>
-
-vlsseg6e256.v v8, (a0), a1
-# CHECK-INST: vlsseg6e256.v v8, (a0), a1
-# CHECK-ENCODING: [0x07,0x54,0xb5,0xba]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 54 b5 ba <unknown>
-
-vlsseg6e512.v v8, (a0), a1, v0.t
-# CHECK-INST: vlsseg6e512.v v8, (a0), a1, v0.t
-# CHECK-ENCODING: [0x07,0x64,0xb5,0xb8]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 64 b5 b8 <unknown>
-
-vlsseg6e512.v v8, (a0), a1
-# CHECK-INST: vlsseg6e512.v v8, (a0), a1
-# CHECK-ENCODING: [0x07,0x64,0xb5,0xba]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 64 b5 ba <unknown>
-
-vlsseg6e1024.v v8, (a0), a1, v0.t
-# CHECK-INST: vlsseg6e1024.v v8, (a0), a1, v0.t
-# CHECK-ENCODING: [0x07,0x74,0xb5,0xb8]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 74 b5 b8 <unknown>
-
-vlsseg6e1024.v v8, (a0), a1
-# CHECK-INST: vlsseg6e1024.v v8, (a0), a1
-# CHECK-ENCODING: [0x07,0x74,0xb5,0xba]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 74 b5 ba <unknown>
-
 vluxseg6ei8.v v8, (a0), v4, v0.t
 # CHECK-INST: vluxseg6ei8.v v8, (a0), v4, v0.t
 # CHECK-ENCODING: [0x07,0x04,0x45,0xa4]
@@ -1980,54 +1260,6 @@ vlseg7e64.v v8, (a0)
 # CHECK-ERROR: instruction requires the following: 'Zvlsseg'
 # CHECK-UNKNOWN: 07 74 05 c2 <unknown>
 
-vlseg7e128.v v8, (a0), v0.t
-# CHECK-INST: vlseg7e128.v v8, (a0), v0.t
-# CHECK-ENCODING: [0x07,0x04,0x05,0xd0]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 04 05 d0 <unknown>
-
-vlseg7e128.v v8, (a0)
-# CHECK-INST: vlseg7e128.v v8, (a0)
-# CHECK-ENCODING: [0x07,0x04,0x05,0xd2]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 04 05 d2 <unknown>
-
-vlseg7e256.v v8, (a0), v0.t
-# CHECK-INST: vlseg7e256.v v8, (a0), v0.t
-# CHECK-ENCODING: [0x07,0x54,0x05,0xd0]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 54 05 d0 <unknown>
-
-vlseg7e256.v v8, (a0)
-# CHECK-INST: vlseg7e256.v v8, (a0)
-# CHECK-ENCODING: [0x07,0x54,0x05,0xd2]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 54 05 d2 <unknown>
-
-vlseg7e512.v v8, (a0), v0.t
-# CHECK-INST: vlseg7e512.v v8, (a0), v0.t
-# CHECK-ENCODING: [0x07,0x64,0x05,0xd0]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 64 05 d0 <unknown>
-
-vlseg7e512.v v8, (a0)
-# CHECK-INST: vlseg7e512.v v8, (a0)
-# CHECK-ENCODING: [0x07,0x64,0x05,0xd2]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 64 05 d2 <unknown>
-
-vlseg7e1024.v v8, (a0), v0.t
-# CHECK-INST: vlseg7e1024.v v8, (a0), v0.t
-# CHECK-ENCODING: [0x07,0x74,0x05,0xd0]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 74 05 d0 <unknown>
-
-vlseg7e1024.v v8, (a0)
-# CHECK-INST: vlseg7e1024.v v8, (a0)
-# CHECK-ENCODING: [0x07,0x74,0x05,0xd2]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 74 05 d2 <unknown>
-
 vlseg7e8ff.v v8, (a0), v0.t
 # CHECK-INST: vlseg7e8ff.v v8, (a0), v0.t
 # CHECK-ENCODING: [0x07,0x04,0x05,0xc1]
@@ -2076,54 +1308,6 @@ vlseg7e64ff.v v8, (a0)
 # CHECK-ERROR: instruction requires the following: 'Zvlsseg'
 # CHECK-UNKNOWN: 07 74 05 c3 <unknown>
 
-vlseg7e128ff.v v8, (a0), v0.t
-# CHECK-INST: vlseg7e128ff.v v8, (a0), v0.t
-# CHECK-ENCODING: [0x07,0x04,0x05,0xd1]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 04 05 d1 <unknown>
-
-vlseg7e128ff.v v8, (a0)
-# CHECK-INST: vlseg7e128ff.v v8, (a0)
-# CHECK-ENCODING: [0x07,0x04,0x05,0xd3]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 04 05 d3 <unknown>
-
-vlseg7e256ff.v v8, (a0), v0.t
-# CHECK-INST: vlseg7e256ff.v v8, (a0), v0.t
-# CHECK-ENCODING: [0x07,0x54,0x05,0xd1]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 54 05 d1 <unknown>
-
-vlseg7e256ff.v v8, (a0)
-# CHECK-INST: vlseg7e256ff.v v8, (a0)
-# CHECK-ENCODING: [0x07,0x54,0x05,0xd3]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 54 05 d3 <unknown>
-
-vlseg7e512ff.v v8, (a0), v0.t
-# CHECK-INST: vlseg7e512ff.v v8, (a0), v0.t
-# CHECK-ENCODING: [0x07,0x64,0x05,0xd1]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 64 05 d1 <unknown>
-
-vlseg7e512ff.v v8, (a0)
-# CHECK-INST: vlseg7e512ff.v v8, (a0)
-# CHECK-ENCODING: [0x07,0x64,0x05,0xd3]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 64 05 d3 <unknown>
-
-vlseg7e1024ff.v v8, (a0), v0.t
-# CHECK-INST: vlseg7e1024ff.v v8, (a0), v0.t
-# CHECK-ENCODING: [0x07,0x74,0x05,0xd1]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 74 05 d1 <unknown>
-
-vlseg7e1024ff.v v8, (a0)
-# CHECK-INST: vlseg7e1024ff.v v8, (a0)
-# CHECK-ENCODING: [0x07,0x74,0x05,0xd3]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 74 05 d3 <unknown>
-
 vlsseg7e8.v v8, (a0), a1, v0.t
 # CHECK-INST: vlsseg7e8.v v8, (a0), a1, v0.t
 # CHECK-ENCODING: [0x07,0x04,0xb5,0xc8]
@@ -2172,54 +1356,6 @@ vlsseg7e64.v v8, (a0), a1
 # CHECK-ERROR: instruction requires the following: 'Zvlsseg'
 # CHECK-UNKNOWN: 07 74 b5 ca <unknown>
 
-vlsseg7e128.v v8, (a0), a1, v0.t
-# CHECK-INST: vlsseg7e128.v v8, (a0), a1, v0.t
-# CHECK-ENCODING: [0x07,0x04,0xb5,0xd8]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 04 b5 d8 <unknown>
-
-vlsseg7e128.v v8, (a0), a1
-# CHECK-INST: vlsseg7e128.v v8, (a0), a1
-# CHECK-ENCODING: [0x07,0x04,0xb5,0xda]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 04 b5 da <unknown>
-
-vlsseg7e256.v v8, (a0), a1, v0.t
-# CHECK-INST: vlsseg7e256.v v8, (a0), a1, v0.t
-# CHECK-ENCODING: [0x07,0x54,0xb5,0xd8]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 54 b5 d8 <unknown>
-
-vlsseg7e256.v v8, (a0), a1
-# CHECK-INST: vlsseg7e256.v v8, (a0), a1
-# CHECK-ENCODING: [0x07,0x54,0xb5,0xda]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 54 b5 da <unknown>
-
-vlsseg7e512.v v8, (a0), a1, v0.t
-# CHECK-INST: vlsseg7e512.v v8, (a0), a1, v0.t
-# CHECK-ENCODING: [0x07,0x64,0xb5,0xd8]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 64 b5 d8 <unknown>
-
-vlsseg7e512.v v8, (a0), a1
-# CHECK-INST: vlsseg7e512.v v8, (a0), a1
-# CHECK-ENCODING: [0x07,0x64,0xb5,0xda]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 64 b5 da <unknown>
-
-vlsseg7e1024.v v8, (a0), a1, v0.t
-# CHECK-INST: vlsseg7e1024.v v8, (a0), a1, v0.t
-# CHECK-ENCODING: [0x07,0x74,0xb5,0xd8]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 74 b5 d8 <unknown>
-
-vlsseg7e1024.v v8, (a0), a1
-# CHECK-INST: vlsseg7e1024.v v8, (a0), a1
-# CHECK-ENCODING: [0x07,0x74,0xb5,0xda]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 74 b5 da <unknown>
-
 vluxseg7ei8.v v8, (a0), v4, v0.t
 # CHECK-INST: vluxseg7ei8.v v8, (a0), v4, v0.t
 # CHECK-ENCODING: [0x07,0x04,0x45,0xc4]
@@ -2364,54 +1500,6 @@ vlseg8e64.v v8, (a0)
 # CHECK-ERROR: instruction requires the following: 'Zvlsseg'
 # CHECK-UNKNOWN: 07 74 05 e2 <unknown>
 
-vlseg8e128.v v8, (a0), v0.t
-# CHECK-INST: vlseg8e128.v v8, (a0), v0.t
-# CHECK-ENCODING: [0x07,0x04,0x05,0xf0]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 04 05 f0 <unknown>
-
-vlseg8e128.v v8, (a0)
-# CHECK-INST: vlseg8e128.v v8, (a0)
-# CHECK-ENCODING: [0x07,0x04,0x05,0xf2]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 04 05 f2 <unknown>
-
-vlseg8e256.v v8, (a0), v0.t
-# CHECK-INST: vlseg8e256.v v8, (a0), v0.t
-# CHECK-ENCODING: [0x07,0x54,0x05,0xf0]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 54 05 f0 <unknown>
-
-vlseg8e256.v v8, (a0)
-# CHECK-INST: vlseg8e256.v v8, (a0)
-# CHECK-ENCODING: [0x07,0x54,0x05,0xf2]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 54 05 f2 <unknown>
-
-vlseg8e512.v v8, (a0), v0.t
-# CHECK-INST: vlseg8e512.v v8, (a0), v0.t
-# CHECK-ENCODING: [0x07,0x64,0x05,0xf0]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 64 05 f0 <unknown>
-
-vlseg8e512.v v8, (a0)
-# CHECK-INST: vlseg8e512.v v8, (a0)
-# CHECK-ENCODING: [0x07,0x64,0x05,0xf2]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 64 05 f2 <unknown>
-
-vlseg8e1024.v v8, (a0), v0.t
-# CHECK-INST: vlseg8e1024.v v8, (a0), v0.t
-# CHECK-ENCODING: [0x07,0x74,0x05,0xf0]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 74 05 f0 <unknown>
-
-vlseg8e1024.v v8, (a0)
-# CHECK-INST: vlseg8e1024.v v8, (a0)
-# CHECK-ENCODING: [0x07,0x74,0x05,0xf2]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 74 05 f2 <unknown>
-
 vlseg8e8ff.v v8, (a0), v0.t
 # CHECK-INST: vlseg8e8ff.v v8, (a0), v0.t
 # CHECK-ENCODING: [0x07,0x04,0x05,0xe1]
@@ -2460,54 +1548,6 @@ vlseg8e64ff.v v8, (a0)
 # CHECK-ERROR: instruction requires the following: 'Zvlsseg'
 # CHECK-UNKNOWN: 07 74 05 e3 <unknown>
 
-vlseg8e128ff.v v8, (a0), v0.t
-# CHECK-INST: vlseg8e128ff.v v8, (a0), v0.t
-# CHECK-ENCODING: [0x07,0x04,0x05,0xf1]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 04 05 f1 <unknown>
-
-vlseg8e128ff.v v8, (a0)
-# CHECK-INST: vlseg8e128ff.v v8, (a0)
-# CHECK-ENCODING: [0x07,0x04,0x05,0xf3]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 04 05 f3 <unknown>
-
-vlseg8e256ff.v v8, (a0), v0.t
-# CHECK-INST: vlseg8e256ff.v v8, (a0), v0.t
-# CHECK-ENCODING: [0x07,0x54,0x05,0xf1]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 54 05 f1 <unknown>
-
-vlseg8e256ff.v v8, (a0)
-# CHECK-INST: vlseg8e256ff.v v8, (a0)
-# CHECK-ENCODING: [0x07,0x54,0x05,0xf3]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 54 05 f3 <unknown>
-
-vlseg8e512ff.v v8, (a0), v0.t
-# CHECK-INST: vlseg8e512ff.v v8, (a0), v0.t
-# CHECK-ENCODING: [0x07,0x64,0x05,0xf1]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 64 05 f1 <unknown>
-
-vlseg8e512ff.v v8, (a0)
-# CHECK-INST: vlseg8e512ff.v v8, (a0)
-# CHECK-ENCODING: [0x07,0x64,0x05,0xf3]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 64 05 f3 <unknown>
-
-vlseg8e1024ff.v v8, (a0), v0.t
-# CHECK-INST: vlseg8e1024ff.v v8, (a0), v0.t
-# CHECK-ENCODING: [0x07,0x74,0x05,0xf1]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 74 05 f1 <unknown>
-
-vlseg8e1024ff.v v8, (a0)
-# CHECK-INST: vlseg8e1024ff.v v8, (a0)
-# CHECK-ENCODING: [0x07,0x74,0x05,0xf3]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 74 05 f3 <unknown>
-
 vlsseg8e8.v v8, (a0), a1, v0.t
 # CHECK-INST: vlsseg8e8.v v8, (a0), a1, v0.t
 # CHECK-ENCODING: [0x07,0x04,0xb5,0xe8]
@@ -2556,54 +1596,6 @@ vlsseg8e64.v v8, (a0), a1
 # CHECK-ERROR: instruction requires the following: 'Zvlsseg'
 # CHECK-UNKNOWN: 07 74 b5 ea <unknown>
 
-vlsseg8e128.v v8, (a0), a1, v0.t
-# CHECK-INST: vlsseg8e128.v v8, (a0), a1, v0.t
-# CHECK-ENCODING: [0x07,0x04,0xb5,0xf8]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 04 b5 f8 <unknown>
-
-vlsseg8e128.v v8, (a0), a1
-# CHECK-INST: vlsseg8e128.v v8, (a0), a1
-# CHECK-ENCODING: [0x07,0x04,0xb5,0xfa]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 04 b5 fa <unknown>
-
-vlsseg8e256.v v8, (a0), a1, v0.t
-# CHECK-INST: vlsseg8e256.v v8, (a0), a1, v0.t
-# CHECK-ENCODING: [0x07,0x54,0xb5,0xf8]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 54 b5 f8 <unknown>
-
-vlsseg8e256.v v8, (a0), a1
-# CHECK-INST: vlsseg8e256.v v8, (a0), a1
-# CHECK-ENCODING: [0x07,0x54,0xb5,0xfa]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 54 b5 fa <unknown>
-
-vlsseg8e512.v v8, (a0), a1, v0.t
-# CHECK-INST: vlsseg8e512.v v8, (a0), a1, v0.t
-# CHECK-ENCODING: [0x07,0x64,0xb5,0xf8]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 64 b5 f8 <unknown>
-
-vlsseg8e512.v v8, (a0), a1
-# CHECK-INST: vlsseg8e512.v v8, (a0), a1
-# CHECK-ENCODING: [0x07,0x64,0xb5,0xfa]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 64 b5 fa <unknown>
-
-vlsseg8e1024.v v8, (a0), a1, v0.t
-# CHECK-INST: vlsseg8e1024.v v8, (a0), a1, v0.t
-# CHECK-ENCODING: [0x07,0x74,0xb5,0xf8]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 74 b5 f8 <unknown>
-
-vlsseg8e1024.v v8, (a0), a1
-# CHECK-INST: vlsseg8e1024.v v8, (a0), a1
-# CHECK-ENCODING: [0x07,0x74,0xb5,0xfa]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 07 74 b5 fa <unknown>
-
 vluxseg8ei8.v v8, (a0), v4, v0.t
 # CHECK-INST: vluxseg8ei8.v v8, (a0), v4, v0.t
 # CHECK-ENCODING: [0x07,0x04,0x45,0xe4]
@@ -2748,54 +1740,6 @@ vsseg2e64.v v24, (a0)
 # CHECK-ERROR: instruction requires the following: 'Zvlsseg'
 # CHECK-UNKNOWN: 27 7c 05 22 <unknown>
 
-vsseg2e128.v v24, (a0), v0.t
-# CHECK-INST: vsseg2e128.v v24, (a0), v0.t
-# CHECK-ENCODING: [0x27,0x0c,0x05,0x30]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 27 0c 05 30 <unknown>
-
-vsseg2e128.v v24, (a0)
-# CHECK-INST: vsseg2e128.v v24, (a0)
-# CHECK-ENCODING: [0x27,0x0c,0x05,0x32]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 27 0c 05 32 <unknown>
-
-vsseg2e256.v v24, (a0), v0.t
-# CHECK-INST: vsseg2e256.v v24, (a0), v0.t
-# CHECK-ENCODING: [0x27,0x5c,0x05,0x30]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 27 5c 05 30 <unknown>
-
-vsseg2e256.v v24, (a0)
-# CHECK-INST: vsseg2e256.v v24, (a0)
-# CHECK-ENCODING: [0x27,0x5c,0x05,0x32]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 27 5c 05 32 <unknown>
-
-vsseg2e512.v v24, (a0), v0.t
-# CHECK-INST: vsseg2e512.v v24, (a0), v0.t
-# CHECK-ENCODING: [0x27,0x6c,0x05,0x30]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 27 6c 05 30 <unknown>
-
-vsseg2e512.v v24, (a0)
-# CHECK-INST: vsseg2e512.v v24, (a0)
-# CHECK-ENCODING: [0x27,0x6c,0x05,0x32]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 27 6c 05 32 <unknown>
-
-vsseg2e1024.v v24, (a0), v0.t
-# CHECK-INST: vsseg2e1024.v v24, (a0), v0.t
-# CHECK-ENCODING: [0x27,0x7c,0x05,0x30]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 27 7c 05 30 <unknown>
-
-vsseg2e1024.v v24, (a0)
-# CHECK-INST: vsseg2e1024.v v24, (a0)
-# CHECK-ENCODING: [0x27,0x7c,0x05,0x32]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 27 7c 05 32 <unknown>
-
 vssseg2e8.v v24, (a0), a1, v0.t
 # CHECK-INST: vssseg2e8.v v24, (a0), a1, v0.t
 # CHECK-ENCODING: [0x27,0x0c,0xb5,0x28]
@@ -2844,54 +1788,6 @@ vssseg2e64.v v24, (a0), a1
 # CHECK-ERROR: instruction requires the following: 'Zvlsseg'
 # CHECK-UNKNOWN: 27 7c b5 2a <unknown>
 
-vssseg2e128.v v24, (a0), a1, v0.t
-# CHECK-INST: vssseg2e128.v v24, (a0), a1, v0.t
-# CHECK-ENCODING: [0x27,0x0c,0xb5,0x38]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 27 0c b5 38 <unknown>
-
-vssseg2e128.v v24, (a0), a1
-# CHECK-INST: vssseg2e128.v v24, (a0), a1
-# CHECK-ENCODING: [0x27,0x0c,0xb5,0x3a]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 27 0c b5 3a <unknown>
-
-vssseg2e256.v v24, (a0), a1, v0.t
-# CHECK-INST: vssseg2e256.v v24, (a0), a1, v0.t
-# CHECK-ENCODING: [0x27,0x5c,0xb5,0x38]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 27 5c b5 38 <unknown>
-
-vssseg2e256.v v24, (a0), a1
-# CHECK-INST: vssseg2e256.v v24, (a0), a1
-# CHECK-ENCODING: [0x27,0x5c,0xb5,0x3a]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 27 5c b5 3a <unknown>
-
-vssseg2e512.v v24, (a0), a1, v0.t
-# CHECK-INST: vssseg2e512.v v24, (a0), a1, v0.t
-# CHECK-ENCODING: [0x27,0x6c,0xb5,0x38]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 27 6c b5 38 <unknown>
-
-vssseg2e512.v v24, (a0), a1
-# CHECK-INST: vssseg2e512.v v24, (a0), a1
-# CHECK-ENCODING: [0x27,0x6c,0xb5,0x3a]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 27 6c b5 3a <unknown>
-
-vssseg2e1024.v v24, (a0), a1, v0.t
-# CHECK-INST: vssseg2e1024.v v24, (a0), a1, v0.t
-# CHECK-ENCODING: [0x27,0x7c,0xb5,0x38]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 27 7c b5 38 <unknown>
-
-vssseg2e1024.v v24, (a0), a1
-# CHECK-INST: vssseg2e1024.v v24, (a0), a1
-# CHECK-ENCODING: [0x27,0x7c,0xb5,0x3a]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 27 7c b5 3a <unknown>
-
 vsuxseg2ei8.v v24, (a0), v4, v0.t
 # CHECK-INST: vsuxseg2ei8.v v24, (a0), v4, v0.t
 # CHECK-ENCODING: [0x27,0x0c,0x45,0x24]
@@ -3036,54 +1932,6 @@ vsseg3e64.v v24, (a0)
 # CHECK-ERROR: instruction requires the following: 'Zvlsseg'
 # CHECK-UNKNOWN: 27 7c 05 42 <unknown>
 
-vsseg3e128.v v24, (a0), v0.t
-# CHECK-INST: vsseg3e128.v v24, (a0), v0.t
-# CHECK-ENCODING: [0x27,0x0c,0x05,0x50]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 27 0c 05 50 <unknown>
-
-vsseg3e128.v v24, (a0)
-# CHECK-INST: vsseg3e128.v v24, (a0)
-# CHECK-ENCODING: [0x27,0x0c,0x05,0x52]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 27 0c 05 52 <unknown>
-
-vsseg3e256.v v24, (a0), v0.t
-# CHECK-INST: vsseg3e256.v v24, (a0), v0.t
-# CHECK-ENCODING: [0x27,0x5c,0x05,0x50]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 27 5c 05 50 <unknown>
-
-vsseg3e256.v v24, (a0)
-# CHECK-INST: vsseg3e256.v v24, (a0)
-# CHECK-ENCODING: [0x27,0x5c,0x05,0x52]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 27 5c 05 52 <unknown>
-
-vsseg3e512.v v24, (a0), v0.t
-# CHECK-INST: vsseg3e512.v v24, (a0), v0.t
-# CHECK-ENCODING: [0x27,0x6c,0x05,0x50]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 27 6c 05 50 <unknown>
-
-vsseg3e512.v v24, (a0)
-# CHECK-INST: vsseg3e512.v v24, (a0)
-# CHECK-ENCODING: [0x27,0x6c,0x05,0x52]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 27 6c 05 52 <unknown>
-
-vsseg3e1024.v v24, (a0), v0.t
-# CHECK-INST: vsseg3e1024.v v24, (a0), v0.t
-# CHECK-ENCODING: [0x27,0x7c,0x05,0x50]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 27 7c 05 50 <unknown>
-
-vsseg3e1024.v v24, (a0)
-# CHECK-INST: vsseg3e1024.v v24, (a0)
-# CHECK-ENCODING: [0x27,0x7c,0x05,0x52]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 27 7c 05 52 <unknown>
-
 vssseg3e8.v v24, (a0), a1, v0.t
 # CHECK-INST: vssseg3e8.v v24, (a0), a1, v0.t
 # CHECK-ENCODING: [0x27,0x0c,0xb5,0x48]
@@ -3132,54 +1980,6 @@ vssseg3e64.v v24, (a0), a1
 # CHECK-ERROR: instruction requires the following: 'Zvlsseg'
 # CHECK-UNKNOWN: 27 7c b5 4a <unknown>
 
-vssseg3e128.v v24, (a0), a1, v0.t
-# CHECK-INST: vssseg3e128.v v24, (a0), a1, v0.t
-# CHECK-ENCODING: [0x27,0x0c,0xb5,0x58]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 27 0c b5 58 <unknown>
-
-vssseg3e128.v v24, (a0), a1
-# CHECK-INST: vssseg3e128.v v24, (a0), a1
-# CHECK-ENCODING: [0x27,0x0c,0xb5,0x5a]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 27 0c b5 5a <unknown>
-
-vssseg3e256.v v24, (a0), a1, v0.t
-# CHECK-INST: vssseg3e256.v v24, (a0), a1, v0.t
-# CHECK-ENCODING: [0x27,0x5c,0xb5,0x58]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 27 5c b5 58 <unknown>
-
-vssseg3e256.v v24, (a0), a1
-# CHECK-INST: vssseg3e256.v v24, (a0), a1
-# CHECK-ENCODING: [0x27,0x5c,0xb5,0x5a]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 27 5c b5 5a <unknown>
-
-vssseg3e512.v v24, (a0), a1, v0.t
-# CHECK-INST: vssseg3e512.v v24, (a0), a1, v0.t
-# CHECK-ENCODING: [0x27,0x6c,0xb5,0x58]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 27 6c b5 58 <unknown>
-
-vssseg3e512.v v24, (a0), a1
-# CHECK-INST: vssseg3e512.v v24, (a0), a1
-# CHECK-ENCODING: [0x27,0x6c,0xb5,0x5a]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 27 6c b5 5a <unknown>
-
-vssseg3e1024.v v24, (a0), a1, v0.t
-# CHECK-INST: vssseg3e1024.v v24, (a0), a1, v0.t
-# CHECK-ENCODING: [0x27,0x7c,0xb5,0x58]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 27 7c b5 58 <unknown>
-
-vssseg3e1024.v v24, (a0), a1
-# CHECK-INST: vssseg3e1024.v v24, (a0), a1
-# CHECK-ENCODING: [0x27,0x7c,0xb5,0x5a]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 27 7c b5 5a <unknown>
-
 vsuxseg3ei8.v v24, (a0), v4, v0.t
 # CHECK-INST: vsuxseg3ei8.v v24, (a0), v4, v0.t
 # CHECK-ENCODING: [0x27,0x0c,0x45,0x44]
@@ -3324,54 +2124,6 @@ vsseg4e64.v v24, (a0)
 # CHECK-ERROR: instruction requires the following: 'Zvlsseg'
 # CHECK-UNKNOWN: 27 7c 05 62 <unknown>
 
-vsseg4e128.v v24, (a0), v0.t
-# CHECK-INST: vsseg4e128.v v24, (a0), v0.t
-# CHECK-ENCODING: [0x27,0x0c,0x05,0x70]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 27 0c 05 70 <unknown>
-
-vsseg4e128.v v24, (a0)
-# CHECK-INST: vsseg4e128.v v24, (a0)
-# CHECK-ENCODING: [0x27,0x0c,0x05,0x72]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 27 0c 05 72 <unknown>
-
-vsseg4e256.v v24, (a0), v0.t
-# CHECK-INST: vsseg4e256.v v24, (a0), v0.t
-# CHECK-ENCODING: [0x27,0x5c,0x05,0x70]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 27 5c 05 70 <unknown>
-
-vsseg4e256.v v24, (a0)
-# CHECK-INST: vsseg4e256.v v24, (a0)
-# CHECK-ENCODING: [0x27,0x5c,0x05,0x72]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 27 5c 05 72 <unknown>
-
-vsseg4e512.v v24, (a0), v0.t
-# CHECK-INST: vsseg4e512.v v24, (a0), v0.t
-# CHECK-ENCODING: [0x27,0x6c,0x05,0x70]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 27 6c 05 70 <unknown>
-
-vsseg4e512.v v24, (a0)
-# CHECK-INST: vsseg4e512.v v24, (a0)
-# CHECK-ENCODING: [0x27,0x6c,0x05,0x72]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 27 6c 05 72 <unknown>
-
-vsseg4e1024.v v24, (a0), v0.t
-# CHECK-INST: vsseg4e1024.v v24, (a0), v0.t
-# CHECK-ENCODING: [0x27,0x7c,0x05,0x70]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 27 7c 05 70 <unknown>
-
-vsseg4e1024.v v24, (a0)
-# CHECK-INST: vsseg4e1024.v v24, (a0)
-# CHECK-ENCODING: [0x27,0x7c,0x05,0x72]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 27 7c 05 72 <unknown>
-
 vssseg4e8.v v24, (a0), a1, v0.t
 # CHECK-INST: vssseg4e8.v v24, (a0), a1, v0.t
 # CHECK-ENCODING: [0x27,0x0c,0xb5,0x68]
@@ -3420,54 +2172,6 @@ vssseg4e64.v v24, (a0), a1
 # CHECK-ERROR: instruction requires the following: 'Zvlsseg'
 # CHECK-UNKNOWN: 27 7c b5 6a <unknown>
 
-vssseg4e128.v v24, (a0), a1, v0.t
-# CHECK-INST: vssseg4e128.v v24, (a0), a1, v0.t
-# CHECK-ENCODING: [0x27,0x0c,0xb5,0x78]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 27 0c b5 78 <unknown>
-
-vssseg4e128.v v24, (a0), a1
-# CHECK-INST: vssseg4e128.v v24, (a0), a1
-# CHECK-ENCODING: [0x27,0x0c,0xb5,0x7a]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 27 0c b5 7a <unknown>
-
-vssseg4e256.v v24, (a0), a1, v0.t
-# CHECK-INST: vssseg4e256.v v24, (a0), a1, v0.t
-# CHECK-ENCODING: [0x27,0x5c,0xb5,0x78]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 27 5c b5 78 <unknown>
-
-vssseg4e256.v v24, (a0), a1
-# CHECK-INST: vssseg4e256.v v24, (a0), a1
-# CHECK-ENCODING: [0x27,0x5c,0xb5,0x7a]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 27 5c b5 7a <unknown>
-
-vssseg4e512.v v24, (a0), a1, v0.t
-# CHECK-INST: vssseg4e512.v v24, (a0), a1, v0.t
-# CHECK-ENCODING: [0x27,0x6c,0xb5,0x78]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 27 6c b5 78 <unknown>
-
-vssseg4e512.v v24, (a0), a1
-# CHECK-INST: vssseg4e512.v v24, (a0), a1
-# CHECK-ENCODING: [0x27,0x6c,0xb5,0x7a]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 27 6c b5 7a <unknown>
-
-vssseg4e1024.v v24, (a0), a1, v0.t
-# CHECK-INST: vssseg4e1024.v v24, (a0), a1, v0.t
-# CHECK-ENCODING: [0x27,0x7c,0xb5,0x78]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 27 7c b5 78 <unknown>
-
-vssseg4e1024.v v24, (a0), a1
-# CHECK-INST: vssseg4e1024.v v24, (a0), a1
-# CHECK-ENCODING: [0x27,0x7c,0xb5,0x7a]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 27 7c b5 7a <unknown>
-
 vsuxseg4ei8.v v24, (a0), v4, v0.t
 # CHECK-INST: vsuxseg4ei8.v v24, (a0), v4, v0.t
 # CHECK-ENCODING: [0x27,0x0c,0x45,0x64]
@@ -3612,54 +2316,6 @@ vsseg5e64.v v24, (a0)
 # CHECK-ERROR: instruction requires the following: 'Zvlsseg'
 # CHECK-UNKNOWN: 27 7c 05 82 <unknown>
 
-vsseg5e128.v v24, (a0), v0.t
-# CHECK-INST: vsseg5e128.v v24, (a0), v0.t
-# CHECK-ENCODING: [0x27,0x0c,0x05,0x90]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 27 0c 05 90 <unknown>
-
-vsseg5e128.v v24, (a0)
-# CHECK-INST: vsseg5e128.v v24, (a0)
-# CHECK-ENCODING: [0x27,0x0c,0x05,0x92]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 27 0c 05 92 <unknown>
-
-vsseg5e256.v v24, (a0), v0.t
-# CHECK-INST: vsseg5e256.v v24, (a0), v0.t
-# CHECK-ENCODING: [0x27,0x5c,0x05,0x90]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 27 5c 05 90 <unknown>
-
-vsseg5e256.v v24, (a0)
-# CHECK-INST: vsseg5e256.v v24, (a0)
-# CHECK-ENCODING: [0x27,0x5c,0x05,0x92]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 27 5c 05 92 <unknown>
-
-vsseg5e512.v v24, (a0), v0.t
-# CHECK-INST: vsseg5e512.v v24, (a0), v0.t
-# CHECK-ENCODING: [0x27,0x6c,0x05,0x90]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 27 6c 05 90 <unknown>
-
-vsseg5e512.v v24, (a0)
-# CHECK-INST: vsseg5e512.v v24, (a0)
-# CHECK-ENCODING: [0x27,0x6c,0x05,0x92]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 27 6c 05 92 <unknown>
-
-vsseg5e1024.v v24, (a0), v0.t
-# CHECK-INST: vsseg5e1024.v v24, (a0), v0.t
-# CHECK-ENCODING: [0x27,0x7c,0x05,0x90]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 27 7c 05 90 <unknown>
-
-vsseg5e1024.v v24, (a0)
-# CHECK-INST: vsseg5e1024.v v24, (a0)
-# CHECK-ENCODING: [0x27,0x7c,0x05,0x92]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 27 7c 05 92 <unknown>
-
 vssseg5e8.v v24, (a0), a1, v0.t
 # CHECK-INST: vssseg5e8.v v24, (a0), a1, v0.t
 # CHECK-ENCODING: [0x27,0x0c,0xb5,0x88]
@@ -3708,54 +2364,6 @@ vssseg5e64.v v24, (a0), a1
 # CHECK-ERROR: instruction requires the following: 'Zvlsseg'
 # CHECK-UNKNOWN: 27 7c b5 8a <unknown>
 
-vssseg5e128.v v24, (a0), a1, v0.t
-# CHECK-INST: vssseg5e128.v v24, (a0), a1, v0.t
-# CHECK-ENCODING: [0x27,0x0c,0xb5,0x98]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 27 0c b5 98 <unknown>
-
-vssseg5e128.v v24, (a0), a1
-# CHECK-INST: vssseg5e128.v v24, (a0), a1
-# CHECK-ENCODING: [0x27,0x0c,0xb5,0x9a]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 27 0c b5 9a <unknown>
-
-vssseg5e256.v v24, (a0), a1, v0.t
-# CHECK-INST: vssseg5e256.v v24, (a0), a1, v0.t
-# CHECK-ENCODING: [0x27,0x5c,0xb5,0x98]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 27 5c b5 98 <unknown>
-
-vssseg5e256.v v24, (a0), a1
-# CHECK-INST: vssseg5e256.v v24, (a0), a1
-# CHECK-ENCODING: [0x27,0x5c,0xb5,0x9a]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 27 5c b5 9a <unknown>
-
-vssseg5e512.v v24, (a0), a1, v0.t
-# CHECK-INST: vssseg5e512.v v24, (a0), a1, v0.t
-# CHECK-ENCODING: [0x27,0x6c,0xb5,0x98]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 27 6c b5 98 <unknown>
-
-vssseg5e512.v v24, (a0), a1
-# CHECK-INST: vssseg5e512.v v24, (a0), a1
-# CHECK-ENCODING: [0x27,0x6c,0xb5,0x9a]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 27 6c b5 9a <unknown>
-
-vssseg5e1024.v v24, (a0), a1, v0.t
-# CHECK-INST: vssseg5e1024.v v24, (a0), a1, v0.t
-# CHECK-ENCODING: [0x27,0x7c,0xb5,0x98]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 27 7c b5 98 <unknown>
-
-vssseg5e1024.v v24, (a0), a1
-# CHECK-INST: vssseg5e1024.v v24, (a0), a1
-# CHECK-ENCODING: [0x27,0x7c,0xb5,0x9a]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 27 7c b5 9a <unknown>
-
 vsuxseg5ei8.v v24, (a0), v4, v0.t
 # CHECK-INST: vsuxseg5ei8.v v24, (a0), v4, v0.t
 # CHECK-ENCODING: [0x27,0x0c,0x45,0x84]
@@ -3900,54 +2508,6 @@ vsseg6e64.v v24, (a0)
 # CHECK-ERROR: instruction requires the following: 'Zvlsseg'
 # CHECK-UNKNOWN: 27 7c 05 a2 <unknown>
 
-vsseg6e128.v v24, (a0), v0.t
-# CHECK-INST: vsseg6e128.v v24, (a0), v0.t
-# CHECK-ENCODING: [0x27,0x0c,0x05,0xb0]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 27 0c 05 b0 <unknown>
-
-vsseg6e128.v v24, (a0)
-# CHECK-INST: vsseg6e128.v v24, (a0)
-# CHECK-ENCODING: [0x27,0x0c,0x05,0xb2]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 27 0c 05 b2 <unknown>
-
-vsseg6e256.v v24, (a0), v0.t
-# CHECK-INST: vsseg6e256.v v24, (a0), v0.t
-# CHECK-ENCODING: [0x27,0x5c,0x05,0xb0]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 27 5c 05 b0 <unknown>
-
-vsseg6e256.v v24, (a0)
-# CHECK-INST: vsseg6e256.v v24, (a0)
-# CHECK-ENCODING: [0x27,0x5c,0x05,0xb2]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 27 5c 05 b2 <unknown>
-
-vsseg6e512.v v24, (a0), v0.t
-# CHECK-INST: vsseg6e512.v v24, (a0), v0.t
-# CHECK-ENCODING: [0x27,0x6c,0x05,0xb0]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 27 6c 05 b0 <unknown>
-
-vsseg6e512.v v24, (a0)
-# CHECK-INST: vsseg6e512.v v24, (a0)
-# CHECK-ENCODING: [0x27,0x6c,0x05,0xb2]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 27 6c 05 b2 <unknown>
-
-vsseg6e1024.v v24, (a0), v0.t
-# CHECK-INST: vsseg6e1024.v v24, (a0), v0.t
-# CHECK-ENCODING: [0x27,0x7c,0x05,0xb0]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 27 7c 05 b0 <unknown>
-
-vsseg6e1024.v v24, (a0)
-# CHECK-INST: vsseg6e1024.v v24, (a0)
-# CHECK-ENCODING: [0x27,0x7c,0x05,0xb2]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 27 7c 05 b2 <unknown>
-
 vssseg6e8.v v24, (a0), a1, v0.t
 # CHECK-INST: vssseg6e8.v v24, (a0), a1, v0.t
 # CHECK-ENCODING: [0x27,0x0c,0xb5,0xa8]
@@ -3996,54 +2556,6 @@ vssseg6e64.v v24, (a0), a1
 # CHECK-ERROR: instruction requires the following: 'Zvlsseg'
 # CHECK-UNKNOWN: 27 7c b5 aa <unknown>
 
-vssseg6e128.v v24, (a0), a1, v0.t
-# CHECK-INST: vssseg6e128.v v24, (a0), a1, v0.t
-# CHECK-ENCODING: [0x27,0x0c,0xb5,0xb8]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 27 0c b5 b8 <unknown>
-
-vssseg6e128.v v24, (a0), a1
-# CHECK-INST: vssseg6e128.v v24, (a0), a1
-# CHECK-ENCODING: [0x27,0x0c,0xb5,0xba]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 27 0c b5 ba <unknown>
-
-vssseg6e256.v v24, (a0), a1, v0.t
-# CHECK-INST: vssseg6e256.v v24, (a0), a1, v0.t
-# CHECK-ENCODING: [0x27,0x5c,0xb5,0xb8]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 27 5c b5 b8 <unknown>
-
-vssseg6e256.v v24, (a0), a1
-# CHECK-INST: vssseg6e256.v v24, (a0), a1
-# CHECK-ENCODING: [0x27,0x5c,0xb5,0xba]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 27 5c b5 ba <unknown>
-
-vssseg6e512.v v24, (a0), a1, v0.t
-# CHECK-INST: vssseg6e512.v v24, (a0), a1, v0.t
-# CHECK-ENCODING: [0x27,0x6c,0xb5,0xb8]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 27 6c b5 b8 <unknown>
-
-vssseg6e512.v v24, (a0), a1
-# CHECK-INST: vssseg6e512.v v24, (a0), a1
-# CHECK-ENCODING: [0x27,0x6c,0xb5,0xba]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 27 6c b5 ba <unknown>
-
-vssseg6e1024.v v24, (a0), a1, v0.t
-# CHECK-INST: vssseg6e1024.v v24, (a0), a1, v0.t
-# CHECK-ENCODING: [0x27,0x7c,0xb5,0xb8]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 27 7c b5 b8 <unknown>
-
-vssseg6e1024.v v24, (a0), a1
-# CHECK-INST: vssseg6e1024.v v24, (a0), a1
-# CHECK-ENCODING: [0x27,0x7c,0xb5,0xba]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 27 7c b5 ba <unknown>
-
 vsuxseg6ei8.v v24, (a0), v4, v0.t
 # CHECK-INST: vsuxseg6ei8.v v24, (a0), v4, v0.t
 # CHECK-ENCODING: [0x27,0x0c,0x45,0xa4]
@@ -4188,54 +2700,6 @@ vsseg7e64.v v24, (a0)
 # CHECK-ERROR: instruction requires the following: 'Zvlsseg'
 # CHECK-UNKNOWN: 27 7c 05 c2 <unknown>
 
-vsseg7e128.v v24, (a0), v0.t
-# CHECK-INST: vsseg7e128.v v24, (a0), v0.t
-# CHECK-ENCODING: [0x27,0x0c,0x05,0xd0]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 27 0c 05 d0 <unknown>
-
-vsseg7e128.v v24, (a0)
-# CHECK-INST: vsseg7e128.v v24, (a0)
-# CHECK-ENCODING: [0x27,0x0c,0x05,0xd2]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 27 0c 05 d2 <unknown>
-
-vsseg7e256.v v24, (a0), v0.t
-# CHECK-INST: vsseg7e256.v v24, (a0), v0.t
-# CHECK-ENCODING: [0x27,0x5c,0x05,0xd0]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 27 5c 05 d0 <unknown>
-
-vsseg7e256.v v24, (a0)
-# CHECK-INST: vsseg7e256.v v24, (a0)
-# CHECK-ENCODING: [0x27,0x5c,0x05,0xd2]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 27 5c 05 d2 <unknown>
-
-vsseg7e512.v v24, (a0), v0.t
-# CHECK-INST: vsseg7e512.v v24, (a0), v0.t
-# CHECK-ENCODING: [0x27,0x6c,0x05,0xd0]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 27 6c 05 d0 <unknown>
-
-vsseg7e512.v v24, (a0)
-# CHECK-INST: vsseg7e512.v v24, (a0)
-# CHECK-ENCODING: [0x27,0x6c,0x05,0xd2]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 27 6c 05 d2 <unknown>
-
-vsseg7e1024.v v24, (a0), v0.t
-# CHECK-INST: vsseg7e1024.v v24, (a0), v0.t
-# CHECK-ENCODING: [0x27,0x7c,0x05,0xd0]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 27 7c 05 d0 <unknown>
-
-vsseg7e1024.v v24, (a0)
-# CHECK-INST: vsseg7e1024.v v24, (a0)
-# CHECK-ENCODING: [0x27,0x7c,0x05,0xd2]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 27 7c 05 d2 <unknown>
-
 vssseg7e8.v v24, (a0), a1, v0.t
 # CHECK-INST: vssseg7e8.v v24, (a0), a1, v0.t
 # CHECK-ENCODING: [0x27,0x0c,0xb5,0xc8]
@@ -4284,54 +2748,6 @@ vssseg7e64.v v24, (a0), a1
 # CHECK-ERROR: instruction requires the following: 'Zvlsseg'
 # CHECK-UNKNOWN: 27 7c b5 ca <unknown>
 
-vssseg7e128.v v24, (a0), a1, v0.t
-# CHECK-INST: vssseg7e128.v v24, (a0), a1, v0.t
-# CHECK-ENCODING: [0x27,0x0c,0xb5,0xd8]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 27 0c b5 d8 <unknown>
-
-vssseg7e128.v v24, (a0), a1
-# CHECK-INST: vssseg7e128.v v24, (a0), a1
-# CHECK-ENCODING: [0x27,0x0c,0xb5,0xda]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 27 0c b5 da <unknown>
-
-vssseg7e256.v v24, (a0), a1, v0.t
-# CHECK-INST: vssseg7e256.v v24, (a0), a1, v0.t
-# CHECK-ENCODING: [0x27,0x5c,0xb5,0xd8]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 27 5c b5 d8 <unknown>
-
-vssseg7e256.v v24, (a0), a1
-# CHECK-INST: vssseg7e256.v v24, (a0), a1
-# CHECK-ENCODING: [0x27,0x5c,0xb5,0xda]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 27 5c b5 da <unknown>
-
-vssseg7e512.v v24, (a0), a1, v0.t
-# CHECK-INST: vssseg7e512.v v24, (a0), a1, v0.t
-# CHECK-ENCODING: [0x27,0x6c,0xb5,0xd8]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 27 6c b5 d8 <unknown>
-
-vssseg7e512.v v24, (a0), a1
-# CHECK-INST: vssseg7e512.v v24, (a0), a1
-# CHECK-ENCODING: [0x27,0x6c,0xb5,0xda]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 27 6c b5 da <unknown>
-
-vssseg7e1024.v v24, (a0), a1, v0.t
-# CHECK-INST: vssseg7e1024.v v24, (a0), a1, v0.t
-# CHECK-ENCODING: [0x27,0x7c,0xb5,0xd8]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 27 7c b5 d8 <unknown>
-
-vssseg7e1024.v v24, (a0), a1
-# CHECK-INST: vssseg7e1024.v v24, (a0), a1
-# CHECK-ENCODING: [0x27,0x7c,0xb5,0xda]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 27 7c b5 da <unknown>
-
 vsuxseg7ei8.v v24, (a0), v4, v0.t
 # CHECK-INST: vsuxseg7ei8.v v24, (a0), v4, v0.t
 # CHECK-ENCODING: [0x27,0x0c,0x45,0xc4]
@@ -4476,54 +2892,6 @@ vsseg8e64.v v24, (a0)
 # CHECK-ERROR: instruction requires the following: 'Zvlsseg'
 # CHECK-UNKNOWN: 27 7c 05 e2 <unknown>
 
-vsseg8e128.v v24, (a0), v0.t
-# CHECK-INST: vsseg8e128.v v24, (a0), v0.t
-# CHECK-ENCODING: [0x27,0x0c,0x05,0xf0]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 27 0c 05 f0 <unknown>
-
-vsseg8e128.v v24, (a0)
-# CHECK-INST: vsseg8e128.v v24, (a0)
-# CHECK-ENCODING: [0x27,0x0c,0x05,0xf2]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 27 0c 05 f2 <unknown>
-
-vsseg8e256.v v24, (a0), v0.t
-# CHECK-INST: vsseg8e256.v v24, (a0), v0.t
-# CHECK-ENCODING: [0x27,0x5c,0x05,0xf0]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 27 5c 05 f0 <unknown>
-
-vsseg8e256.v v24, (a0)
-# CHECK-INST: vsseg8e256.v v24, (a0)
-# CHECK-ENCODING: [0x27,0x5c,0x05,0xf2]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 27 5c 05 f2 <unknown>
-
-vsseg8e512.v v24, (a0), v0.t
-# CHECK-INST: vsseg8e512.v v24, (a0), v0.t
-# CHECK-ENCODING: [0x27,0x6c,0x05,0xf0]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 27 6c 05 f0 <unknown>
-
-vsseg8e512.v v24, (a0)
-# CHECK-INST: vsseg8e512.v v24, (a0)
-# CHECK-ENCODING: [0x27,0x6c,0x05,0xf2]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 27 6c 05 f2 <unknown>
-
-vsseg8e1024.v v24, (a0), v0.t
-# CHECK-INST: vsseg8e1024.v v24, (a0), v0.t
-# CHECK-ENCODING: [0x27,0x7c,0x05,0xf0]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 27 7c 05 f0 <unknown>
-
-vsseg8e1024.v v24, (a0)
-# CHECK-INST: vsseg8e1024.v v24, (a0)
-# CHECK-ENCODING: [0x27,0x7c,0x05,0xf2]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 27 7c 05 f2 <unknown>
-
 vssseg8e8.v v24, (a0), a1, v0.t
 # CHECK-INST: vssseg8e8.v v24, (a0), a1, v0.t
 # CHECK-ENCODING: [0x27,0x0c,0xb5,0xe8]
@@ -4572,54 +2940,6 @@ vssseg8e64.v v24, (a0), a1
 # CHECK-ERROR: instruction requires the following: 'Zvlsseg'
 # CHECK-UNKNOWN: 27 7c b5 ea <unknown>
 
-vssseg8e128.v v24, (a0), a1, v0.t
-# CHECK-INST: vssseg8e128.v v24, (a0), a1, v0.t
-# CHECK-ENCODING: [0x27,0x0c,0xb5,0xf8]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 27 0c b5 f8 <unknown>
-
-vssseg8e128.v v24, (a0), a1
-# CHECK-INST: vssseg8e128.v v24, (a0), a1
-# CHECK-ENCODING: [0x27,0x0c,0xb5,0xfa]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 27 0c b5 fa <unknown>
-
-vssseg8e256.v v24, (a0), a1, v0.t
-# CHECK-INST: vssseg8e256.v v24, (a0), a1, v0.t
-# CHECK-ENCODING: [0x27,0x5c,0xb5,0xf8]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 27 5c b5 f8 <unknown>
-
-vssseg8e256.v v24, (a0), a1
-# CHECK-INST: vssseg8e256.v v24, (a0), a1
-# CHECK-ENCODING: [0x27,0x5c,0xb5,0xfa]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 27 5c b5 fa <unknown>
-
-vssseg8e512.v v24, (a0), a1, v0.t
-# CHECK-INST: vssseg8e512.v v24, (a0), a1, v0.t
-# CHECK-ENCODING: [0x27,0x6c,0xb5,0xf8]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 27 6c b5 f8 <unknown>
-
-vssseg8e512.v v24, (a0), a1
-# CHECK-INST: vssseg8e512.v v24, (a0), a1
-# CHECK-ENCODING: [0x27,0x6c,0xb5,0xfa]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 27 6c b5 fa <unknown>
-
-vssseg8e1024.v v24, (a0), a1, v0.t
-# CHECK-INST: vssseg8e1024.v v24, (a0), a1, v0.t
-# CHECK-ENCODING: [0x27,0x7c,0xb5,0xf8]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 27 7c b5 f8 <unknown>
-
-vssseg8e1024.v v24, (a0), a1
-# CHECK-INST: vssseg8e1024.v v24, (a0), a1
-# CHECK-ENCODING: [0x27,0x7c,0xb5,0xfa]
-# CHECK-ERROR: instruction requires the following: 'Zvlsseg'
-# CHECK-UNKNOWN: 27 7c b5 fa <unknown>
-
 vsuxseg8ei8.v v24, (a0), v4, v0.t
 # CHECK-INST: vsuxseg8ei8.v v24, (a0), v4, v0.t
 # CHECK-ENCODING: [0x27,0x0c,0x45,0xe4]

From e5c6c5c16923b2127a92e8a596ade50b97111b03 Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Wed, 27 Jan 2021 20:34:35 -0800
Subject: [PATCH 089/318] IntrinsicEmitter: Change IntrinsicsToAttributesMap
 from uint8_t[] to uint16_t[]

We need at least 252 UniqAttributes now, which will soon overflow.
Actually with downstream backends we can easily use up the last few values.
So bump to uint16_t.

(cherry picked from commit b7d63244226ba2c0df651622fe7fe3f5f8aba262)
---
 llvm/utils/TableGen/IntrinsicEmitter.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/utils/TableGen/IntrinsicEmitter.cpp b/llvm/utils/TableGen/IntrinsicEmitter.cpp
index 4be0d90a45d2..978d24c8300d 100644
--- a/llvm/utils/TableGen/IntrinsicEmitter.cpp
+++ b/llvm/utils/TableGen/IntrinsicEmitter.cpp
@@ -638,13 +638,13 @@ void IntrinsicEmitter::EmitAttributes(const CodeGenIntrinsicTable &Ints,
       std::max(maxArgAttrs, unsigned(intrinsic.ArgumentAttributes.size()));
     unsigned &N = UniqAttributes[&intrinsic];
     if (N) continue;
-    assert(AttrNum < 256 && "Too many unique attributes for table!");
     N = ++AttrNum;
+    assert(N < 65536 && "Too many unique attributes for table!");
   }
 
   // Emit an array of AttributeList.  Most intrinsics will have at least one
   // entry, for the function itself (index ~1), which is usually nounwind.
-  OS << "  static const uint8_t IntrinsicsToAttributesMap[] = {\n";
+  OS << "  static const uint16_t IntrinsicsToAttributesMap[] = {\n";
 
   for (unsigned i = 0, e = Ints.size(); i != e; ++i) {
     const CodeGenIntrinsic &intrinsic = Ints[i];

From fbb4aa08510ea87c8000389329b7d8cc9e348b5d Mon Sep 17 00:00:00 2001
From: Jeroen Dobbelaere <jeroen.dobbelaere@synopsys.com>
Date: Mon, 1 Feb 2021 09:23:33 +0100
Subject: [PATCH 090/318] [LoopPeel] Use llvm.experimental.noalias.scope.decl
 for duplicating noalias metadata as needed.

The reduction of a sanitizer build failure when enabling the dominance check (D95335) showed that loop peeling also needs to take care of scope duplication, just like loop unrolling (D92887).

Reviewed By: nikic

Differential Revision: https://reviews.llvm.org/D95544

(cherry picked from commit 80cdd30eb90c3509bf315f1fa1369483e2448bbd)
---
 llvm/lib/Transforms/Utils/LoopPeel.cpp        |  19 ++-
 .../peel-loop-noalias-scope-decl.ll           | 149 ++++++++++++++++++
 2 files changed, 166 insertions(+), 2 deletions(-)
 create mode 100644 llvm/test/Transforms/LoopUnroll/peel-loop-noalias-scope-decl.ll

diff --git a/llvm/lib/Transforms/Utils/LoopPeel.cpp b/llvm/lib/Transforms/Utils/LoopPeel.cpp
index cb5fee7d28e6..befacb591762 100644
--- a/llvm/lib/Transforms/Utils/LoopPeel.cpp
+++ b/llvm/lib/Transforms/Utils/LoopPeel.cpp
@@ -509,7 +509,7 @@ static void cloneLoopBlocks(
     SmallVectorImpl<std::pair<BasicBlock *, BasicBlock *>> &ExitEdges,
     SmallVectorImpl<BasicBlock *> &NewBlocks, LoopBlocksDFS &LoopBlocks,
     ValueToValueMapTy &VMap, ValueToValueMapTy &LVMap, DominatorTree *DT,
-    LoopInfo *LI) {
+    LoopInfo *LI, ArrayRef<MDNode *> LoopLocalNoAliasDeclScopes) {
   BasicBlock *Header = L->getHeader();
   BasicBlock *Latch = L->getLoopLatch();
   BasicBlock *PreHeader = L->getLoopPreheader();
@@ -545,6 +545,15 @@ static void cloneLoopBlocks(
     }
   }
 
+  {
+    // Identify what other metadata depends on the cloned version. After
+    // cloning, replace the metadata with the corrected version for both
+    // memory instructions and noalias intrinsics.
+    std::string Ext = (Twine("Peel") + Twine(IterNumber)).str();
+    cloneAndAdaptNoAliasScopes(LoopLocalNoAliasDeclScopes, NewBlocks,
+                               Header->getContext(), Ext);
+  }
+
   // Recursively create the new Loop objects for nested loops, if any,
   // to preserve LoopInfo.
   for (Loop *ChildLoop : *L) {
@@ -769,13 +778,19 @@ bool llvm::peelLoop(Loop *L, unsigned PeelCount, LoopInfo *LI,
   uint64_t ExitWeight = 0, FallThroughWeight = 0;
   initBranchWeights(Header, LatchBR, ExitWeight, FallThroughWeight);
 
+  // Identify what noalias metadata is inside the loop: if it is inside the
+  // loop, the associated metadata must be cloned for each iteration.
+  SmallVector<MDNode *, 6> LoopLocalNoAliasDeclScopes;
+  identifyNoAliasScopesToClone(L->getBlocks(), LoopLocalNoAliasDeclScopes);
+
   // For each peeled-off iteration, make a copy of the loop.
   for (unsigned Iter = 0; Iter < PeelCount; ++Iter) {
     SmallVector<BasicBlock *, 8> NewBlocks;
     ValueToValueMapTy VMap;
 
     cloneLoopBlocks(L, Iter, InsertTop, InsertBot, ExitEdges, NewBlocks,
-                    LoopBlocks, VMap, LVMap, DT, LI);
+                    LoopBlocks, VMap, LVMap, DT, LI,
+                    LoopLocalNoAliasDeclScopes);
 
     // Remap to use values from the current iteration instead of the
     // previous one.
diff --git a/llvm/test/Transforms/LoopUnroll/peel-loop-noalias-scope-decl.ll b/llvm/test/Transforms/LoopUnroll/peel-loop-noalias-scope-decl.ll
new file mode 100644
index 000000000000..3929c5d5bb57
--- /dev/null
+++ b/llvm/test/Transforms/LoopUnroll/peel-loop-noalias-scope-decl.ll
@@ -0,0 +1,149 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -S -loop-unroll -unroll-force-peel-count=1 | FileCheck %s
+; RUN: opt < %s -S -passes='loop-unroll<peeling;no-runtime>' -unroll-force-peel-count=1 | FileCheck %s
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Loop peeling must result in valid scope declartions
+
+define internal fastcc void @test01(i8* %p0, i8* %p1, i8* %p2) unnamed_addr align 2 {
+; CHECK-LABEL: @test01(
+; CHECK-NEXT:  for.body47.lr.ph:
+; CHECK-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata !0)
+; CHECK-NEXT:    br label [[FOR_BODY47_PEEL_BEGIN:%.*]]
+; CHECK:       for.body47.peel.begin:
+; CHECK-NEXT:    br label [[FOR_BODY47_PEEL:%.*]]
+; CHECK:       for.body47.peel:
+; CHECK-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata !3)
+; CHECK-NEXT:    store i8 42, i8* [[P0:%.*]], align 1, !alias.scope !3
+; CHECK-NEXT:    store i8 43, i8* [[P1:%.*]], align 1, !alias.scope !0
+; CHECK-NEXT:    store i8 44, i8* [[P2:%.*]], align 1, !alias.scope !5
+; CHECK-NEXT:    store i8 42, i8* [[P0]], align 1, !noalias !3
+; CHECK-NEXT:    store i8 43, i8* [[P1]], align 1, !noalias !0
+; CHECK-NEXT:    store i8 44, i8* [[P2]], align 1, !noalias !5
+; CHECK-NEXT:    [[CMP52_PEEL:%.*]] = icmp eq i32 0, 0
+; CHECK-NEXT:    br i1 [[CMP52_PEEL]], label [[COND_TRUE_PEEL:%.*]], label [[COND_END_PEEL:%.*]]
+; CHECK:       cond.true.peel:
+; CHECK-NEXT:    store i8 52, i8* [[P0]], align 1, !alias.scope !3
+; CHECK-NEXT:    store i8 53, i8* [[P1]], align 1, !alias.scope !0
+; CHECK-NEXT:    store i8 54, i8* [[P2]], align 1, !alias.scope !5
+; CHECK-NEXT:    store i8 52, i8* [[P0]], align 1, !noalias !3
+; CHECK-NEXT:    store i8 53, i8* [[P1]], align 1, !noalias !0
+; CHECK-NEXT:    store i8 54, i8* [[P2]], align 1, !noalias !5
+; CHECK-NEXT:    br label [[COND_END_PEEL]]
+; CHECK:       cond.end.peel:
+; CHECK-NEXT:    store i8 62, i8* [[P0]], align 1, !alias.scope !3
+; CHECK-NEXT:    store i8 63, i8* [[P1]], align 1, !alias.scope !0
+; CHECK-NEXT:    store i8 64, i8* [[P2]], align 1, !alias.scope !5
+; CHECK-NEXT:    store i8 62, i8* [[P0]], align 1, !noalias !3
+; CHECK-NEXT:    store i8 63, i8* [[P1]], align 1, !noalias !0
+; CHECK-NEXT:    store i8 64, i8* [[P2]], align 1, !noalias !5
+; CHECK-NEXT:    [[INC_PEEL:%.*]] = add nuw i32 0, 1
+; CHECK-NEXT:    [[EXITCOND_NOT_PEEL:%.*]] = icmp eq i32 [[INC_PEEL]], undef
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT_PEEL]], label [[FOR_COND_CLEANUP46:%.*]], label [[FOR_BODY47_PEEL_NEXT:%.*]]
+; CHECK:       for.body47.peel.next:
+; CHECK-NEXT:    br label [[FOR_BODY47_PEEL_NEXT1:%.*]]
+; CHECK:       for.body47.peel.next1:
+; CHECK-NEXT:    br label [[FOR_BODY47_LR_PH_PEEL_NEWPH:%.*]]
+; CHECK:       for.body47.lr.ph.peel.newph:
+; CHECK-NEXT:    br label [[FOR_BODY47:%.*]]
+; CHECK:       for.cond.cleanup46.loopexit:
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP46]]
+; CHECK:       for.cond.cleanup46:
+; CHECK-NEXT:    ret void
+; CHECK:       for.body47:
+; CHECK-NEXT:    [[J_02:%.*]] = phi i32 [ [[INC_PEEL]], [[FOR_BODY47_LR_PH_PEEL_NEWPH]] ], [ [[INC:%.*]], [[COND_END:%.*]] ]
+; CHECK-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata !6)
+; CHECK-NEXT:    store i8 42, i8* [[P0]], align 1, !alias.scope !6
+; CHECK-NEXT:    store i8 43, i8* [[P1]], align 1, !alias.scope !0
+; CHECK-NEXT:    store i8 44, i8* [[P2]], align 1, !alias.scope !8
+; CHECK-NEXT:    store i8 42, i8* [[P0]], align 1, !noalias !6
+; CHECK-NEXT:    store i8 43, i8* [[P1]], align 1, !noalias !0
+; CHECK-NEXT:    store i8 44, i8* [[P2]], align 1, !noalias !8
+; CHECK-NEXT:    br i1 false, label [[COND_TRUE:%.*]], label [[COND_END]]
+; CHECK:       cond.true:
+; CHECK-NEXT:    store i8 52, i8* [[P0]], align 1, !alias.scope !6
+; CHECK-NEXT:    store i8 53, i8* [[P1]], align 1, !alias.scope !0
+; CHECK-NEXT:    store i8 54, i8* [[P2]], align 1, !alias.scope !8
+; CHECK-NEXT:    store i8 52, i8* [[P0]], align 1, !noalias !6
+; CHECK-NEXT:    store i8 53, i8* [[P1]], align 1, !noalias !0
+; CHECK-NEXT:    store i8 54, i8* [[P2]], align 1, !noalias !8
+; CHECK-NEXT:    br label [[COND_END]]
+; CHECK:       cond.end:
+; CHECK-NEXT:    store i8 62, i8* [[P0]], align 1, !alias.scope !6
+; CHECK-NEXT:    store i8 63, i8* [[P1]], align 1, !alias.scope !0
+; CHECK-NEXT:    store i8 64, i8* [[P2]], align 1, !alias.scope !8
+; CHECK-NEXT:    store i8 62, i8* [[P0]], align 1, !noalias !6
+; CHECK-NEXT:    store i8 63, i8* [[P1]], align 1, !noalias !0
+; CHECK-NEXT:    store i8 64, i8* [[P2]], align 1, !noalias !8
+; CHECK-NEXT:    [[INC]] = add nuw i32 [[J_02]], 1
+; CHECK-NEXT:    br i1 undef, label [[FOR_COND_CLEANUP46_LOOPEXIT:%.*]], label [[FOR_BODY47]], [[LOOP9:!llvm.loop !.*]]
+;
+for.body47.lr.ph:
+  call void @llvm.experimental.noalias.scope.decl(metadata !5)
+  br label %for.body47
+
+for.cond.cleanup46:                               ; preds = %cond.end
+  ret void
+
+for.body47:                                       ; preds = %cond.end, %for.body47.lr.ph
+  %j.02 = phi i32 [ 0, %for.body47.lr.ph ], [ %inc, %cond.end ]
+  call void @llvm.experimental.noalias.scope.decl(metadata !0)
+  store i8 42, i8* %p0, !alias.scope !0
+  store i8 43, i8* %p1, !alias.scope !5
+  store i8 44, i8* %p2, !alias.scope !7
+  store i8 42, i8* %p0, !noalias !0
+  store i8 43, i8* %p1, !noalias !5
+  store i8 44, i8* %p2, !noalias !7
+  %cmp52 = icmp eq i32 %j.02, 0
+  br i1 %cmp52, label %cond.true, label %cond.end
+
+cond.true:                                        ; preds = %for.body47
+  store i8 52, i8* %p0, !alias.scope !0
+  store i8 53, i8* %p1, !alias.scope !5
+  store i8 54, i8* %p2, !alias.scope !7
+  store i8 52, i8* %p0, !noalias !0
+  store i8 53, i8* %p1, !noalias !5
+  store i8 54, i8* %p2, !noalias !7
+  br label %cond.end
+
+cond.end:                                         ; preds = %cond.true, %for.body47
+  store i8 62, i8* %p0, !alias.scope !0
+  store i8 63, i8* %p1, !alias.scope !5
+  store i8 64, i8* %p2, !alias.scope !7
+  store i8 62, i8* %p0, !noalias !0
+  store i8 63, i8* %p1, !noalias !5
+  store i8 64, i8* %p2, !noalias !7
+  %inc = add nuw i32 %j.02, 1
+  %exitcond.not = icmp eq i32 %inc, undef
+  br i1 %exitcond.not, label %for.cond.cleanup46, label %for.body47, !llvm.loop !3
+}
+
+; Function Attrs: inaccessiblememonly nofree nosync nounwind willreturn
+declare void @llvm.experimental.noalias.scope.decl(metadata) #0
+
+attributes #0 = { inaccessiblememonly nofree nosync nounwind willreturn }
+
+!0 = !{!1}
+!1 = distinct !{!1, !2, !"foo: %inner.result"}
+!2 = distinct !{!2, !"foo"}
+!3 = distinct !{!3, !4}
+!4 = !{!"llvm.loop.mustprogress"}
+!5 = !{!6}
+!6 = distinct !{!6, !2, !"foo: %outer.result"}
+!7 = !{!1, !6}
+
+; CHECK: !0 = !{!1}
+; CHECK: !1 = distinct !{!1, !2, !"foo: %outer.result"}
+; CHECK: !2 = distinct !{!2, !"foo"}
+; CHECK: !3 = !{!4}
+; CHECK: !4 = distinct !{!4, !2, !"foo: %inner.result:Peel0"}
+; CHECK: !5 = !{!4, !1}
+; CHECK: !6 = !{!7}
+; CHECK: !7 = distinct !{!7, !2, !"foo: %inner.result"}
+; CHECK: !8 = !{!7, !1}
+; CHECK: !9 = distinct !{!9, !10, !11, !12}
+; CHECK: !10 = !{!"llvm.loop.mustprogress"}
+; CHECK: !11 = !{!"llvm.loop.peeled.count", i32 1}
+; CHECK: !12 = !{!"llvm.loop.unroll.disable"}

From 66b319327bce68377c700b2b57a109498c5500bd Mon Sep 17 00:00:00 2001
From: Jeroen Dobbelaere <jeroen.dobbelaere@synopsys.com>
Date: Tue, 2 Feb 2021 17:55:06 +0100
Subject: [PATCH 091/318] [InlineFunction] Only update noalias scopes once for
 an instruction.

Inlining sometimes maps different instructions to be inlined onto the same instruction.

We must ensure to only remap the noalias scopes once. Otherwise the scope might disappear (at best).
This patch ensures that we only replace scopes for which the mapping is known.

This approach is preferred over tracking which instructions we already handled in a SmallPtrSet,
as that one will need more memory.

Reviewed By: nikic

Differential Revision: https://reviews.llvm.org/D95862

(cherry picked from commit 50c523a9d4402c69d59c0b2ecb383a763d16cde9)
---
 llvm/lib/Transforms/Utils/InlineFunction.cpp | 12 +++++--
 llvm/test/Transforms/Inline/noalias3.ll      | 35 ++++++++++++++++++++
 2 files changed, 44 insertions(+), 3 deletions(-)
 create mode 100644 llvm/test/Transforms/Inline/noalias3.ll

diff --git a/llvm/lib/Transforms/Utils/InlineFunction.cpp b/llvm/lib/Transforms/Utils/InlineFunction.cpp
index 0ac8fa537f4e..3026342cc4a6 100644
--- a/llvm/lib/Transforms/Utils/InlineFunction.cpp
+++ b/llvm/lib/Transforms/Utils/InlineFunction.cpp
@@ -921,14 +921,20 @@ void ScopedAliasMetadataDeepCloner::remap(ValueToValueMapTy &VMap) {
     if (!I)
       continue;
 
+    // Only update scopes when we find them in the map. If they are not, it is
+    // because we already handled that instruction before. This is faster than
+    // tracking which instructions we already updated.
     if (MDNode *M = I->getMetadata(LLVMContext::MD_alias_scope))
-      I->setMetadata(LLVMContext::MD_alias_scope, MDMap[M]);
+      if (MDNode *MNew = MDMap.lookup(M))
+        I->setMetadata(LLVMContext::MD_alias_scope, MNew);
 
     if (MDNode *M = I->getMetadata(LLVMContext::MD_noalias))
-      I->setMetadata(LLVMContext::MD_noalias, MDMap[M]);
+      if (MDNode *MNew = MDMap.lookup(M))
+        I->setMetadata(LLVMContext::MD_noalias, MNew);
 
     if (auto *Decl = dyn_cast<NoAliasScopeDeclInst>(I))
-      Decl->setScopeList(MDMap[Decl->getScopeList()]);
+      if (MDNode *MNew = MDMap.lookup(Decl->getScopeList()))
+        Decl->setScopeList(MNew);
   }
 }
 
diff --git a/llvm/test/Transforms/Inline/noalias3.ll b/llvm/test/Transforms/Inline/noalias3.ll
new file mode 100644
index 000000000000..b94cbd6ab72f
--- /dev/null
+++ b/llvm/test/Transforms/Inline/noalias3.ll
@@ -0,0 +1,35 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature
+; RUN: opt -inline -S < %s | FileCheck %s
+
+define void @caller(i8* %ptr) {
+; CHECK-LABEL: define {{[^@]+}}@caller
+; CHECK-SAME: (i8* [[PTR:%.*]]) {
+; CHECK-NEXT:    [[I_I:%.*]] = load i8, i8* [[PTR]], align 1, !alias.scope !0
+; CHECK-NEXT:    ret void
+;
+  call void @callee(i8* %ptr)
+  ret void
+}
+
+define void @callee(i8* %ptr) {
+; CHECK-LABEL: define {{[^@]+}}@callee
+; CHECK-SAME: (i8* [[PTR:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[I:%.*]] = load i8, i8* [[PTR]], align 1, !alias.scope !3
+; CHECK-NEXT:    br label [[DUMMY:%.*]]
+; CHECK:       dummy:
+; CHECK-NEXT:    [[I_COPY:%.*]] = phi i8 [ [[I]], [[ENTRY:%.*]] ]
+; CHECK-NEXT:    ret void
+;
+entry:
+  %i = load i8, i8* %ptr, !alias.scope !0
+  br label %dummy
+
+dummy:
+  %i.copy = phi i8 [ %i, %entry ]
+  ret void
+}
+
+!0 = !{!1}
+!1 = distinct !{!1, !2}
+!2 = distinct !{!2}

From 54b68d56dc957457b54c700af8e24a2c86539cc3 Mon Sep 17 00:00:00 2001
From: Nathan James <n.james93@hotmail.co.uk>
Date: Wed, 3 Feb 2021 05:11:28 +0000
Subject: [PATCH 092/318] [clang-tidy] Fix crash in
 readability-identifier-naming check

`isParamInMainLikeFunction` didn't check if the function had an identifer name before calling getName() which could lead to an assert.

(cherry picked from commit c97592c5df09850404a9ddbfb614c7df271d1dfe)
---
 .../clang-tidy/readability/IdentifierNamingCheck.cpp          | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/clang-tools-extra/clang-tidy/readability/IdentifierNamingCheck.cpp b/clang-tools-extra/clang-tidy/readability/IdentifierNamingCheck.cpp
index d33040a00e15..867b074ca6db 100644
--- a/clang-tools-extra/clang-tidy/readability/IdentifierNamingCheck.cpp
+++ b/clang-tools-extra/clang-tidy/readability/IdentifierNamingCheck.cpp
@@ -352,6 +352,10 @@ static bool isParamInMainLikeFunction(const ParmVarDecl &ParmDecl,
     return false;
   if (FDecl->getAccess() != AS_public && FDecl->getAccess() != AS_none)
     return false;
+  // If the function doesn't have a name thats an identifier, can occur of the
+  // function is an operator overload, bail out early.
+  if (!FDecl->getDeclName().isIdentifier())
+    return false;
   enum MainType { None, Main, WMain };
   auto IsCharPtrPtr = [](QualType QType) -> MainType {
     if (QType.isNull())

From 72db3a9104a4f1b80fc26c597423ee7c66bd350d Mon Sep 17 00:00:00 2001
From: Tom Stellard <tstellar@redhat.com>
Date: Mon, 15 Feb 2021 11:40:39 -0800
Subject: [PATCH 093/318] workflows: Increase the fetch-depth for
 actions/checkout steps

This avoids failures when many commits are pushed close together.
---
 .github/workflows/clang-tests.yml        | 2 +-
 .github/workflows/libclang-abi-tests.yml | 2 +-
 .github/workflows/libclc-tests.yml       | 2 +-
 .github/workflows/lld-tests.yml          | 2 +-
 .github/workflows/lldb-tests.yml         | 2 +-
 .github/workflows/llvm-tests.yml         | 4 ++--
 6 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/clang-tests.yml b/.github/workflows/clang-tests.yml
index af0b5eabeeda..d37637e4b927 100644
--- a/.github/workflows/clang-tests.yml
+++ b/.github/workflows/clang-tests.yml
@@ -35,7 +35,7 @@ jobs:
       uses: llvm/actions/install-ninja@main
     - uses: actions/checkout@v1
       with:
-        fetch-depth: 1
+        fetch-depth: 250
     - name: Test clang
       uses: llvm/actions/build-test-llvm-project@main
       with:
diff --git a/.github/workflows/libclang-abi-tests.yml b/.github/workflows/libclang-abi-tests.yml
index 320a88c1d407..ed54c4a1e54d 100644
--- a/.github/workflows/libclang-abi-tests.yml
+++ b/.github/workflows/libclang-abi-tests.yml
@@ -28,7 +28,7 @@ jobs:
       - name: Checkout source
         uses: actions/checkout@v1
         with:
-          fetch-depth: 1
+          fetch-depth: 250
 
       - name: Get LLVM version
         id: version
diff --git a/.github/workflows/libclc-tests.yml b/.github/workflows/libclc-tests.yml
index 188eecfc3b89..6be01c839f44 100644
--- a/.github/workflows/libclc-tests.yml
+++ b/.github/workflows/libclc-tests.yml
@@ -38,7 +38,7 @@ jobs:
       uses: llvm/actions/install-ninja@main
     - uses: actions/checkout@v1
       with:
-        fetch-depth: 1
+        fetch-depth: 250
     - name: Build clang
       uses: llvm/actions/build-test-llvm-project@main
       with:
diff --git a/.github/workflows/lld-tests.yml b/.github/workflows/lld-tests.yml
index bdf0c2fcd886..1e5540d2fc4d 100644
--- a/.github/workflows/lld-tests.yml
+++ b/.github/workflows/lld-tests.yml
@@ -35,7 +35,7 @@ jobs:
       uses: llvm/actions/install-ninja@main
     - uses: actions/checkout@v1
       with:
-        fetch-depth: 1
+        fetch-depth: 250
     - name: Test lld
       uses: llvm/actions/build-test-llvm-project@main
       with:
diff --git a/.github/workflows/lldb-tests.yml b/.github/workflows/lldb-tests.yml
index 68aec6036995..1658c0e001a0 100644
--- a/.github/workflows/lldb-tests.yml
+++ b/.github/workflows/lldb-tests.yml
@@ -40,7 +40,7 @@ jobs:
       uses: llvm/actions/install-ninja@main
     - uses: actions/checkout@v1
       with:
-        fetch-depth: 1
+        fetch-depth: 250
     - name: Build lldb
       uses: llvm/actions/build-test-llvm-project@main
       with:
diff --git a/.github/workflows/llvm-tests.yml b/.github/workflows/llvm-tests.yml
index 1fcd67a10078..9017a014be02 100644
--- a/.github/workflows/llvm-tests.yml
+++ b/.github/workflows/llvm-tests.yml
@@ -33,7 +33,7 @@ jobs:
       uses: llvm/actions/install-ninja@main
     - uses: actions/checkout@v1
       with:
-        fetch-depth: 1
+        fetch-depth: 250
     - name: Test llvm
       uses: llvm/actions/build-test-llvm-project@main
       with:
@@ -52,7 +52,7 @@ jobs:
       - name: Checkout source
         uses: actions/checkout@v1
         with:
-          fetch-depth: 1
+          fetch-depth: 250
 
       - name: Get LLVM version
         id: version

From 3fe28ce26a3302c799ad731b9b0a8408bf553aef Mon Sep 17 00:00:00 2001
From: Johannes Doerfert <johannes@jdoerfert.de>
Date: Sat, 6 Feb 2021 11:42:02 -0600
Subject: [PATCH 094/318] [AssumptionCache] Do not track llvm.assume calls
 (PR49043)

This fixes PR49043 by invalidating the handle on RAUW. This will work
fine assuming all existing RAUW users add the new assumption to the
cache. That means, if a new llvm.assume call replaces an old one, you
need to add the new one now as a RAUW is not enough anymore.

Reviewed By: nikic

Differential Revision: https://reviews.llvm.org/D96208

(cherry picked from commit 378f4e5ec26c3e0d2119c1112ec645b369eed2de)
---
 llvm/include/llvm/Analysis/AssumptionCache.h |  2 +-
 llvm/test/Transforms/GVNSink/assumption.ll   | 32 ++++++++++++++++++++
 2 files changed, 33 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/Transforms/GVNSink/assumption.ll

diff --git a/llvm/include/llvm/Analysis/AssumptionCache.h b/llvm/include/llvm/Analysis/AssumptionCache.h
index 0ef63dc68e1c..c4602d3449c0 100644
--- a/llvm/include/llvm/Analysis/AssumptionCache.h
+++ b/llvm/include/llvm/Analysis/AssumptionCache.h
@@ -45,7 +45,7 @@ class AssumptionCache {
   enum : unsigned { ExprResultIdx = std::numeric_limits<unsigned>::max() };
 
   struct ResultElem {
-    WeakTrackingVH Assume;
+    WeakVH Assume;
 
     /// contains either ExprResultIdx or the index of the operand bundle
     /// containing the knowledge.
diff --git a/llvm/test/Transforms/GVNSink/assumption.ll b/llvm/test/Transforms/GVNSink/assumption.ll
new file mode 100644
index 000000000000..6b3d832435dc
--- /dev/null
+++ b/llvm/test/Transforms/GVNSink/assumption.ll
@@ -0,0 +1,32 @@
+; RUN: opt < %s -S -passes="print<assumptions>,gvn-sink,loop-unroll" -unroll-count=3 | FileCheck %s
+;
+; This crashed because the cached assumption was replaced and the replacement
+; was then in the cache twice.
+;
+; PR49043
+
+@g = external global i32
+
+define void @main() {
+bb:
+  %i1.i = load volatile i32, i32* @g
+  %i32.i = icmp eq i32 %i1.i, 0
+  call void @llvm.assume(i1 %i32.i) #3
+  br label %bb4.i
+
+bb4.i:                                            ; preds = %bb4.i, %bb
+  %i.i = load volatile i32, i32* @g
+  %i3.i = icmp eq i32 %i.i, 0
+  call void @llvm.assume(i1 %i3.i) #3
+  br label %bb4.i
+
+func_1.exit:                                      ; No predecessors!
+  unreachable
+}
+
+declare void @llvm.assume(i1)
+
+; CHECK:  call void @llvm.assume(
+; CHECK:  call void @llvm.assume(
+; CHECK:  call void @llvm.assume(
+

From 343ba9730b7d2166b5ce61f770413782573ea224 Mon Sep 17 00:00:00 2001
From: Johannes Doerfert <johannes@jdoerfert.de>
Date: Tue, 2 Feb 2021 17:24:53 -0600
Subject: [PATCH 095/318] [OpenMP][NFC] Pre-commit test changes regarding
 PR48933

This will highlight the effective changes in subsequent commits.

Reviewed By: ABataev

Differential Revision: https://reviews.llvm.org/D95903

(cherry picked from commit 3b2f19d0bc2803697526191a8a607efa0b38f7e4)
---
 .../nvptx_unsupported_type_messages.cpp       | 128 +++++++++++++++++-
 1 file changed, 126 insertions(+), 2 deletions(-)

diff --git a/clang/test/OpenMP/nvptx_unsupported_type_messages.cpp b/clang/test/OpenMP/nvptx_unsupported_type_messages.cpp
index 814a4756c01b..0601728caefe 100644
--- a/clang/test/OpenMP/nvptx_unsupported_type_messages.cpp
+++ b/clang/test/OpenMP/nvptx_unsupported_type_messages.cpp
@@ -77,11 +77,135 @@ T1 bar1() {
 void baz1() {
   T1 t = bar1();
 }
+
+// TODO: We should not emit an error for dead functions we do not emit.
+inline void dead_inline_declare_target() {
+// expected-note@+1 {{'b' defined here}}
+  long double *a, b = 0;
+// expected-error@+1 {{'b' requires 128 bit size 'long double' type support, but device 'nvptx64-unknown-unknown' does not support it}}
+  a = &b;
+}
+// TODO: We should not emit an error for dead functions we do not emit.
+static void dead_static_declare_target() {
+// expected-note@+1 {{'b' defined here}}
+  long double *a, b = 0;
+// expected-error@+1 {{'b' requires 128 bit size 'long double' type support, but device 'nvptx64-unknown-unknown' does not support it}}
+  a = &b;
+}
+template<bool>
+void dead_template_declare_target() {
+  long double *a, b = 0;
+  a = &b;
+}
+
+// TODO: We should diagnose the return type and argument type here.
+long double ld_return1a() { return 0; }
+void ld_arg1a(long double ld) {}
+
+// TODO: We should diagnose the return type and argument type here.
+typedef long double ld_ty;
+ld_ty ld_return1b() { return 0; }
+void ld_arg1b(ld_ty ld) {}
+
+static long double ld_return1c() { return 0; }
+static void ld_arg1c(long double ld) {}
+
+inline long double ld_return1d() { return 0; }
+inline void ld_arg1d(long double ld) {}
+
+// expected-note@+1 {{'ld_return1e' defined here}}
+static long double ld_return1e() { return 0; }
+// expected-note@+1 {{'ld_arg1e' defined here}}
+static void ld_arg1e(long double ld) {}
+
+// expected-note@+1 {{'ld_return1f' defined here}}
+inline long double ld_return1f() { return 0; }
+// expected-note@+1 {{'ld_arg1f' defined here}}
+inline void ld_arg1f(long double ld) {}
+
+inline void ld_use1() {
+// expected-note@+1 {{'ld' defined here}}
+  long double ld = 0;
+// TODO: We should not diagnose this as the function is dead.
+// expected-error@+1 {{'ld' requires 128 bit size 'long double' type support, but device 'nvptx64-unknown-unknown' does not support it}}
+  ld += 1;
+}
+static void ld_use2() {
+// expected-note@+1 {{'ld' defined here}}
+  long double ld = 0;
+// TODO: We should not diagnose this as the function is dead.
+// expected-error@+1 {{'ld' requires 128 bit size 'long double' type support, but device 'nvptx64-unknown-unknown' does not support it}}
+  ld += 1;
+}
+
+inline void ld_use3() {
+// expected-note@+1 {{'ld' defined here}}
+  long double ld = 0;
+// expected-error@+1 {{'ld' requires 128 bit size 'long double' type support, but device 'nvptx64-unknown-unknown' does not support it}}
+  ld += 1;
+}
+static void ld_use4() {
+// expected-note@+1 {{'ld' defined here}}
+  long double ld = 0;
+// expected-error@+1 {{'ld' requires 128 bit size 'long double' type support, but device 'nvptx64-unknown-unknown' does not support it}}
+  ld += 1;
+}
+
+void external() {
+// expected-error@+1 {{'ld_return1e' requires 128 bit size 'long double' type support, but device 'nvptx64-unknown-unknown' does not support it}}
+  void *p1 = reinterpret_cast<void*>(&ld_return1e);
+// expected-error@+1 {{'ld_arg1e' requires 128 bit size 'long double' type support, but device 'nvptx64-unknown-unknown' does not support it}}
+  void *p2 = reinterpret_cast<void*>(&ld_arg1e);
+// expected-error@+1 {{'ld_return1f' requires 128 bit size 'long double' type support, but device 'nvptx64-unknown-unknown' does not support it}}
+  void *p3 = reinterpret_cast<void*>(&ld_return1f);
+// expected-error@+1 {{'ld_arg1f' requires 128 bit size 'long double' type support, but device 'nvptx64-unknown-unknown' does not support it}}
+  void *p4 = reinterpret_cast<void*>(&ld_arg1f);
+  void *p5 = reinterpret_cast<void*>(&ld_use3);
+  void *p6 = reinterpret_cast<void*>(&ld_use4);
+}
+
+#ifndef _ARCH_PPC
+// TODO: We should diagnose the return type and argument type here.
+__float128 ld_return2a() { return 0; }
+void ld_arg2a(__float128 ld) {}
+
+// TODO: We should diagnose the return type and argument type here.
+typedef __float128 fp128_ty;
+fp128_ty ld_return2b() { return 0; }
+void ld_arg2b(fp128_ty ld) {}
+#endif
+
 #pragma omp end declare target
 
+// TODO: There should not be an error here, dead_inline is never emitted.
+// expected-note@+1 3{{'f' defined here}}
+inline long double dead_inline(long double f) {
+#pragma omp target map(f)
+// TODO: We should not emit the same error message 3 times, here and elsewhere in this file.
+  // expected-error@+1 3{{'f' requires 128 bit size 'long double' type support, but device 'nvptx64-unknown-unknown' does not support it}}
+  f = 1;
+  return f;
+}
+
+// TODO: There should not be an error here, dead_static is never emitted.
+// expected-note@+1 3{{'f' defined here}}
+static long double dead_static(long double f) {
+#pragma omp target map(f)
+  // expected-error@+1 3{{'f' requires 128 bit size 'long double' type support, but device 'nvptx64-unknown-unknown' does not support it}}
+  f = 1;
+  return f;
+}
+
+template<typename T>
+long double dead_template(long double f) {
+#pragma omp target map(f)
+  f = 1;
+  return f;
+}
+
 #ifndef _ARCH_PPC
 // expected-note@+1 3{{'f' defined here}}
-__float128 foo1(__float128 f) {
+__float128 foo2(__float128 f) {
 #pragma omp target map(f)
   // expected-error@+1 3{{'f' requires 128 bit size '__float128' type support, but device 'nvptx64-unknown-unknown' does not support it}}
   f = 1;
@@ -89,7 +213,7 @@ __float128 foo1(__float128 f) {
 }
 #else
 // expected-note@+1 3{{'f' defined here}}
-long double foo1(long double f) {
+long double foo3(long double f) {
 #pragma omp target map(f)
   // expected-error@+1 3{{'f' requires 128 bit size 'long double' type support, but device 'nvptx64-unknown-unknown' does not support it}}
   f = 1;

From 3b9ea2dc8eeb3c3893213f57d532b32ef3619859 Mon Sep 17 00:00:00 2001
From: Johannes Doerfert <johannes@jdoerfert.de>
Date: Fri, 29 Jan 2021 02:42:20 -0600
Subject: [PATCH 096/318] [OpenMP] Attribute target diagnostics properly

Type errors in function declarations were not (always) diagnosed prior
to this patch. Furthermore, certain remarks did not get associated
properly which caused them to be emitted multiple times.

Reviewed By: JonChesterfield

Differential Revision: https://reviews.llvm.org/D95912

(cherry picked from commit f9286b434b764b366f1aad9249c04e7741ed5518)
---
 clang/include/clang/Sema/Sema.h               | 16 +++--
 clang/lib/Sema/Sema.cpp                       | 36 ++++++----
 clang/lib/Sema/SemaDecl.cpp                   |  3 +
 clang/lib/Sema/SemaExpr.cpp                   |  2 +-
 clang/lib/Sema/SemaOpenMP.cpp                 | 13 ++--
 .../nvptx_unsupported_type_messages.cpp       | 65 +++++++++++++------
 6 files changed, 87 insertions(+), 48 deletions(-)

diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h
index 7f7c84eb1b1d..42814f6ba8f6 100644
--- a/clang/include/clang/Sema/Sema.h
+++ b/clang/include/clang/Sema/Sema.h
@@ -11948,8 +11948,8 @@ class Sema final {
   ///  if (diagIfOpenMPDeviceCode(Loc, diag::err_vla_unsupported))
   ///    return ExprError();
   ///  // Otherwise, continue parsing as normal.
-  SemaDiagnosticBuilder diagIfOpenMPDeviceCode(SourceLocation Loc,
-                                               unsigned DiagID);
+  SemaDiagnosticBuilder
+  diagIfOpenMPDeviceCode(SourceLocation Loc, unsigned DiagID, FunctionDecl *FD);
 
   /// Creates a SemaDiagnosticBuilder that emits the diagnostic if the current
   /// context is "used as host code".
@@ -11965,17 +11965,19 @@ class Sema final {
   ///    return ExprError();
   ///  // Otherwise, continue parsing as normal.
   SemaDiagnosticBuilder diagIfOpenMPHostCode(SourceLocation Loc,
-                                             unsigned DiagID);
+                                             unsigned DiagID, FunctionDecl *FD);
 
-  SemaDiagnosticBuilder targetDiag(SourceLocation Loc, unsigned DiagID);
+  SemaDiagnosticBuilder targetDiag(SourceLocation Loc, unsigned DiagID,
+                                   FunctionDecl *FD = nullptr);
   SemaDiagnosticBuilder targetDiag(SourceLocation Loc,
-                                   const PartialDiagnostic &PD) {
-    return targetDiag(Loc, PD.getDiagID()) << PD;
+                                   const PartialDiagnostic &PD,
+                                   FunctionDecl *FD = nullptr) {
+    return targetDiag(Loc, PD.getDiagID(), FD) << PD;
   }
 
   /// Check if the expression is allowed to be used in expressions for the
   /// offloading devices.
-  void checkDeviceDecl(const ValueDecl *D, SourceLocation Loc);
+  void checkDeviceDecl(ValueDecl *D, SourceLocation Loc);
 
   enum CUDAFunctionTarget {
     CFT_Device,
diff --git a/clang/lib/Sema/Sema.cpp b/clang/lib/Sema/Sema.cpp
index cb5a84a31235..450f9c020f7f 100644
--- a/clang/lib/Sema/Sema.cpp
+++ b/clang/lib/Sema/Sema.cpp
@@ -14,6 +14,7 @@
 #include "UsedDeclVisitor.h"
 #include "clang/AST/ASTContext.h"
 #include "clang/AST/ASTDiagnostic.h"
+#include "clang/AST/Decl.h"
 #include "clang/AST/DeclCXX.h"
 #include "clang/AST/DeclFriend.h"
 #include "clang/AST/DeclObjC.h"
@@ -1740,11 +1741,12 @@ Sema::SemaDiagnosticBuilder::~SemaDiagnosticBuilder() {
   }
 }
 
-Sema::SemaDiagnosticBuilder Sema::targetDiag(SourceLocation Loc,
-                                             unsigned DiagID) {
+Sema::SemaDiagnosticBuilder
+Sema::targetDiag(SourceLocation Loc, unsigned DiagID, FunctionDecl *FD) {
+  FD = FD ? FD : getCurFunctionDecl();
   if (LangOpts.OpenMP)
-    return LangOpts.OpenMPIsDevice ? diagIfOpenMPDeviceCode(Loc, DiagID)
-                                   : diagIfOpenMPHostCode(Loc, DiagID);
+    return LangOpts.OpenMPIsDevice ? diagIfOpenMPDeviceCode(Loc, DiagID, FD)
+                                   : diagIfOpenMPHostCode(Loc, DiagID, FD);
   if (getLangOpts().CUDA)
     return getLangOpts().CUDAIsDevice ? CUDADiagIfDeviceCode(Loc, DiagID)
                                       : CUDADiagIfHostCode(Loc, DiagID);
@@ -1753,7 +1755,7 @@ Sema::SemaDiagnosticBuilder Sema::targetDiag(SourceLocation Loc,
     return SYCLDiagIfDeviceCode(Loc, DiagID);
 
   return SemaDiagnosticBuilder(SemaDiagnosticBuilder::K_Immediate, Loc, DiagID,
-                               getCurFunctionDecl(), *this);
+                               FD, *this);
 }
 
 Sema::SemaDiagnosticBuilder Sema::Diag(SourceLocation Loc, unsigned DiagID,
@@ -1772,15 +1774,14 @@ Sema::SemaDiagnosticBuilder Sema::Diag(SourceLocation Loc, unsigned DiagID,
                                  DiagID, getCurFunctionDecl(), *this);
   }
 
-  SemaDiagnosticBuilder DB =
-      getLangOpts().CUDAIsDevice
-          ? CUDADiagIfDeviceCode(Loc, DiagID)
-          : CUDADiagIfHostCode(Loc, DiagID);
+  SemaDiagnosticBuilder DB = getLangOpts().CUDAIsDevice
+                                 ? CUDADiagIfDeviceCode(Loc, DiagID)
+                                 : CUDADiagIfHostCode(Loc, DiagID);
   SetIsLastErrorImmediate(DB.isImmediate());
   return DB;
 }
 
-void Sema::checkDeviceDecl(const ValueDecl *D, SourceLocation Loc) {
+void Sema::checkDeviceDecl(ValueDecl *D, SourceLocation Loc) {
   if (isUnevaluatedContext())
     return;
 
@@ -1798,13 +1799,17 @@ void Sema::checkDeviceDecl(const ValueDecl *D, SourceLocation Loc) {
         return;
   }
 
+  // Try to associate errors with the lexical context, if that is a function, or
+  // the value declaration otherwise.
+  FunctionDecl *FD =
+      isa<FunctionDecl>(C) ? cast<FunctionDecl>(C) : dyn_cast<FunctionDecl>(D);
   auto CheckType = [&](QualType Ty) {
     if (Ty->isDependentType())
       return;
 
     if (Ty->isExtIntType()) {
       if (!Context.getTargetInfo().hasExtIntType()) {
-        targetDiag(Loc, diag::err_device_unsupported_type)
+        targetDiag(Loc, diag::err_device_unsupported_type, FD)
             << D << false /*show bit size*/ << 0 /*bitsize*/
             << Ty << Context.getTargetInfo().getTriple().str();
       }
@@ -1817,11 +1822,12 @@ void Sema::checkDeviceDecl(const ValueDecl *D, SourceLocation Loc) {
          !Context.getTargetInfo().hasFloat128Type()) ||
         (Ty->isIntegerType() && Context.getTypeSize(Ty) == 128 &&
          !Context.getTargetInfo().hasInt128Type())) {
-      targetDiag(Loc, diag::err_device_unsupported_type)
+      if (targetDiag(Loc, diag::err_device_unsupported_type, FD)
           << D << true /*show bit size*/
           << static_cast<unsigned>(Context.getTypeSize(Ty)) << Ty
-          << Context.getTargetInfo().getTriple().str();
-      targetDiag(D->getLocation(), diag::note_defined_here) << D;
+          << Context.getTargetInfo().getTriple().str())
+        D->setInvalidDecl();
+      targetDiag(D->getLocation(), diag::note_defined_here, FD) << D;
     }
   };
 
@@ -1833,6 +1839,8 @@ void Sema::checkDeviceDecl(const ValueDecl *D, SourceLocation Loc) {
       CheckType(ParamTy);
     CheckType(FPTy->getReturnType());
   }
+  if (const auto *FNPTy = dyn_cast<FunctionNoProtoType>(Ty))
+    CheckType(FNPTy->getReturnType());
 }
 
 /// Looks through the macro-expansion chain for the given
diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp
index 3ee0c43097d7..6457c6d024cf 100644
--- a/clang/lib/Sema/SemaDecl.cpp
+++ b/clang/lib/Sema/SemaDecl.cpp
@@ -9420,6 +9420,9 @@ Sema::ActOnFunctionDeclarator(Scope *S, Declarator &D, DeclContext *DC,
     }
   }
 
+  if (LangOpts.SYCLIsDevice || (LangOpts.OpenMP && LangOpts.OpenMPIsDevice))
+    checkDeviceDecl(NewFD, D.getBeginLoc());
+
   if (!getLangOpts().CPlusPlus) {
     // Perform semantic checking on the function declaration.
     if (!NewFD->isInvalidDecl() && NewFD->isMain())
diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp
index 45616dadcbee..ae8508d6c601 100644
--- a/clang/lib/Sema/SemaExpr.cpp
+++ b/clang/lib/Sema/SemaExpr.cpp
@@ -373,7 +373,7 @@ bool Sema::DiagnoseUseOfDecl(NamedDecl *D, ArrayRef<SourceLocation> Locs,
   }
 
   if (LangOpts.SYCLIsDevice || (LangOpts.OpenMP && LangOpts.OpenMPIsDevice)) {
-    if (const auto *VD = dyn_cast<ValueDecl>(D))
+    if (auto *VD = dyn_cast<ValueDecl>(D))
       checkDeviceDecl(VD, Loc);
 
     if (!Context.getTargetInfo().isTLSSupported())
diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp
index 78707484f588..596aa4b7b5c1 100644
--- a/clang/lib/Sema/SemaOpenMP.cpp
+++ b/clang/lib/Sema/SemaOpenMP.cpp
@@ -1898,11 +1898,11 @@ enum class FunctionEmissionStatus {
 } // anonymous namespace
 
 Sema::SemaDiagnosticBuilder Sema::diagIfOpenMPDeviceCode(SourceLocation Loc,
-                                                         unsigned DiagID) {
+                                                         unsigned DiagID,
+                                                         FunctionDecl *FD) {
   assert(LangOpts.OpenMP && LangOpts.OpenMPIsDevice &&
          "Expected OpenMP device compilation.");
 
-  FunctionDecl *FD = getCurFunctionDecl();
   SemaDiagnosticBuilder::Kind Kind = SemaDiagnosticBuilder::K_Nop;
   if (FD) {
     FunctionEmissionStatus FES = getEmissionStatus(FD);
@@ -1925,14 +1925,15 @@ Sema::SemaDiagnosticBuilder Sema::diagIfOpenMPDeviceCode(SourceLocation Loc,
     }
   }
 
-  return SemaDiagnosticBuilder(Kind, Loc, DiagID, getCurFunctionDecl(), *this);
+  return SemaDiagnosticBuilder(Kind, Loc, DiagID, FD, *this);
 }
 
 Sema::SemaDiagnosticBuilder Sema::diagIfOpenMPHostCode(SourceLocation Loc,
-                                                       unsigned DiagID) {
+                                                       unsigned DiagID,
+                                                       FunctionDecl *FD) {
   assert(LangOpts.OpenMP && !LangOpts.OpenMPIsDevice &&
          "Expected OpenMP host compilation.");
-  FunctionEmissionStatus FES = getEmissionStatus(getCurFunctionDecl());
+  FunctionEmissionStatus FES = getEmissionStatus(FD);
   SemaDiagnosticBuilder::Kind Kind = SemaDiagnosticBuilder::K_Nop;
   switch (FES) {
   case FunctionEmissionStatus::Emitted:
@@ -1948,7 +1949,7 @@ Sema::SemaDiagnosticBuilder Sema::diagIfOpenMPHostCode(SourceLocation Loc,
     break;
   }
 
-  return SemaDiagnosticBuilder(Kind, Loc, DiagID, getCurFunctionDecl(), *this);
+  return SemaDiagnosticBuilder(Kind, Loc, DiagID, FD, *this);
 }
 
 static OpenMPDefaultmapClauseKind
diff --git a/clang/test/OpenMP/nvptx_unsupported_type_messages.cpp b/clang/test/OpenMP/nvptx_unsupported_type_messages.cpp
index 0601728caefe..1b89a891887d 100644
--- a/clang/test/OpenMP/nvptx_unsupported_type_messages.cpp
+++ b/clang/test/OpenMP/nvptx_unsupported_type_messages.cpp
@@ -39,10 +39,12 @@ struct T1 {
 };
 
 #ifndef _ARCH_PPC
-// expected-note@+1 {{'boo' defined here}}
+// expected-error@+2 {{'boo' requires 128 bit size '__float128' type support, but device 'nvptx64-unknown-unknown' does not support it}}
+// expected-note@+1 2{{'boo' defined here}}
 void boo(__float128 A) { return; }
 #else
-// expected-note@+1 {{'boo' defined here}}
+// expected-error@+2 {{'boo' requires 128 bit size 'long double' type support, but device 'nvptx64-unknown-unknown' does not support it}}
+// expected-note@+1 2{{'boo' defined here}}
 void boo(long double A) { return; }
 #endif
 #pragma omp declare target
@@ -51,10 +53,11 @@ T f = a;
 void foo(T a = T()) {
   a = a + f; // expected-note {{called by 'foo'}}
 #ifndef _ARCH_PPC
-// expected-error@+4 {{'boo' requires 128 bit size '__float128' type support, but device 'nvptx64-unknown-unknown' does not support it}}
+// expected-error@+5 {{'boo' requires 128 bit size '__float128' type support, but device 'nvptx64-unknown-unknown' does not support it}}
 #else
-// expected-error@+2 {{'boo' requires 128 bit size 'long double' type support, but device 'nvptx64-unknown-unknown' does not support it}}
+// expected-error@+3 {{'boo' requires 128 bit size 'long double' type support, but device 'nvptx64-unknown-unknown' does not support it}}
 #endif
+// expected-note@+1 {{called by 'foo'}}
   boo(0);
   return;
 }
@@ -98,28 +101,49 @@ void dead_template_declare_target() {
   a = &b;
 }
 
-// TODO: We should diagnose the return type and argument type here.
+// expected-note@+2 {{'ld_return1a' defined here}}
+// expected-error@+1 {{'ld_return1a' requires 128 bit size 'long double' type support, but device 'nvptx64-unknown-unknown' does not support it}}
 long double ld_return1a() { return 0; }
+// expected-note@+2 {{'ld_arg1a' defined here}}
+// expected-error@+1 {{'ld_arg1a' requires 128 bit size 'long double' type support, but device 'nvptx64-unknown-unknown' does not support it}}
 void ld_arg1a(long double ld) {}
 
 // TODO: We should diagnose the return type and argument type here.
 typedef long double ld_ty;
+// expected-note@+2 {{'ld_return1b' defined here}}
+// expected-error@+1 {{'ld_return1b' requires 128 bit size 'ld_ty' (aka 'long double') type support, but device 'nvptx64-unknown-unknown' does not support it}}
 ld_ty ld_return1b() { return 0; }
+// expected-note@+2 {{'ld_arg1b' defined here}}
+// expected-error@+1 {{'ld_arg1b' requires 128 bit size 'ld_ty' (aka 'long double') type support, but device 'nvptx64-unknown-unknown' does not support it}}
 void ld_arg1b(ld_ty ld) {}
 
+// TODO: These errors should not be emitted.
+// expected-note@+2 {{'ld_return1c' defined here}}
+// expected-error@+1 {{'ld_return1c' requires 128 bit size 'long double' type support, but device 'nvptx64-unknown-unknown' does not support it}}
 static long double ld_return1c() { return 0; }
+// expected-note@+2 {{'ld_arg1c' defined here}}
+// expected-error@+1 {{'ld_arg1c' requires 128 bit size 'long double' type support, but device 'nvptx64-unknown-unknown' does not support it}}
 static void ld_arg1c(long double ld) {}
 
+// TODO: These errors should not be emitted.
+// expected-note@+2 {{'ld_return1d' defined here}}
+// expected-error@+1 {{'ld_return1d' requires 128 bit size 'long double' type support, but device 'nvptx64-unknown-unknown' does not support it}}
 inline long double ld_return1d() { return 0; }
+// expected-note@+2 {{'ld_arg1d' defined here}}
+// expected-error@+1 {{'ld_arg1d' requires 128 bit size 'long double' type support, but device 'nvptx64-unknown-unknown' does not support it}}
 inline void ld_arg1d(long double ld) {}
 
+// expected-error@+2 {{'ld_return1e' requires 128 bit size 'long double' type support, but device 'nvptx64-unknown-unknown' does not support it}}
 // expected-note@+1 {{'ld_return1e' defined here}}
 static long double ld_return1e() { return 0; }
+// expected-error@+2 {{'ld_arg1e' requires 128 bit size 'long double' type support, but device 'nvptx64-unknown-unknown' does not support it}}
 // expected-note@+1 {{'ld_arg1e' defined here}}
 static void ld_arg1e(long double ld) {}
 
+// expected-error@+2 {{'ld_return1f' requires 128 bit size 'long double' type support, but device 'nvptx64-unknown-unknown' does not support it}}
 // expected-note@+1 {{'ld_return1f' defined here}}
 inline long double ld_return1f() { return 0; }
+// expected-error@+2 {{'ld_arg1f' requires 128 bit size 'long double' type support, but device 'nvptx64-unknown-unknown' does not support it}}
 // expected-note@+1 {{'ld_arg1f' defined here}}
 inline void ld_arg1f(long double ld) {}
 
@@ -152,46 +176,47 @@ static void ld_use4() {
 }
 
 void external() {
-// expected-error@+1 {{'ld_return1e' requires 128 bit size 'long double' type support, but device 'nvptx64-unknown-unknown' does not support it}}
   void *p1 = reinterpret_cast<void*>(&ld_return1e);
-// expected-error@+1 {{'ld_arg1e' requires 128 bit size 'long double' type support, but device 'nvptx64-unknown-unknown' does not support it}}
   void *p2 = reinterpret_cast<void*>(&ld_arg1e);
-// expected-error@+1 {{'ld_return1f' requires 128 bit size 'long double' type support, but device 'nvptx64-unknown-unknown' does not support it}}
   void *p3 = reinterpret_cast<void*>(&ld_return1f);
-// expected-error@+1 {{'ld_arg1f' requires 128 bit size 'long double' type support, but device 'nvptx64-unknown-unknown' does not support it}}
   void *p4 = reinterpret_cast<void*>(&ld_arg1f);
   void *p5 = reinterpret_cast<void*>(&ld_use3);
   void *p6 = reinterpret_cast<void*>(&ld_use4);
 }
 
 #ifndef _ARCH_PPC
-// TODO: We should diagnose the return type and argument type here.
+// expected-note@+2 {{'ld_return2a' defined here}}
+// expected-error@+1 {{'ld_return2a' requires 128 bit size '__float128' type support, but device 'nvptx64-unknown-unknown' does not support it}}
 __float128 ld_return2a() { return 0; }
+// expected-note@+2 {{'ld_arg2a' defined here}}
+// expected-error@+1 {{'ld_arg2a' requires 128 bit size '__float128' type support, but device 'nvptx64-unknown-unknown' does not support it}}
 void ld_arg2a(__float128 ld) {}
 
-// TODO: We should diagnose the return type and argument type here.
 typedef __float128 fp128_ty;
+// expected-note@+2 {{'ld_return2b' defined here}}
+// expected-error@+1 {{'ld_return2b' requires 128 bit size 'fp128_ty' (aka '__float128') type support, but device 'nvptx64-unknown-unknown' does not support it}}
 fp128_ty ld_return2b() { return 0; }
+// expected-note@+2 {{'ld_arg2b' defined here}}
+// expected-error@+1 {{'ld_arg2b' requires 128 bit size 'fp128_ty' (aka '__float128') type support, but device 'nvptx64-unknown-unknown' does not support it}}
 void ld_arg2b(fp128_ty ld) {}
 #endif
 
 #pragma omp end declare target
 
 // TODO: There should not be an error here, dead_inline is never emitted.
-// expected-note@+1 3{{'f' defined here}}
+// expected-note@+1 {{'f' defined here}}
 inline long double dead_inline(long double f) {
 #pragma omp target map(f)
-// TODO: We should not emit the same error message 3 times, here and elsewhere in this file.
-  // expected-error@+1 3{{'f' requires 128 bit size 'long double' type support, but device 'nvptx64-unknown-unknown' does not support it}}
+  // expected-error@+1 {{'f' requires 128 bit size 'long double' type support, but device 'nvptx64-unknown-unknown' does not support it}}
   f = 1;
   return f;
 }
 
 // TODO: There should not be an error here, dead_static is never emitted.
-// expected-note@+1 3{{'f' defined here}}
+// expected-note@+1 {{'f' defined here}}
 static long double dead_static(long double f) {
 #pragma omp target map(f)
-  // expected-error@+1 3{{'f' requires 128 bit size 'long double' type support, but device 'nvptx64-unknown-unknown' does not support it}}
+  // expected-error@+1 {{'f' requires 128 bit size 'long double' type support, but device 'nvptx64-unknown-unknown' does not support it}}
   f = 1;
   return f;
 }
@@ -204,18 +229,18 @@ long double dead_template(long double f) {
 }
 
 #ifndef _ARCH_PPC
-// expected-note@+1 3{{'f' defined here}}
+// expected-note@+1 {{'f' defined here}}
 __float128 foo2(__float128 f) {
 #pragma omp target map(f)
-  // expected-error@+1 3{{'f' requires 128 bit size '__float128' type support, but device 'nvptx64-unknown-unknown' does not support it}}
+  // expected-error@+1 {{'f' requires 128 bit size '__float128' type support, but device 'nvptx64-unknown-unknown' does not support it}}
   f = 1;
   return f;
 }
 #else
-// expected-note@+1 3{{'f' defined here}}
+// expected-note@+1 {{'f' defined here}}
 long double foo3(long double f) {
 #pragma omp target map(f)
-  // expected-error@+1 3{{'f' requires 128 bit size 'long double' type support, but device 'nvptx64-unknown-unknown' does not support it}}
+  // expected-error@+1 {{'f' requires 128 bit size 'long double' type support, but device 'nvptx64-unknown-unknown' does not support it}}
   f = 1;
   return f;
 }

From 0d14528f8082ba12e34f41cbf931a7e7425b694a Mon Sep 17 00:00:00 2001
From: Johannes Doerfert <johannes@jdoerfert.de>
Date: Tue, 2 Feb 2021 11:17:44 -0600
Subject: [PATCH 097/318] [OpenMP] Delay more diagnostics of potentially
 non-emitted code

Even code in target and declare target regions might not be emitted.
With this patch we delay more diagnostics and use laziness and linkage
to determine if a function is emitted (for the device). Note that we
still eagerly emit diagnostics for target regions, unfortunately, see
the TODO for the reason.

This hopefully fixes PR48933.

Reviewed By: JonChesterfield

Differential Revision: https://reviews.llvm.org/D95928

(cherry picked from commit 1dd66e6111a8247c6c7931143251c0cf1442b905)
---
 clang/lib/Sema/SemaDecl.cpp                   | 85 +++++++++----------
 clang/lib/Sema/SemaOpenMP.cpp                 | 10 ++-
 clang/test/OpenMP/nvptx_allocate_messages.cpp |  3 +-
 .../nvptx_target_exceptions_messages.cpp      |  1 +
 .../nvptx_unsupported_type_messages.cpp       | 34 ++------
 5 files changed, 59 insertions(+), 74 deletions(-)

diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp
index 6457c6d024cf..1f7ab49ccdd7 100644
--- a/clang/lib/Sema/SemaDecl.cpp
+++ b/clang/lib/Sema/SemaDecl.cpp
@@ -18332,42 +18332,51 @@ Sema::FunctionEmissionStatus Sema::getEmissionStatus(FunctionDecl *FD,
   if (FD->isDependentContext())
     return FunctionEmissionStatus::TemplateDiscarded;
 
-  FunctionEmissionStatus OMPES = FunctionEmissionStatus::Unknown;
+  // Check whether this function is an externally visible definition.
+  auto IsEmittedForExternalSymbol = [this, FD]() {
+    // We have to check the GVA linkage of the function's *definition* -- if we
+    // only have a declaration, we don't know whether or not the function will
+    // be emitted, because (say) the definition could include "inline".
+    FunctionDecl *Def = FD->getDefinition();
+
+    return Def && !isDiscardableGVALinkage(
+                      getASTContext().GetGVALinkageForFunction(Def));
+  };
+
   if (LangOpts.OpenMPIsDevice) {
+    // In OpenMP device mode we will not emit host only functions, or functions
+    // we don't need due to their linkage.
     Optional<OMPDeclareTargetDeclAttr::DevTypeTy> DevTy =
         OMPDeclareTargetDeclAttr::getDeviceType(FD->getCanonicalDecl());
-    if (DevTy.hasValue()) {
+    // DevTy may be changed later by
+    //  #pragma omp declare target to(*) device_type(*).
+    // Therefore DevTyhaving no value does not imply host. The emission status
+    // will be checked again at the end of compilation unit with Final = true.
+    if (DevTy.hasValue())
       if (*DevTy == OMPDeclareTargetDeclAttr::DT_Host)
-        OMPES = FunctionEmissionStatus::OMPDiscarded;
-      else if (*DevTy == OMPDeclareTargetDeclAttr::DT_NoHost ||
-               *DevTy == OMPDeclareTargetDeclAttr::DT_Any) {
-        OMPES = FunctionEmissionStatus::Emitted;
-      }
-    }
-  } else if (LangOpts.OpenMP) {
-    // In OpenMP 4.5 all the functions are host functions.
-    if (LangOpts.OpenMP <= 45) {
-      OMPES = FunctionEmissionStatus::Emitted;
-    } else {
-      Optional<OMPDeclareTargetDeclAttr::DevTypeTy> DevTy =
-          OMPDeclareTargetDeclAttr::getDeviceType(FD->getCanonicalDecl());
-      // In OpenMP 5.0 or above, DevTy may be changed later by
-      // #pragma omp declare target to(*) device_type(*). Therefore DevTy
-      // having no value does not imply host. The emission status will be
-      // checked again at the end of compilation unit.
-      if (DevTy.hasValue()) {
-        if (*DevTy == OMPDeclareTargetDeclAttr::DT_NoHost) {
-          OMPES = FunctionEmissionStatus::OMPDiscarded;
-        } else if (*DevTy == OMPDeclareTargetDeclAttr::DT_Host ||
-                   *DevTy == OMPDeclareTargetDeclAttr::DT_Any)
-          OMPES = FunctionEmissionStatus::Emitted;
-      } else if (Final)
-        OMPES = FunctionEmissionStatus::Emitted;
-    }
-  }
-  if (OMPES == FunctionEmissionStatus::OMPDiscarded ||
-      (OMPES == FunctionEmissionStatus::Emitted && !LangOpts.CUDA))
-    return OMPES;
+        return FunctionEmissionStatus::OMPDiscarded;
+    // If we have an explicit value for the device type, or we are in a target
+    // declare context, we need to emit all extern and used symbols.
+    if (isInOpenMPDeclareTargetContext() || DevTy.hasValue())
+      if (IsEmittedForExternalSymbol())
+        return FunctionEmissionStatus::Emitted;
+    // Device mode only emits what it must, if it wasn't tagged yet and needed,
+    // we'll omit it.
+    if (Final)
+      return FunctionEmissionStatus::OMPDiscarded;
+  } else if (LangOpts.OpenMP > 45) {
+    // In OpenMP host compilation prior to 5.0 everything was an emitted host
+    // function. In 5.0, no_host was introduced which might cause a function to
+    // be ommitted.
+    Optional<OMPDeclareTargetDeclAttr::DevTypeTy> DevTy =
+        OMPDeclareTargetDeclAttr::getDeviceType(FD->getCanonicalDecl());
+    if (DevTy.hasValue())
+      if (*DevTy == OMPDeclareTargetDeclAttr::DT_NoHost)
+        return FunctionEmissionStatus::OMPDiscarded;
+  }
+
+  if (Final && LangOpts.OpenMP && !LangOpts.CUDA)
+    return FunctionEmissionStatus::Emitted;
 
   if (LangOpts.CUDA) {
     // When compiling for device, host functions are never emitted.  Similarly,
@@ -18381,17 +18390,7 @@ Sema::FunctionEmissionStatus Sema::getEmissionStatus(FunctionDecl *FD,
         (T == Sema::CFT_Device || T == Sema::CFT_Global))
       return FunctionEmissionStatus::CUDADiscarded;
 
-    // Check whether this function is externally visible -- if so, it's
-    // known-emitted.
-    //
-    // We have to check the GVA linkage of the function's *definition* -- if we
-    // only have a declaration, we don't know whether or not the function will
-    // be emitted, because (say) the definition could include "inline".
-    FunctionDecl *Def = FD->getDefinition();
-
-    if (Def &&
-        !isDiscardableGVALinkage(getASTContext().GetGVALinkageForFunction(Def))
-        && (!LangOpts.OpenMP || OMPES == FunctionEmissionStatus::Emitted))
+    if (IsEmittedForExternalSymbol())
       return FunctionEmissionStatus::Emitted;
   }
 
diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp
index 596aa4b7b5c1..4063c185388d 100644
--- a/clang/lib/Sema/SemaOpenMP.cpp
+++ b/clang/lib/Sema/SemaOpenMP.cpp
@@ -1884,8 +1884,7 @@ void Sema::popOpenMPFunctionRegion(const FunctionScopeInfo *OldFSI) {
 static bool isOpenMPDeviceDelayedContext(Sema &S) {
   assert(S.LangOpts.OpenMP && S.LangOpts.OpenMPIsDevice &&
          "Expected OpenMP device compilation.");
-  return !S.isInOpenMPTargetExecutionDirective() &&
-         !S.isInOpenMPDeclareTargetContext();
+  return !S.isInOpenMPTargetExecutionDirective();
 }
 
 namespace {
@@ -1911,6 +1910,13 @@ Sema::SemaDiagnosticBuilder Sema::diagIfOpenMPDeviceCode(SourceLocation Loc,
       Kind = SemaDiagnosticBuilder::K_Immediate;
       break;
     case FunctionEmissionStatus::Unknown:
+      // TODO: We should always delay diagnostics here in case a target
+      //       region is in a function we do not emit. However, as the
+      //       current diagnostics are associated with the function containing
+      //       the target region and we do not emit that one, we would miss out
+      //       on diagnostics for the target region itself. We need to anchor
+      //       the diagnostics with the new generated function *or* ensure we
+      //       emit diagnostics associated with the surrounding function.
       Kind = isOpenMPDeviceDelayedContext(*this)
                  ? SemaDiagnosticBuilder::K_Deferred
                  : SemaDiagnosticBuilder::K_Immediate;
diff --git a/clang/test/OpenMP/nvptx_allocate_messages.cpp b/clang/test/OpenMP/nvptx_allocate_messages.cpp
index a4d78b6ab588..9a61da73eb39 100644
--- a/clang/test/OpenMP/nvptx_allocate_messages.cpp
+++ b/clang/test/OpenMP/nvptx_allocate_messages.cpp
@@ -81,8 +81,7 @@ int main () {
 #endif // DEVICE && !REQUIRES
 #pragma omp allocate(b)
 #if defined(DEVICE) && !defined(REQUIRES)
-// expected-note@+3 {{in instantiation of function template specialization 'foo<int>' requested here}}
-// expected-note@+2 {{called by 'main'}}
+// expected-note@+2 2{{called by 'main'}}
 #endif // DEVICE && !REQUIRES
   return (foo<int>() + bar());
 }
diff --git a/clang/test/OpenMP/nvptx_target_exceptions_messages.cpp b/clang/test/OpenMP/nvptx_target_exceptions_messages.cpp
index c71615d2521f..87ea00a90822 100644
--- a/clang/test/OpenMP/nvptx_target_exceptions_messages.cpp
+++ b/clang/test/OpenMP/nvptx_target_exceptions_messages.cpp
@@ -52,6 +52,7 @@ int maini1() {
 #pragma omp target map(tofrom \
                        : a, b)
   {
+    // expected-note@+1 {{called by 'maini1'}}
     S s(a);
     static long aaa = 23;
     a = foo() + bar() + b + c + d + aa + aaa + FA<int>(); // expected-note{{called by 'maini1'}}
diff --git a/clang/test/OpenMP/nvptx_unsupported_type_messages.cpp b/clang/test/OpenMP/nvptx_unsupported_type_messages.cpp
index 1b89a891887d..a319c78f73c5 100644
--- a/clang/test/OpenMP/nvptx_unsupported_type_messages.cpp
+++ b/clang/test/OpenMP/nvptx_unsupported_type_messages.cpp
@@ -81,18 +81,12 @@ void baz1() {
   T1 t = bar1();
 }
 
-// TODO: We should not emit an error for dead functions we do not emit.
 inline void dead_inline_declare_target() {
-// expected-note@+1 {{'b' defined here}}
   long double *a, b = 0;
-// expected-error@+1 {{'b' requires 128 bit size 'long double' type support, but device 'nvptx64-unknown-unknown' does not support it}}
   a = &b;
 }
-// TODO: We should not emit an error for dead functions we do not emit.
 static void dead_static_declare_target() {
-// expected-note@+1 {{'b' defined here}}
   long double *a, b = 0;
-// expected-error@+1 {{'b' requires 128 bit size 'long double' type support, but device 'nvptx64-unknown-unknown' does not support it}}
   a = &b;
 }
 template<bool>
@@ -108,7 +102,6 @@ long double ld_return1a() { return 0; }
 // expected-error@+1 {{'ld_arg1a' requires 128 bit size 'long double' type support, but device 'nvptx64-unknown-unknown' does not support it}}
 void ld_arg1a(long double ld) {}
 
-// TODO: We should diagnose the return type and argument type here.
 typedef long double ld_ty;
 // expected-note@+2 {{'ld_return1b' defined here}}
 // expected-error@+1 {{'ld_return1b' requires 128 bit size 'ld_ty' (aka 'long double') type support, but device 'nvptx64-unknown-unknown' does not support it}}
@@ -117,48 +110,28 @@ ld_ty ld_return1b() { return 0; }
 // expected-error@+1 {{'ld_arg1b' requires 128 bit size 'ld_ty' (aka 'long double') type support, but device 'nvptx64-unknown-unknown' does not support it}}
 void ld_arg1b(ld_ty ld) {}
 
-// TODO: These errors should not be emitted.
-// expected-note@+2 {{'ld_return1c' defined here}}
-// expected-error@+1 {{'ld_return1c' requires 128 bit size 'long double' type support, but device 'nvptx64-unknown-unknown' does not support it}}
 static long double ld_return1c() { return 0; }
-// expected-note@+2 {{'ld_arg1c' defined here}}
-// expected-error@+1 {{'ld_arg1c' requires 128 bit size 'long double' type support, but device 'nvptx64-unknown-unknown' does not support it}}
 static void ld_arg1c(long double ld) {}
 
-// TODO: These errors should not be emitted.
-// expected-note@+2 {{'ld_return1d' defined here}}
-// expected-error@+1 {{'ld_return1d' requires 128 bit size 'long double' type support, but device 'nvptx64-unknown-unknown' does not support it}}
 inline long double ld_return1d() { return 0; }
-// expected-note@+2 {{'ld_arg1d' defined here}}
-// expected-error@+1 {{'ld_arg1d' requires 128 bit size 'long double' type support, but device 'nvptx64-unknown-unknown' does not support it}}
 inline void ld_arg1d(long double ld) {}
 
-// expected-error@+2 {{'ld_return1e' requires 128 bit size 'long double' type support, but device 'nvptx64-unknown-unknown' does not support it}}
 // expected-note@+1 {{'ld_return1e' defined here}}
 static long double ld_return1e() { return 0; }
-// expected-error@+2 {{'ld_arg1e' requires 128 bit size 'long double' type support, but device 'nvptx64-unknown-unknown' does not support it}}
 // expected-note@+1 {{'ld_arg1e' defined here}}
 static void ld_arg1e(long double ld) {}
 
-// expected-error@+2 {{'ld_return1f' requires 128 bit size 'long double' type support, but device 'nvptx64-unknown-unknown' does not support it}}
 // expected-note@+1 {{'ld_return1f' defined here}}
 inline long double ld_return1f() { return 0; }
-// expected-error@+2 {{'ld_arg1f' requires 128 bit size 'long double' type support, but device 'nvptx64-unknown-unknown' does not support it}}
 // expected-note@+1 {{'ld_arg1f' defined here}}
 inline void ld_arg1f(long double ld) {}
 
 inline void ld_use1() {
-// expected-note@+1 {{'ld' defined here}}
   long double ld = 0;
-// TODO: We should not diagnose this as the function is dead.
-// expected-error@+1 {{'ld' requires 128 bit size 'long double' type support, but device 'nvptx64-unknown-unknown' does not support it}}
   ld += 1;
 }
 static void ld_use2() {
-// expected-note@+1 {{'ld' defined here}}
   long double ld = 0;
-// TODO: We should not diagnose this as the function is dead.
-// expected-error@+1 {{'ld' requires 128 bit size 'long double' type support, but device 'nvptx64-unknown-unknown' does not support it}}
   ld += 1;
 }
 
@@ -176,11 +149,18 @@ static void ld_use4() {
 }
 
 void external() {
+// expected-error@+1 {{'ld_return1e' requires 128 bit size 'long double' type support, but device 'nvptx64-unknown-unknown' does not support it}}
   void *p1 = reinterpret_cast<void*>(&ld_return1e);
+// expected-error@+1 {{'ld_arg1e' requires 128 bit size 'long double' type support, but device 'nvptx64-unknown-unknown' does not support it}}
   void *p2 = reinterpret_cast<void*>(&ld_arg1e);
+// expected-error@+1 {{'ld_return1f' requires 128 bit size 'long double' type support, but device 'nvptx64-unknown-unknown' does not support it}}
   void *p3 = reinterpret_cast<void*>(&ld_return1f);
+// expected-error@+1 {{'ld_arg1f' requires 128 bit size 'long double' type support, but device 'nvptx64-unknown-unknown' does not support it}}
   void *p4 = reinterpret_cast<void*>(&ld_arg1f);
+// TODO: The error message "called by" is not great.
+// expected-note@+1 {{called by 'external'}}
   void *p5 = reinterpret_cast<void*>(&ld_use3);
+// expected-note@+1 {{called by 'external'}}
   void *p6 = reinterpret_cast<void*>(&ld_use4);
 }
 

From d14016d869acac0d0196bd6b846ab45879ea0fa5 Mon Sep 17 00:00:00 2001
From: Dimitry Andric <dimitry@andric.com>
Date: Mon, 15 Feb 2021 18:22:01 +0100
Subject: [PATCH 098/318] Define new/delete in libc++ when using libcxxrt

Always turn on LIBCXX_ENABLE_NEW_DELETE_DEFINITIONS, if libcxxrt is used
as the C++ ABI library, since libcxxrt does not provide the full set
ofnew and delete operators. In particular, the aligned versions of these
operators are completely missing. This primarily addresses builds on
FreeBSD, as this platform uses libcxxrt by default.

Also, attempt to provide a FreeBSD.cmake cache file, with hopefully sane
settings, partially copied from the Apple.cmake cache file. This needs
more work, probably some additions to ci build scripts (although I am
not aware of any 'official' FreeBSD build bots).

Reviewed By: ldionne, #libc

Differential Revision: https://reviews.llvm.org/D96720

(cherry picked from commit 328261019f50a76b11fa625739cbf32ceb2ce2f7)
---
 libcxx/cmake/Modules/HandleLibCXXABI.cmake | 2 ++
 libcxx/cmake/caches/FreeBSD.cmake          | 9 +++++++++
 2 files changed, 11 insertions(+)
 create mode 100644 libcxx/cmake/caches/FreeBSD.cmake

diff --git a/libcxx/cmake/Modules/HandleLibCXXABI.cmake b/libcxx/cmake/Modules/HandleLibCXXABI.cmake
index c5aa26739e36..5d2764e870e9 100644
--- a/libcxx/cmake/Modules/HandleLibCXXABI.cmake
+++ b/libcxx/cmake/Modules/HandleLibCXXABI.cmake
@@ -121,6 +121,8 @@ elseif ("${LIBCXX_CXX_ABI_LIBNAME}" STREQUAL "libcxxrt")
   if(NOT LIBCXX_CXX_ABI_INCLUDE_PATHS)
     set(LIBCXX_CXX_ABI_INCLUDE_PATHS "/usr/include/c++/v1")
   endif()
+  # libcxxrt does not provide aligned new and delete operators
+  set(LIBCXX_ENABLE_NEW_DELETE_DEFINITIONS ON)
   setup_abi_lib(
     "-DLIBCXXRT"
     "cxxrt" "cxxrt" "cxxabi.h;unwind.h;unwind-arm.h;unwind-itanium.h" ""
diff --git a/libcxx/cmake/caches/FreeBSD.cmake b/libcxx/cmake/caches/FreeBSD.cmake
new file mode 100644
index 000000000000..9e66e379864b
--- /dev/null
+++ b/libcxx/cmake/caches/FreeBSD.cmake
@@ -0,0 +1,9 @@
+set(CMAKE_BUILD_TYPE Release CACHE STRING "")
+set(CMAKE_POSITION_INDEPENDENT_CODE ON CACHE BOOL "")
+
+set(LIBCXX_ENABLE_ASSERTIONS OFF CACHE BOOL "")
+set(LIBCXX_ABI_VERSION "1" CACHE STRING "")
+set(LIBCXX_ENABLE_STATIC ON CACHE BOOL "")
+set(LIBCXX_ENABLE_SHARED ON CACHE BOOL "")
+set(LIBCXX_CXX_ABI libcxxrt CACHE STRING "")
+set(LIBCXX_ENABLE_NEW_DELETE_DEFINITIONS ON CACHE BOOL "")

From a29ff5bae41a06302e0c90b47387e3bff8bbdd8c Mon Sep 17 00:00:00 2001
From: Shilei Tian <tianshilei1992@gmail.com>
Date: Sat, 30 Jan 2021 15:14:41 -0500
Subject: [PATCH 099/318] [OpenMP][NVPTX] Refined CMake logic to choose compute
 capabilites

This patch refines the logic to choose compute capabilites via the
environment variable `LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES`. It supports the
following values (all case insensitive):
- "all": Build `deviceRTLs` for all supported compute capabilites;
- "auto": Only build for the compute capability auto detected. Note that this
  requires CUDA. If CUDA is not found, a CMake fatal error will be raised.
- "xx,yy" or "xx;yy": Build for compute capabilities `xx` and `yy`.

If `LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES` is not set, it is equivalent to set
it to `all`.

Reviewed By: jdoerfert

Differential Revision: https://reviews.llvm.org/D95687

(cherry picked from commit 26d38f6d20ff137d89cb7c891b739662de1ca508)
---
 .../Modules/LibomptargetGetDependencies.cmake |  4 +--
 .../deviceRTLs/nvptx/CMakeLists.txt           | 30 +++++++++++--------
 2 files changed, 19 insertions(+), 15 deletions(-)

diff --git a/openmp/libomptarget/cmake/Modules/LibomptargetGetDependencies.cmake b/openmp/libomptarget/cmake/Modules/LibomptargetGetDependencies.cmake
index 28165ac1b8c0..e3c2a580396e 100644
--- a/openmp/libomptarget/cmake/Modules/LibomptargetGetDependencies.cmake
+++ b/openmp/libomptarget/cmake/Modules/LibomptargetGetDependencies.cmake
@@ -118,9 +118,7 @@ endif()
 find_package(CUDA QUIET)
 
 # Try to get the highest Nvidia GPU architecture the system supports
-set(LIBOMPTARGET_NVPTX_AUTODETECT_COMPUTE_CAPABILITY TRUE CACHE BOOL
-  "Auto detect CUDA Compute Capability if CUDA is detected.")
-if (CUDA_FOUND AND LIBOMPTARGET_NVPTX_AUTODETECT_COMPUTE_CAPABILITY)
+if (CUDA_FOUND)
   cuda_select_nvcc_arch_flags(CUDA_ARCH_FLAGS)
   string(REGEX MATCH "sm_([0-9]+)" CUDA_ARCH_MATCH_OUTPUT ${CUDA_ARCH_FLAGS})
   if (NOT DEFINED CUDA_ARCH_MATCH_OUTPUT OR "${CMAKE_MATCH_1}" LESS 35)
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt b/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt
index eeda137ef120..b705e0bb6a9f 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt
+++ b/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt
@@ -10,12 +10,12 @@
 #
 ##===----------------------------------------------------------------------===##
 
-# By default we will not build NVPTX deviceRTL on a non-CUDA
+# By default we will not build NVPTX deviceRTL on a CUDA free system
 set(LIBOMPTARGET_BUILD_NVPTX_BCLIB FALSE CACHE BOOL
-  "Whether build NVPTX deviceRTL on non-CUDA system.")
+  "Whether build NVPTX deviceRTL on CUDA free system.")
 
 if (NOT (LIBOMPTARGET_DEP_CUDA_FOUND OR LIBOMPTARGET_BUILD_NVPTX_BCLIB))
-  libomptarget_say("Not building NVPTX deviceRTL by default on non-CUDA system.")
+  libomptarget_say("Not building NVPTX deviceRTL by default on CUDA free system.")
   return()
 endif()
 
@@ -73,16 +73,22 @@ set(devicertl_common_directory
 set(devicertl_nvptx_directory
   ${devicertl_base_directory}/nvptx)
 
-if (DEFINED LIBOMPTARGET_DEP_CUDA_ARCH)
-  set(default_capabilities ${LIBOMPTARGET_DEP_CUDA_ARCH})
-else()
-  set(default_capabilities 35 37 50 52 53 60 61 62 70 72 75 80)
-endif()
+set(all_capabilities 35 37 50 52 53 60 61 62 70 72 75 80)
 
-set(LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES ${default_capabilities} CACHE STRING
+set(LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES ${all_capabilities} CACHE STRING
   "List of CUDA Compute Capabilities to be used to compile the NVPTX device RTL.")
+string(TOLOWER ${LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES} LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES)
 
-set(nvptx_sm_list ${LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES})
+if (LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES STREQUAL "all")
+  set(nvptx_sm_list ${all_capabilities})
+elseif(LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES STREQUAL "auto")
+  if (NOT LIBOMPTARGET_DEP_CUDA_FOUND)
+    libomptarget_error_say("[NVPTX] Cannot auto detect compute capability as CUDA not found.")
+  endif()
+  set(nvptx_sm_list ${LIBOMPTARGET_DEP_CUDA_ARCH})
+else()
+  string(REPLACE "," ";" nvptx_sm_list "${LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES}")
+endif()
 
 # If user set LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES to empty, we disable the
 # build.
@@ -93,8 +99,8 @@ endif()
 
 # Check all SM values
 foreach(sm ${nvptx_sm_list})
-  if (NOT ${sm} IN_LIST default_capabilities)
-    message(FATAL_ERROR "LIBOMPTARGET-NVPTX: compute capability ${sm} is not supported. Supported values: ${default_capabilities}")
+  if (NOT ${sm} IN_LIST all_capabilities)
+    libomptarget_warning_say("[NVPTX] Compute capability ${sm} is not supported. Make sure clang can work with it.")
   endif()
 endforeach()
 

From a4a4036d7aef942b6ff3a8c594937c0e0f6512b0 Mon Sep 17 00:00:00 2001
From: Zarko Todorovski <zarko@ca.ibm.com>
Date: Fri, 29 Jan 2021 14:05:17 -0500
Subject: [PATCH 100/318] [AIX] Actually push back "-mabi=vec-extabi" when
 option is on.

Accidentaly ommitted the portion of pushing back the option in
https://reviews.llvm.org/D94986

(cherry picked from commit caaaebcde462bf681498ce85c2659d683a07fc87)
---
 clang/lib/Driver/ToolChains/Clang.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index fdb8a58cd1b3..d75eb0c58d8b 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -4684,6 +4684,8 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
           << A->getSpelling() << RawTriple.str();
     if (A->getOption().getID() == options::OPT_mabi_EQ_vec_default)
       D.Diag(diag::err_aix_default_altivec_abi);
+    if (A->getOption().getID() == options::OPT_mabi_EQ_vec_extabi)
+      CmdArgs.push_back("-mabi=vec-extabi");
   }
 
   if (Arg *A = Args.getLastArg(options::OPT_Wframe_larger_than_EQ)) {

From db9731e4500533e07ee3a0aaaf44c8f8c00ca2a8 Mon Sep 17 00:00:00 2001
From: Zarko Todorovski <zarko@ca.ibm.com>
Date: Tue, 2 Feb 2021 10:56:15 -0500
Subject: [PATCH 101/318] [AIX] Improve option processing for mabi=vec-extabi
 and mabi=vec=defaul

Opening this revision to better address comments by @hubert.reinterpretcast in https://reviews.llvm.org/rGcaaaebcde462

Reviewed By: hubert.reinterpretcast

Differential Revision: https://reviews.llvm.org/D95702

(cherry picked from commit eb3426a528d5b3cbbb54aee662a779f2067fc9db)
---
 clang/lib/Driver/ToolChains/Clang.cpp | 12 ++----------
 clang/test/CodeGen/altivec.c          |  3 ---
 2 files changed, 2 insertions(+), 13 deletions(-)

diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index d75eb0c58d8b..f8e637974662 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -4669,23 +4669,15 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
     }
   }
 
-  if (Triple.isOSAIX() && Args.hasArg(options::OPT_maltivec)) {
-    if (Args.getLastArg(options::OPT_mabi_EQ_vec_extabi)) {
-      CmdArgs.push_back("-mabi=vec-extabi");
-    } else {
-      D.Diag(diag::err_aix_default_altivec_abi);
-    }
-  }
-
   if (Arg *A = Args.getLastArg(options::OPT_mabi_EQ_vec_extabi,
                                options::OPT_mabi_EQ_vec_default)) {
     if (!Triple.isOSAIX())
       D.Diag(diag::err_drv_unsupported_opt_for_target)
           << A->getSpelling() << RawTriple.str();
-    if (A->getOption().getID() == options::OPT_mabi_EQ_vec_default)
-      D.Diag(diag::err_aix_default_altivec_abi);
     if (A->getOption().getID() == options::OPT_mabi_EQ_vec_extabi)
       CmdArgs.push_back("-mabi=vec-extabi");
+    else
+      D.Diag(diag::err_aix_default_altivec_abi);
   }
 
   if (Arg *A = Args.getLastArg(options::OPT_Wframe_larger_than_EQ)) {
diff --git a/clang/test/CodeGen/altivec.c b/clang/test/CodeGen/altivec.c
index d69c34d82190..86b570f15d08 100644
--- a/clang/test/CodeGen/altivec.c
+++ b/clang/test/CodeGen/altivec.c
@@ -6,9 +6,6 @@
 // RUN: %clang_cc1 -target-feature +altivec -mabi=vec-extabi -target-cpu pwr8 -triple powerpc64-unknown-aix -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-BE
 // RUN: not %clang_cc1 -target-feature +altivec -mabi=vec-default -target-cpu pwr8 -triple powerpc-unknown-aix -emit-llvm %s 2>&1 | FileCheck %s --check-prefix=AIX-ERROR
 // RUN: not %clang_cc1 -target-feature +altivec -mabi=vec-default -target-cpu pwr8 -triple powerpc64-unknown-aix -emit-llvm %s 2>&1 | FileCheck %s --check-prefix=AIX-ERROR
-
-// RUN: not %clang -S -emit-llvm -maltivec -mcpu=pwr8 -target powerpc-unknown-aix %s 2>&1 | FileCheck %s --check-prefix=AIX-ERROR
-// RUN: not %clang -S -emit-llvm -maltivec -mcpu=pwr8 -target powerpc64-unknown-aix %s 2>&1 | FileCheck %s --check-prefix=AIX-ERROR 
 // RUN: %clang -S -emit-llvm -maltivec -mabi=vec-extabi -mcpu=pwr8 -target powerpc-unknown-aix %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-BE
 // RUN: %clang -S -emit-llvm -maltivec -mabi=vec-extabi -mcpu=pwr8 -target powerpc64-unknown-aix %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-BE
 // RUN: not %clang -S -emit-llvm -maltivec -mabi=vec-default -mcpu=pwr8 -triple powerpc-unknown-aix -emit-llvm %s 2>&1 | FileCheck %s --check-prefix=AIX-ERROR

From f5b2787d07c1c61ebbda3f11dd770b6c57e2b662 Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Thu, 4 Feb 2021 09:17:47 -0800
Subject: [PATCH 102/318] [ELF] Allow R_386_GOTOFF from .debug_info

In GCC emitted .debug_info sections, R_386_GOTOFF may be used to
relocate DW_AT_GNU_call_site_value values
(https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98946).

R_386_GOTOFF (`S + A - GOT`) is one of the `isStaticLinkTimeConstant` relocation
type which is not PC-relative, so it can be used from non-SHF_ALLOC sections. We
current allow new relocation types as needs come. The diagnostic has caught some
bugs in the past.

Differential Revision: https://reviews.llvm.org/D95994

(cherry picked from commit b3165a70ae83b46dc145f335dfa9690ece361e92)
---
 lld/ELF/InputSection.cpp     |  5 ++++-
 lld/test/ELF/non-abs-reloc.s | 18 ++++++++++++------
 2 files changed, 16 insertions(+), 7 deletions(-)

diff --git a/lld/ELF/InputSection.cpp b/lld/ELF/InputSection.cpp
index f40bb258b9af..6f16fc7abc48 100644
--- a/lld/ELF/InputSection.cpp
+++ b/lld/ELF/InputSection.cpp
@@ -901,7 +901,10 @@ void InputSection::relocateNonAlloc(uint8_t *buf, ArrayRef<RelTy> rels) {
       continue;
     }
 
-    if (expr != R_ABS && expr != R_DTPREL && expr != R_RISCV_ADD) {
+    // R_ABS/R_DTPREL and some other relocations can be used from non-SHF_ALLOC
+    // sections.
+    if (expr != R_ABS && expr != R_DTPREL && expr != R_GOTPLTREL &&
+        expr != R_RISCV_ADD) {
       std::string msg = getLocation<ELFT>(offset) +
                         ": has non-ABS relocation " + toString(type) +
                         " against symbol '" + toString(sym) + "'";
diff --git a/lld/test/ELF/non-abs-reloc.s b/lld/test/ELF/non-abs-reloc.s
index 72a65424ed1f..82f913efe4d8 100644
--- a/lld/test/ELF/non-abs-reloc.s
+++ b/lld/test/ELF/non-abs-reloc.s
@@ -1,17 +1,17 @@
 // REQUIRES: x86
 // RUN: split-file %s %t
-// RUN: llvm-mc -filetype=obj -triple=x86_64 %t/asm -o %t.o
-// RUN: ld.lld -T %t/lds %t.o -o %t.exe 2>&1 | FileCheck %s
-// CHECK:      warning: {{.*}}.o:(.nonalloc1+0x1): has non-ABS relocation R_X86_64_PC32 against symbol '_start'
-// CHECK-NEXT: warning: {{.*}}.o:(.nonalloc1+0x6): has non-ABS relocation R_X86_64_PC32 against symbol '_start'
+// RUN: llvm-mc -filetype=obj -triple=i386 %t/asm -o %t.o
+// RUN: ld.lld -T %t/lds %t.o -o %t.exe 2>&1 | FileCheck %s --implicit-check-not=warning: --implicit-check-not=error:
+// CHECK:      warning: {{.*}}.o:(.nonalloc1+0x1): has non-ABS relocation R_386_PC32 against symbol '_start'
+// CHECK-NEXT: warning: {{.*}}.o:(.nonalloc1+0x6): has non-ABS relocation R_386_PC32 against symbol '_start'
 
 // RUN: llvm-objdump -D --no-show-raw-insn %t.exe | FileCheck --check-prefix=DISASM %s
 // DISASM:      Disassembly of section .nonalloc:
 // DISASM-EMPTY:
 // DISASM-NEXT: <.nonalloc>:
 // DISASM-NEXT:   0: nop
-// DISASM-NEXT:   1: callq 0x0
-// DISASM-NEXT:   6: callq 0x0
+// DISASM-NEXT:   1: calll 0x0
+// DISASM-NEXT:   6: calll 0x0
 
 //--- lds
 SECTIONS {
@@ -20,6 +20,7 @@ SECTIONS {
 //--- asm
 .globl _start
 _start:
+.L0:
   nop
 
 .section .nonalloc0
@@ -30,3 +31,8 @@ _start:
   .long _start - . - 4
   .byte 0xe8
   .long _start - . - 4
+
+// GCC may relocate DW_AT_GNU_call_site_value with R_386_GOTOFF.
+// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98946
+.section .debug_info
+  .long .L0@gotoff

From a6ea391b832573830b011f26013ebaa946032250 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20Storsj=C3=B6?= <martin@martin.st>
Date: Mon, 8 Feb 2021 15:24:42 +0200
Subject: [PATCH 103/318] [AArch64] Use '//' as comment string for MSVC
 assembly

As the actual MSVC toolset doesn't use the GAS-style assembly that
Clang/LLVM produces and consumes, there's no reference for what
string to use for e.g. comments when building with a MSVC triple.

This frees up the use of semicolon as separator string, just like
was done for GNU targets in 23413195649d0cf6f3860ae8b5fb115b35032075.
(Previously, both the separator and comment strings were set to
the same, a semicolon.)

Compiler-rt extensively uses separator chars in its assembly,
and that assembly should be buildable with clang-cl for MSVC too.

Differential Revision: https://reviews.llvm.org/D96259

(cherry picked from commit 71c29b4cf3fb2b5610991bfbc12b8bda97d60005)
---
 .../AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp |   2 +-
 llvm/test/CodeGen/AArch64/cfguard-checks.ll   |   4 +-
 llvm/test/CodeGen/AArch64/landingpad-ifcvt.ll |   2 +-
 .../test/CodeGen/AArch64/reloc-specifiers.mir |   4 +-
 llvm/test/CodeGen/AArch64/seh_funclet_x1.ll   |   2 +-
 llvm/test/CodeGen/AArch64/win64-no-uwtable.ll |   4 +-
 .../CodeGen/AArch64/windows-extern-weak.ll    |   2 +-
 .../CodeGen/AArch64/wineh-try-catch-nobase.ll |   6 +-
 llvm/test/CodeGen/AArch64/wineh-try-catch.ll  |  18 +--
 llvm/test/MC/AArch64/coff-relocations.s       | 110 +++++++++---------
 .../AArch64/{coff-gnu.s => coff-separator.s}  |   2 +
 11 files changed, 79 insertions(+), 77 deletions(-)
 rename llvm/test/MC/AArch64/{coff-gnu.s => coff-separator.s} (74%)

diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
index 37c924d879b1..68c721cb0d72 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
@@ -111,7 +111,7 @@ AArch64MCAsmInfoMicrosoftCOFF::AArch64MCAsmInfoMicrosoftCOFF() {
   SupportsDebugInformation = true;
   CodePointerSize = 8;
 
-  CommentString = ";";
+  CommentString = "//";
   ExceptionsType = ExceptionHandling::WinEH;
   WinEHEncodingType = WinEH::EncodingType::Itanium;
 }
diff --git a/llvm/test/CodeGen/AArch64/cfguard-checks.ll b/llvm/test/CodeGen/AArch64/cfguard-checks.ll
index 66ec4b6ed074..6dc94e220712 100644
--- a/llvm/test/CodeGen/AArch64/cfguard-checks.ll
+++ b/llvm/test/CodeGen/AArch64/cfguard-checks.ll
@@ -96,8 +96,8 @@ lpad:                                             ; preds = %entry
 	; CHECK:        blr x9
   ; CHECK-NEXT:   .Ltmp0:
 	; CHECK-NEXT:   blr x8
-  ; CHECK:       ; %invoke.cont
-  ; CHECK:       ; %lpad
+  ; CHECK:       // %invoke.cont
+  ; CHECK:       // %lpad
 }
 
 declare void @h()
diff --git a/llvm/test/CodeGen/AArch64/landingpad-ifcvt.ll b/llvm/test/CodeGen/AArch64/landingpad-ifcvt.ll
index 4437970e1660..a5497b1d8e14 100644
--- a/llvm/test/CodeGen/AArch64/landingpad-ifcvt.ll
+++ b/llvm/test/CodeGen/AArch64/landingpad-ifcvt.ll
@@ -1,7 +1,7 @@
 ; RUN: llc < %s | FileCheck %s
 
 ; Make sure this doesn't crash (and the output is sane).
-; CHECK: ; %__except.ret
+; CHECK: // %__except.ret
 ; CHECK-NEXT: mov     x0, xzr
 
 target datalayout = "e-m:w-p:64:64-i32:32-i64:64-i128:128-n32:64-S128"
diff --git a/llvm/test/CodeGen/AArch64/reloc-specifiers.mir b/llvm/test/CodeGen/AArch64/reloc-specifiers.mir
index 374a4759b4cd..3c56874e441a 100644
--- a/llvm/test/CodeGen/AArch64/reloc-specifiers.mir
+++ b/llvm/test/CodeGen/AArch64/reloc-specifiers.mir
@@ -11,11 +11,11 @@ body: |
   bb.0:
     ; CHECK-LABEL: bar
 
-    ; CHECK: movz    x0, #:abs_g1_s:.Lfoo$frame_escape_0 ; encoding: [0bAAA00000,A,0b101AAAAA,0xd2]
+    ; CHECK: movz    x0, #:abs_g1_s:.Lfoo$frame_escape_0 // encoding: [0bAAA00000,A,0b101AAAAA,0xd2]
     ; CHECK: fixup A - offset: 0, value: :abs_g1_s:.Lfoo$frame_escape_0, kind: fixup_aarch64_movw
     renamable $x0 = MOVZXi target-flags(aarch64-g1, aarch64-s) <mcsymbol .Lfoo$frame_escape_0>, 16
 
-    ; CHECK: movk    x0, #:abs_g0_nc:.Lfoo$frame_escape_0 ; encoding: [0bAAA00000,A,0b100AAAAA,0xf2]
+    ; CHECK: movk    x0, #:abs_g0_nc:.Lfoo$frame_escape_0 // encoding: [0bAAA00000,A,0b100AAAAA,0xf2]
     ; CHECK: fixup A - offset: 0, value: :abs_g0_nc:.Lfoo$frame_escape_0, kind: fixup_aarch64_movw
     renamable $x0 = MOVKXi $x0, target-flags(aarch64-g0, aarch64-nc) <mcsymbol .Lfoo$frame_escape_0>, 0
 ...
diff --git a/llvm/test/CodeGen/AArch64/seh_funclet_x1.ll b/llvm/test/CodeGen/AArch64/seh_funclet_x1.ll
index 1f524716be9a..7f5a0324f9c0 100644
--- a/llvm/test/CodeGen/AArch64/seh_funclet_x1.ll
+++ b/llvm/test/CodeGen/AArch64/seh_funclet_x1.ll
@@ -5,7 +5,7 @@
 
 ; CHECK:      ?dtor$3@?0?main@4HA":
 ; CHECK:      .seh_proc "?dtor$3@?0?main@4HA"
-; CHECK:      stp     x29, x30, [sp, #-16]!   ; 16-byte Folded Spill
+; CHECK:      stp     x29, x30, [sp, #-16]!   // 16-byte Folded Spill
 ; CHECK-NEXT: .seh_save_fplr_x 16
 ; CHECK-NEXT: .seh_endprologue
 ; CHECK-NEXT: mov     x29, x1
diff --git a/llvm/test/CodeGen/AArch64/win64-no-uwtable.ll b/llvm/test/CodeGen/AArch64/win64-no-uwtable.ll
index f04a47cd1e44..789620a21dd6 100644
--- a/llvm/test/CodeGen/AArch64/win64-no-uwtable.ll
+++ b/llvm/test/CodeGen/AArch64/win64-no-uwtable.ll
@@ -13,11 +13,11 @@ define dso_local void @SEHfilter() nounwind "frame-pointer"="all" {
 ; CHECK-NEXT:  mov     x29, sp
 ; CHECK-NEXT:  bl      g
 ; CHECK-NEXT:  cbz     w19, .LBB0_2
-; CHECK-NEXT:  ; %bb.1:
+; CHECK-NEXT:  // %bb.1:
 ; CHECK-NEXT:  ldr     x19, [sp, #16]
 ; CHECK-NEXT:  ldp     x30, x29, [sp], #32
 ; CHECK-NEXT:  ret
-; CHECK-NEXT:  .LBB0_2:                                ; %if.end.i
+; CHECK-NEXT:  .LBB0_2:                                // %if.end.i
 ; CHECK-NEXT:  bl      f
 ; CHECK-NEXT:  brk     #0x1
   %1 = load i32, i32* undef, align 4
diff --git a/llvm/test/CodeGen/AArch64/windows-extern-weak.ll b/llvm/test/CodeGen/AArch64/windows-extern-weak.ll
index 18df2ddc5db4..dbd17e35f44a 100644
--- a/llvm/test/CodeGen/AArch64/windows-extern-weak.ll
+++ b/llvm/test/CodeGen/AArch64/windows-extern-weak.ll
@@ -10,7 +10,7 @@ define void @func() {
 ; CHECK-NEXT: adrp x8, .refptr.weakfunc
 ; CHECK-NEXT: ldr x8, [x8, :lo12:.refptr.weakfunc]
 ; CHECK-NEXT: cbz     x8, .LBB0_2
-; CHECK-NEXT: ; %bb.1:
+; CHECK-NEXT: // %bb.1:
 ; CHECK-NEXT: blr     x8
 ; CHECK-NEXT: .LBB0_2:
 ; CHECK-NEXT: .seh_startepilogue
diff --git a/llvm/test/CodeGen/AArch64/wineh-try-catch-nobase.ll b/llvm/test/CodeGen/AArch64/wineh-try-catch-nobase.ll
index bf1ebaa3d277..1552a554eb4e 100644
--- a/llvm/test/CodeGen/AArch64/wineh-try-catch-nobase.ll
+++ b/llvm/test/CodeGen/AArch64/wineh-try-catch-nobase.ll
@@ -6,16 +6,16 @@
 
 ; Check that we compute the address relative to fp.
 ; CHECK-LABEL: "?catch$2@?0??a@@YAXXZ@4HA":
-; CHECK:             stp     x29, x30, [sp, #-16]!   ; 16-byte Folded Spill
+; CHECK:             stp     x29, x30, [sp, #-16]!   // 16-byte Folded Spill
 ; CHECK-NEXT:        .seh_save_fplr_x 16
 ; CHECK-NEXT:        .seh_endprologue
-; CHECK-NEXT:        sub     x0, x29, #16            ; =16
+; CHECK-NEXT:        sub     x0, x29, #16            // =16
 ; CHECK-NEXT:        mov     x1, xzr
 ; CHECK-NEXT:        bl      "?bb@@YAXPEAHH@Z"
 ; CHECK-NEXT:        adrp    x0, .LBB0_1
 ; CHECK-NEXT:        add     x0, x0, .LBB0_1
 ; CHECK-NEXT:        .seh_startepilogue
-; CHECK-NEXT:        ldp     x29, x30, [sp], #16     ; 16-byte Folded Reload
+; CHECK-NEXT:        ldp     x29, x30, [sp], #16     // 16-byte Folded Reload
 ; CHECK-NEXT:        .seh_save_fplr_x 16
 ; CHECK-NEXT:        .seh_endepilogue
 ; CHECK-NEXT:        ret
diff --git a/llvm/test/CodeGen/AArch64/wineh-try-catch.ll b/llvm/test/CodeGen/AArch64/wineh-try-catch.ll
index 8bf5aa33e24a..7de7d60b8cab 100644
--- a/llvm/test/CodeGen/AArch64/wineh-try-catch.ll
+++ b/llvm/test/CodeGen/AArch64/wineh-try-catch.ll
@@ -41,7 +41,7 @@
 ; CHECK-LABEL: .Ltmp0:
 ; CHECK:       bl      "?func2@@YAHXZ
 
-; CHECK:        [[CATCHRETDEST:.LBB0_[0-9]+]]:      ; %catchret.dest
+; CHECK:        [[CATCHRETDEST:.LBB0_[0-9]+]]:      // %catchret.dest
 
 ; Check the catch funclet.
 ; CHECK-LABEL: "?catch$2@?0??func@@YAHXZ@4HA":
@@ -74,14 +74,14 @@
 ; entry to func is encoded in cppxdata that is passed to __CxxFrameHandler3.  As
 ; computed above, this comes to -16.
 ; CHECK-LABEL:        "$cppxdata$?func@@YAHXZ":
-; CHECK-NEXT:         .word   429065506               ; MagicNumber
-; CHECK-NEXT:         .word   2                       ; MaxState
-; CHECK-NEXT:         .word   ("$stateUnwindMap$?func@@YAHXZ")@IMGREL ; UnwindMap
-; CHECK-NEXT:         .word   1                       ; NumTryBlocks
-; CHECK-NEXT:         .word   ("$tryMap$?func@@YAHXZ")@IMGREL ; TryBlockMap
-; CHECK-NEXT:         .word   4                       ; IPMapEntries
-; CHECK-NEXT:         .word   ("$ip2state$?func@@YAHXZ")@IMGREL ; IPToStateXData
-; CHECK-NEXT:         .word   -16                     ; UnwindHelp
+; CHECK-NEXT:         .word   429065506               // MagicNumber
+; CHECK-NEXT:         .word   2                       // MaxState
+; CHECK-NEXT:         .word   ("$stateUnwindMap$?func@@YAHXZ")@IMGREL // UnwindMap
+; CHECK-NEXT:         .word   1                       // NumTryBlocks
+; CHECK-NEXT:         .word   ("$tryMap$?func@@YAHXZ")@IMGREL // TryBlockMap
+; CHECK-NEXT:         .word   4                       // IPMapEntries
+; CHECK-NEXT:         .word   ("$ip2state$?func@@YAHXZ")@IMGREL // IPToStateXData
+; CHECK-NEXT:         .word   -16                     // UnwindHelp
 
 ; UNWIND: Function: ?func@@YAHXZ (0x0)
 ; UNWIND: Prologue [
diff --git a/llvm/test/MC/AArch64/coff-relocations.s b/llvm/test/MC/AArch64/coff-relocations.s
index 54706fd897e6..6433109d5634 100644
--- a/llvm/test/MC/AArch64/coff-relocations.s
+++ b/llvm/test/MC/AArch64/coff-relocations.s
@@ -1,41 +1,41 @@
-; RUN: llvm-mc -triple aarch64-windows -filetype obj -o %t.obj %s
-; RUN: llvm-readobj -r %t.obj | FileCheck %s
-; RUN: llvm-objdump -d %t.obj | FileCheck %s --check-prefix=DISASM
+// RUN: llvm-mc -triple aarch64-windows -filetype obj -o %t.obj %s
+// RUN: llvm-readobj -r %t.obj | FileCheck %s
+// RUN: llvm-objdump -d %t.obj | FileCheck %s --check-prefix=DISASM
 
-; IMAGE_REL_ARM64_ADDR32
+// IMAGE_REL_ARM64_ADDR32
 .Linfo_foo:
   .asciz "foo"
   .long foo
 
-; IMAGE_REL_ARM64_ADDR32NB
+// IMAGE_REL_ARM64_ADDR32NB
 .long func@IMGREL
 
-; IMAGE_REL_ARM64_ADDR64
+// IMAGE_REL_ARM64_ADDR64
 .globl struc
 struc:
   .quad arr
 
-; IMAGE_REL_ARM64_BRANCH26
+// IMAGE_REL_ARM64_BRANCH26
 b target
 
-; IMAGE_REL_ARM64_PAGEBASE_REL21
+// IMAGE_REL_ARM64_PAGEBASE_REL21
 adrp x0, foo
 
-; IMAGE_REL_ARM64_PAGEOFFSET_12A
+// IMAGE_REL_ARM64_PAGEOFFSET_12A
 add x0, x0, :lo12:foo
 
-; IMAGE_REL_ARM64_PAGEOFFSET_12L
+// IMAGE_REL_ARM64_PAGEOFFSET_12L
 ldr x0, [x0, :lo12:foo]
 
-; IMAGE_REL_ARM64_PAGEBASE_REL21, even if the symbol offset is known
+// IMAGE_REL_ARM64_PAGEBASE_REL21, even if the symbol offset is known
 adrp x0, bar
 bar:
 
-; IMAGE_REL_ARM64_SECREL
+// IMAGE_REL_ARM64_SECREL
 .secrel32 .Linfo_bar
 .Linfo_bar:
 
-; IMAGE_REL_ARM64_SECTION
+// IMAGE_REL_ARM64_SECTION
 .secidx func
 
 .align 2
@@ -45,55 +45,55 @@ add x0, x0, :lo12:foo + 0x12345
 ldrb w0, [x0, :lo12:foo + 0x12345]
 ldr x0, [x0, :lo12:foo + 0x12348]
 
-; IMAGE_REL_ARM64_SECREL_LOW12A
+// IMAGE_REL_ARM64_SECREL_LOW12A
 add x0, x0, :secrel_lo12:foo
-; IMAGE_REL_ARM64_SECREL_HIGH12A
+// IMAGE_REL_ARM64_SECREL_HIGH12A
 add x0, x0, :secrel_hi12:foo
-; IMAGE_REL_ARM64_SECREL_LOW12L
+// IMAGE_REL_ARM64_SECREL_LOW12L
 ldr x0, [x0, :secrel_lo12:foo]
 
-; IMAGE_REL_ARM64_REL21
+// IMAGE_REL_ARM64_REL21
 adr x0, foo + 0x12345
 
-; IMAGE_REL_ARM64_BRANCH19
+// IMAGE_REL_ARM64_BRANCH19
 bne target
 
-; IMAGE_REL_ARM64_BRANCH14
+// IMAGE_REL_ARM64_BRANCH14
 tbz x0, #0, target
 
-; CHECK: Format: COFF-ARM64
-; CHECK: Arch: aarch64
-; CHECK: AddressSize: 64bit
-; CHECK: Relocations [
-; CHECK:   Section (1) .text {
-; CHECK: 0x4 IMAGE_REL_ARM64_ADDR32 foo
-; CHECK: 0x8 IMAGE_REL_ARM64_ADDR32NB func
-; CHECK: 0xC IMAGE_REL_ARM64_ADDR64 arr
-; CHECK: 0x14 IMAGE_REL_ARM64_BRANCH26 target
-; CHECK: 0x18 IMAGE_REL_ARM64_PAGEBASE_REL21 foo
-; CHECK: 0x1C IMAGE_REL_ARM64_PAGEOFFSET_12A foo
-; CHECK: 0x20 IMAGE_REL_ARM64_PAGEOFFSET_12L foo
-; CHECK: 0x24 IMAGE_REL_ARM64_PAGEBASE_REL21 bar
-; CHECK: 0x28 IMAGE_REL_ARM64_SECREL .text
-; CHECK: 0x2C IMAGE_REL_ARM64_SECTION func
-; CHECK: 0x30 IMAGE_REL_ARM64_PAGEBASE_REL21 baz
-; CHECK: 0x34 IMAGE_REL_ARM64_PAGEOFFSET_12A foo
-; CHECK: 0x38 IMAGE_REL_ARM64_PAGEOFFSET_12L foo
-; CHECK: 0x3C IMAGE_REL_ARM64_PAGEOFFSET_12L foo
-; CHECK: 0x40 IMAGE_REL_ARM64_SECREL_LOW12A foo
-; CHECK: 0x44 IMAGE_REL_ARM64_SECREL_HIGH12A foo
-; CHECK: 0x48 IMAGE_REL_ARM64_SECREL_LOW12L foo
-; CHECK: 0x4C IMAGE_REL_ARM64_REL21 foo
-; CHECK: 0x50 IMAGE_REL_ARM64_BRANCH19 target
-; CHECK: 0x54 IMAGE_REL_ARM64_BRANCH14 target
-; CHECK:   }
-; CHECK: ]
-
-; DISASM: 30:       20 1a 09 b0     adrp    x0, 0x12345000
-; DISASM: 34:       00 14 0d 91     add     x0, x0, #837
-; DISASM: 38:       00 14 4d 39     ldrb    w0, [x0, #837]
-; DISASM: 3c:       00 a4 41 f9     ldr     x0, [x0, #840]
-; DISASM: 40:       00 00 00 91     add     x0, x0, #0
-; DISASM: 44:       00 00 40 91     add     x0, x0, #0, lsl #12
-; DISASM: 48:       00 00 40 f9     ldr     x0, [x0]
-; DISASM: 4c:       20 1a 09 30     adr     x0, #74565
+// CHECK: Format: COFF-ARM64
+// CHECK: Arch: aarch64
+// CHECK: AddressSize: 64bit
+// CHECK: Relocations [
+// CHECK:   Section (1) .text {
+// CHECK: 0x4 IMAGE_REL_ARM64_ADDR32 foo
+// CHECK: 0x8 IMAGE_REL_ARM64_ADDR32NB func
+// CHECK: 0xC IMAGE_REL_ARM64_ADDR64 arr
+// CHECK: 0x14 IMAGE_REL_ARM64_BRANCH26 target
+// CHECK: 0x18 IMAGE_REL_ARM64_PAGEBASE_REL21 foo
+// CHECK: 0x1C IMAGE_REL_ARM64_PAGEOFFSET_12A foo
+// CHECK: 0x20 IMAGE_REL_ARM64_PAGEOFFSET_12L foo
+// CHECK: 0x24 IMAGE_REL_ARM64_PAGEBASE_REL21 bar
+// CHECK: 0x28 IMAGE_REL_ARM64_SECREL .text
+// CHECK: 0x2C IMAGE_REL_ARM64_SECTION func
+// CHECK: 0x30 IMAGE_REL_ARM64_PAGEBASE_REL21 baz
+// CHECK: 0x34 IMAGE_REL_ARM64_PAGEOFFSET_12A foo
+// CHECK: 0x38 IMAGE_REL_ARM64_PAGEOFFSET_12L foo
+// CHECK: 0x3C IMAGE_REL_ARM64_PAGEOFFSET_12L foo
+// CHECK: 0x40 IMAGE_REL_ARM64_SECREL_LOW12A foo
+// CHECK: 0x44 IMAGE_REL_ARM64_SECREL_HIGH12A foo
+// CHECK: 0x48 IMAGE_REL_ARM64_SECREL_LOW12L foo
+// CHECK: 0x4C IMAGE_REL_ARM64_REL21 foo
+// CHECK: 0x50 IMAGE_REL_ARM64_BRANCH19 target
+// CHECK: 0x54 IMAGE_REL_ARM64_BRANCH14 target
+// CHECK:   }
+// CHECK: ]
+
+// DISASM: 30:       20 1a 09 b0     adrp    x0, 0x12345000
+// DISASM: 34:       00 14 0d 91     add     x0, x0, #837
+// DISASM: 38:       00 14 4d 39     ldrb    w0, [x0, #837]
+// DISASM: 3c:       00 a4 41 f9     ldr     x0, [x0, #840]
+// DISASM: 40:       00 00 00 91     add     x0, x0, #0
+// DISASM: 44:       00 00 40 91     add     x0, x0, #0, lsl #12
+// DISASM: 48:       00 00 40 f9     ldr     x0, [x0]
+// DISASM: 4c:       20 1a 09 30     adr     x0, #74565
diff --git a/llvm/test/MC/AArch64/coff-gnu.s b/llvm/test/MC/AArch64/coff-separator.s
similarity index 74%
rename from llvm/test/MC/AArch64/coff-gnu.s
rename to llvm/test/MC/AArch64/coff-separator.s
index df0dc7d33cfb..7535cf0571b9 100644
--- a/llvm/test/MC/AArch64/coff-gnu.s
+++ b/llvm/test/MC/AArch64/coff-separator.s
@@ -1,5 +1,7 @@
 // RUN: llvm-mc -triple aarch64-windows-gnu -filetype obj -o %t.obj %s
 // RUN: llvm-objdump -d %t.obj | FileCheck %s
+// RUN: llvm-mc -triple aarch64-windows-msvc -filetype obj -o %t.obj %s
+// RUN: llvm-objdump -d %t.obj | FileCheck %s
 
 func:
 // Check that the nop instruction after the semicolon also is handled

From 5ae2b9726f27b571de71542ef4d59ebceee8aca7 Mon Sep 17 00:00:00 2001
From: Michael Liao <michael.hliao@gmail.com>
Date: Thu, 4 Feb 2021 11:05:35 -0500
Subject: [PATCH 104/318] Recommit of a2fdf9d4d734732a6fa9288f1ffdf12bf8618123.

- The failures are all cc1-based tests due to the missing `-aux-triple` options,
which is always prepared by the driver in CUDA/HIP compilation.
- Add extra check on the missing aux-targetinfo to prevent crashing.

[hip][cuda] Enable extended lambda support on Windows.

- On Windows, extended lambda has extra issues due to the numbering
schemes are different between the host compilation (Microsoft C++ ABI)
and the device compilation (Itanium C++ ABI. Additional device side
lambda number is required per lambda for the host compilation to
correctly mangle the device-side lambda name.
- A hybrid numbering context `MSHIPNumberingContext` is introduced to
number a lambda for both host- and device-compilations.

Reviewed By: rnk

Differential Revision: https://reviews.llvm.org/D69322

This reverts commit 4874ff02417916cc9ff994b34abcb5e563056546.

(cherry picked from commit 01bf529db2cf465b029e29e537807576bfcbc452)
---
 clang/include/clang/AST/ASTContext.h          |  3 ++
 clang/include/clang/AST/DeclCXX.h             |  6 ++++
 clang/include/clang/AST/Mangle.h              |  3 ++
 .../clang/AST/MangleNumberingContext.h        |  5 +++
 clang/include/clang/Sema/Sema.h               |  2 +-
 clang/lib/AST/ASTImporter.cpp                 |  2 ++
 clang/lib/AST/CXXABI.h                        |  5 ++-
 clang/lib/AST/DeclCXX.cpp                     | 14 ++++++++
 clang/lib/AST/ItaniumCXXABI.cpp               |  6 ++++
 clang/lib/AST/ItaniumMangle.cpp               | 16 ++++++++-
 clang/lib/AST/MicrosoftCXXABI.cpp             | 33 +++++++++++++++++--
 clang/lib/CodeGen/CGCUDANV.cpp                |  8 +++++
 clang/lib/Sema/SemaLambda.cpp                 | 10 +++---
 clang/lib/Sema/TreeTransform.h                |  7 ++--
 clang/lib/Serialization/ASTReaderDecl.cpp     |  1 +
 clang/lib/Serialization/ASTWriter.cpp         |  1 +
 clang/test/CodeGenCUDA/unnamed-types.cu       | 27 +++++++++++++--
 17 files changed, 134 insertions(+), 15 deletions(-)

diff --git a/clang/include/clang/AST/ASTContext.h b/clang/include/clang/AST/ASTContext.h
index ce47d54e44b0..ae69a68608b7 100644
--- a/clang/include/clang/AST/ASTContext.h
+++ b/clang/include/clang/AST/ASTContext.h
@@ -538,6 +538,9 @@ class ASTContext : public RefCountedBase<ASTContext> {
   /// need them (like static local vars).
   llvm::MapVector<const NamedDecl *, unsigned> MangleNumbers;
   llvm::MapVector<const VarDecl *, unsigned> StaticLocalNumbers;
+  /// Mapping the associated device lambda mangling number if present.
+  mutable llvm::DenseMap<const CXXRecordDecl *, unsigned>
+      DeviceLambdaManglingNumbers;
 
   /// Mapping that stores parameterIndex values for ParmVarDecls when
   /// that value exceeds the bitfield size of ParmVarDeclBits.ParameterIndex.
diff --git a/clang/include/clang/AST/DeclCXX.h b/clang/include/clang/AST/DeclCXX.h
index e32101bb2276..89006b1cfa7f 100644
--- a/clang/include/clang/AST/DeclCXX.h
+++ b/clang/include/clang/AST/DeclCXX.h
@@ -1735,6 +1735,12 @@ class CXXRecordDecl : public RecordDecl {
     getLambdaData().HasKnownInternalLinkage = HasKnownInternalLinkage;
   }
 
+  /// Set the device side mangling number.
+  void setDeviceLambdaManglingNumber(unsigned Num) const;
+
+  /// Retrieve the device side mangling number.
+  unsigned getDeviceLambdaManglingNumber() const;
+
   /// Returns the inheritance model used for this record.
   MSInheritanceModel getMSInheritanceModel() const;
 
diff --git a/clang/include/clang/AST/Mangle.h b/clang/include/clang/AST/Mangle.h
index 0e8d6dd53d8a..7b6495d85eb6 100644
--- a/clang/include/clang/AST/Mangle.h
+++ b/clang/include/clang/AST/Mangle.h
@@ -96,6 +96,9 @@ class MangleContext {
   virtual bool shouldMangleCXXName(const NamedDecl *D) = 0;
   virtual bool shouldMangleStringLiteral(const StringLiteral *SL) = 0;
 
+  virtual bool isDeviceMangleContext() const { return false; }
+  virtual void setDeviceMangleContext(bool) {}
+
   // FIXME: consider replacing raw_ostream & with something like SmallString &.
   void mangleName(GlobalDecl GD, raw_ostream &);
   virtual void mangleCXXName(GlobalDecl GD, raw_ostream &) = 0;
diff --git a/clang/include/clang/AST/MangleNumberingContext.h b/clang/include/clang/AST/MangleNumberingContext.h
index f1ca6a05dbaf..eb33759682d6 100644
--- a/clang/include/clang/AST/MangleNumberingContext.h
+++ b/clang/include/clang/AST/MangleNumberingContext.h
@@ -52,6 +52,11 @@ class MangleNumberingContext {
   /// this context.
   virtual unsigned getManglingNumber(const TagDecl *TD,
                                      unsigned MSLocalManglingNumber) = 0;
+
+  /// Retrieve the mangling number of a new lambda expression with the
+  /// given call operator within the device context. No device number is
+  /// assigned if there's no device numbering context is associated.
+  virtual unsigned getDeviceManglingNumber(const CXXMethodDecl *) { return 0; }
 };
 
 } // end namespace clang
diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h
index 42814f6ba8f6..2530a2776373 100644
--- a/clang/include/clang/Sema/Sema.h
+++ b/clang/include/clang/Sema/Sema.h
@@ -6558,7 +6558,7 @@ class Sema final {
   /// Number lambda for linkage purposes if necessary.
   void handleLambdaNumbering(
       CXXRecordDecl *Class, CXXMethodDecl *Method,
-      Optional<std::tuple<unsigned, bool, Decl *>> Mangling = None);
+      Optional<std::tuple<bool, unsigned, unsigned, Decl *>> Mangling = None);
 
   /// Endow the lambda scope info with the relevant properties.
   void buildLambdaScope(sema::LambdaScopeInfo *LSI,
diff --git a/clang/lib/AST/ASTImporter.cpp b/clang/lib/AST/ASTImporter.cpp
index 085c50c0667b..0d723fbbcd8c 100644
--- a/clang/lib/AST/ASTImporter.cpp
+++ b/clang/lib/AST/ASTImporter.cpp
@@ -2848,6 +2848,8 @@ ExpectedDecl ASTNodeImporter::VisitRecordDecl(RecordDecl *D) {
         return CDeclOrErr.takeError();
       D2CXX->setLambdaMangling(DCXX->getLambdaManglingNumber(), *CDeclOrErr,
                                DCXX->hasKnownLambdaInternalLinkage());
+      D2CXX->setDeviceLambdaManglingNumber(
+          DCXX->getDeviceLambdaManglingNumber());
    } else if (DCXX->isInjectedClassName()) {
       // We have to be careful to do a similar dance to the one in
       // Sema::ActOnStartCXXMemberDeclarations
diff --git a/clang/lib/AST/CXXABI.h b/clang/lib/AST/CXXABI.h
index 31cb36918726..ca9424bcb7a4 100644
--- a/clang/lib/AST/CXXABI.h
+++ b/clang/lib/AST/CXXABI.h
@@ -22,8 +22,9 @@ class ASTContext;
 class CXXConstructorDecl;
 class DeclaratorDecl;
 class Expr;
-class MemberPointerType;
+class MangleContext;
 class MangleNumberingContext;
+class MemberPointerType;
 
 /// Implements C++ ABI-specific semantic analysis functions.
 class CXXABI {
@@ -75,6 +76,8 @@ class CXXABI {
 /// Creates an instance of a C++ ABI class.
 CXXABI *CreateItaniumCXXABI(ASTContext &Ctx);
 CXXABI *CreateMicrosoftCXXABI(ASTContext &Ctx);
+std::unique_ptr<MangleNumberingContext>
+createItaniumNumberingContext(MangleContext *);
 }
 
 #endif
diff --git a/clang/lib/AST/DeclCXX.cpp b/clang/lib/AST/DeclCXX.cpp
index 0368ada0b81c..0375f9b4432e 100644
--- a/clang/lib/AST/DeclCXX.cpp
+++ b/clang/lib/AST/DeclCXX.cpp
@@ -1593,6 +1593,20 @@ Decl *CXXRecordDecl::getLambdaContextDecl() const {
   return getLambdaData().ContextDecl.get(Source);
 }
 
+void CXXRecordDecl::setDeviceLambdaManglingNumber(unsigned Num) const {
+  assert(isLambda() && "Not a lambda closure type!");
+  if (Num)
+    getASTContext().DeviceLambdaManglingNumbers[this] = Num;
+}
+
+unsigned CXXRecordDecl::getDeviceLambdaManglingNumber() const {
+  assert(isLambda() && "Not a lambda closure type!");
+  auto I = getASTContext().DeviceLambdaManglingNumbers.find(this);
+  if (I != getASTContext().DeviceLambdaManglingNumbers.end())
+    return I->second;
+  return 0;
+}
+
 static CanQualType GetConversionType(ASTContext &Context, NamedDecl *Conv) {
   QualType T =
       cast<CXXConversionDecl>(Conv->getUnderlyingDecl()->getAsFunction())
diff --git a/clang/lib/AST/ItaniumCXXABI.cpp b/clang/lib/AST/ItaniumCXXABI.cpp
index 069add8464ae..be10258a2d77 100644
--- a/clang/lib/AST/ItaniumCXXABI.cpp
+++ b/clang/lib/AST/ItaniumCXXABI.cpp
@@ -258,3 +258,9 @@ class ItaniumCXXABI : public CXXABI {
 CXXABI *clang::CreateItaniumCXXABI(ASTContext &Ctx) {
   return new ItaniumCXXABI(Ctx);
 }
+
+std::unique_ptr<MangleNumberingContext>
+clang::createItaniumNumberingContext(MangleContext *Mangler) {
+  return std::make_unique<ItaniumNumberingContext>(
+      cast<ItaniumMangleContext>(Mangler));
+}
diff --git a/clang/lib/AST/ItaniumMangle.cpp b/clang/lib/AST/ItaniumMangle.cpp
index 4420f6a2c1c3..5cad84a96845 100644
--- a/clang/lib/AST/ItaniumMangle.cpp
+++ b/clang/lib/AST/ItaniumMangle.cpp
@@ -125,6 +125,8 @@ class ItaniumMangleContextImpl : public ItaniumMangleContext {
   llvm::DenseMap<DiscriminatorKeyTy, unsigned> Discriminator;
   llvm::DenseMap<const NamedDecl*, unsigned> Uniquifier;
 
+  bool IsDevCtx = false;
+
 public:
   explicit ItaniumMangleContextImpl(ASTContext &Context,
                                     DiagnosticsEngine &Diags)
@@ -137,6 +139,10 @@ class ItaniumMangleContextImpl : public ItaniumMangleContext {
   bool shouldMangleStringLiteral(const StringLiteral *) override {
     return false;
   }
+
+  bool isDeviceMangleContext() const override { return IsDevCtx; }
+  void setDeviceMangleContext(bool IsDev) override { IsDevCtx = IsDev; }
+
   void mangleCXXName(GlobalDecl GD, raw_ostream &) override;
   void mangleThunk(const CXXMethodDecl *MD, const ThunkInfo &Thunk,
                    raw_ostream &) override;
@@ -1846,7 +1852,15 @@ void CXXNameMangler::mangleLambda(const CXXRecordDecl *Lambda) {
   // (in lexical order) with that same <lambda-sig> and context.
   //
   // The AST keeps track of the number for us.
-  unsigned Number = Lambda->getLambdaManglingNumber();
+  //
+  // In CUDA/HIP, to ensure the consistent lamba numbering between the device-
+  // and host-side compilations, an extra device mangle context may be created
+  // if the host-side CXX ABI has different numbering for lambda. In such case,
+  // if the mangle context is that device-side one, use the device-side lambda
+  // mangling number for this lambda.
+  unsigned Number = Context.isDeviceMangleContext()
+                        ? Lambda->getDeviceLambdaManglingNumber()
+                        : Lambda->getLambdaManglingNumber();
   assert(Number > 0 && "Lambda should be mangled as an unnamed class");
   if (Number > 1)
     mangleNumber(Number - 2);
diff --git a/clang/lib/AST/MicrosoftCXXABI.cpp b/clang/lib/AST/MicrosoftCXXABI.cpp
index f9f9fe985b6f..166aa3b3bd60 100644
--- a/clang/lib/AST/MicrosoftCXXABI.cpp
+++ b/clang/lib/AST/MicrosoftCXXABI.cpp
@@ -16,6 +16,7 @@
 #include "clang/AST/Attr.h"
 #include "clang/AST/CXXInheritance.h"
 #include "clang/AST/DeclCXX.h"
+#include "clang/AST/Mangle.h"
 #include "clang/AST/MangleNumberingContext.h"
 #include "clang/AST/RecordLayout.h"
 #include "clang/AST/Type.h"
@@ -64,6 +65,19 @@ class MicrosoftNumberingContext : public MangleNumberingContext {
   }
 };
 
+class MSHIPNumberingContext : public MicrosoftNumberingContext {
+  std::unique_ptr<MangleNumberingContext> DeviceCtx;
+
+public:
+  MSHIPNumberingContext(MangleContext *DeviceMangler) {
+    DeviceCtx = createItaniumNumberingContext(DeviceMangler);
+  }
+
+  unsigned getDeviceManglingNumber(const CXXMethodDecl *CallOperator) override {
+    return DeviceCtx->getManglingNumber(CallOperator);
+  }
+};
+
 class MicrosoftCXXABI : public CXXABI {
   ASTContext &Context;
   llvm::SmallDenseMap<CXXRecordDecl *, CXXConstructorDecl *> RecordToCopyCtor;
@@ -73,8 +87,20 @@ class MicrosoftCXXABI : public CXXABI {
   llvm::SmallDenseMap<TagDecl *, TypedefNameDecl *>
       UnnamedTagDeclToTypedefNameDecl;
 
+  // MangleContext for device numbering context, which is based on Itanium C++
+  // ABI.
+  std::unique_ptr<MangleContext> DeviceMangler;
+
 public:
-  MicrosoftCXXABI(ASTContext &Ctx) : Context(Ctx) { }
+  MicrosoftCXXABI(ASTContext &Ctx) : Context(Ctx) {
+    if (Context.getLangOpts().CUDA && Context.getAuxTargetInfo()) {
+      assert(Context.getTargetInfo().getCXXABI().isMicrosoft() &&
+             Context.getAuxTargetInfo()->getCXXABI().isItaniumFamily() &&
+             "Unexpected combination of C++ ABIs.");
+      DeviceMangler.reset(
+          Context.createMangleContext(Context.getAuxTargetInfo()));
+    }
+  }
 
   MemberPointerInfo
   getMemberPointerInfo(const MemberPointerType *MPT) const override;
@@ -133,6 +159,10 @@ class MicrosoftCXXABI : public CXXABI {
 
   std::unique_ptr<MangleNumberingContext>
   createMangleNumberingContext() const override {
+    if (Context.getLangOpts().CUDA && Context.getAuxTargetInfo()) {
+      assert(DeviceMangler && "Missing device mangler");
+      return std::make_unique<MSHIPNumberingContext>(DeviceMangler.get());
+    }
     return std::make_unique<MicrosoftNumberingContext>();
   }
 };
@@ -266,4 +296,3 @@ CXXABI::MemberPointerInfo MicrosoftCXXABI::getMemberPointerInfo(
 CXXABI *clang::CreateMicrosoftCXXABI(ASTContext &Ctx) {
   return new MicrosoftCXXABI(Ctx);
 }
-
diff --git a/clang/lib/CodeGen/CGCUDANV.cpp b/clang/lib/CodeGen/CGCUDANV.cpp
index 33a2d6f4483e..e03631a7243a 100644
--- a/clang/lib/CodeGen/CGCUDANV.cpp
+++ b/clang/lib/CodeGen/CGCUDANV.cpp
@@ -184,6 +184,14 @@ CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM)
   CharPtrTy = llvm::PointerType::getUnqual(Types.ConvertType(Ctx.CharTy));
   VoidPtrTy = cast<llvm::PointerType>(Types.ConvertType(Ctx.VoidPtrTy));
   VoidPtrPtrTy = VoidPtrTy->getPointerTo();
+  if (CGM.getContext().getAuxTargetInfo()) {
+    // If the host and device have different C++ ABIs, mark it as the device
+    // mangle context so that the mangling needs to retrieve the additonal
+    // device lambda mangling number instead of the regular host one.
+    DeviceMC->setDeviceMangleContext(
+        CGM.getContext().getTargetInfo().getCXXABI().isMicrosoft() &&
+        CGM.getContext().getAuxTargetInfo()->getCXXABI().isItaniumFamily());
+  }
 }
 
 llvm::FunctionCallee CGNVCUDARuntime::getSetupArgumentFn() const {
diff --git a/clang/lib/Sema/SemaLambda.cpp b/clang/lib/Sema/SemaLambda.cpp
index af61c82c2002..c1c6a4bf5c68 100644
--- a/clang/lib/Sema/SemaLambda.cpp
+++ b/clang/lib/Sema/SemaLambda.cpp
@@ -432,15 +432,16 @@ CXXMethodDecl *Sema::startLambdaDefinition(CXXRecordDecl *Class,
 
 void Sema::handleLambdaNumbering(
     CXXRecordDecl *Class, CXXMethodDecl *Method,
-    Optional<std::tuple<unsigned, bool, Decl *>> Mangling) {
+    Optional<std::tuple<bool, unsigned, unsigned, Decl *>> Mangling) {
   if (Mangling) {
-    unsigned ManglingNumber;
     bool HasKnownInternalLinkage;
+    unsigned ManglingNumber, DeviceManglingNumber;
     Decl *ManglingContextDecl;
-    std::tie(ManglingNumber, HasKnownInternalLinkage, ManglingContextDecl) =
-        Mangling.getValue();
+    std::tie(HasKnownInternalLinkage, ManglingNumber, DeviceManglingNumber,
+             ManglingContextDecl) = Mangling.getValue();
     Class->setLambdaMangling(ManglingNumber, ManglingContextDecl,
                              HasKnownInternalLinkage);
+    Class->setDeviceLambdaManglingNumber(DeviceManglingNumber);
     return;
   }
 
@@ -476,6 +477,7 @@ void Sema::handleLambdaNumbering(
     unsigned ManglingNumber = MCtx->getManglingNumber(Method);
     Class->setLambdaMangling(ManglingNumber, ManglingContextDecl,
                              HasKnownInternalLinkage);
+    Class->setDeviceLambdaManglingNumber(MCtx->getDeviceManglingNumber(Method));
   }
 }
 
diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h
index 0a596e50658b..3c68f9458e58 100644
--- a/clang/lib/Sema/TreeTransform.h
+++ b/clang/lib/Sema/TreeTransform.h
@@ -12504,10 +12504,11 @@ TreeTransform<Derived>::TransformLambdaExpr(LambdaExpr *E) {
                                         E->getCaptureDefault());
   getDerived().transformedLocalDecl(OldClass, {Class});
 
-  Optional<std::tuple<unsigned, bool, Decl *>> Mangling;
+  Optional<std::tuple<bool, unsigned, unsigned, Decl *>> Mangling;
   if (getDerived().ReplacingOriginal())
-    Mangling = std::make_tuple(OldClass->getLambdaManglingNumber(),
-                               OldClass->hasKnownLambdaInternalLinkage(),
+    Mangling = std::make_tuple(OldClass->hasKnownLambdaInternalLinkage(),
+                               OldClass->getLambdaManglingNumber(),
+                               OldClass->getDeviceLambdaManglingNumber(),
                                OldClass->getLambdaContextDecl());
 
   // Build the call operator.
diff --git a/clang/lib/Serialization/ASTReaderDecl.cpp b/clang/lib/Serialization/ASTReaderDecl.cpp
index 6bfb9bd783b5..18ab4666a7d8 100644
--- a/clang/lib/Serialization/ASTReaderDecl.cpp
+++ b/clang/lib/Serialization/ASTReaderDecl.cpp
@@ -1748,6 +1748,7 @@ void ASTDeclReader::ReadCXXDefinitionData(
     Lambda.NumExplicitCaptures = Record.readInt();
     Lambda.HasKnownInternalLinkage = Record.readInt();
     Lambda.ManglingNumber = Record.readInt();
+    D->setDeviceLambdaManglingNumber(Record.readInt());
     Lambda.ContextDecl = readDeclID();
     Lambda.Captures = (Capture *)Reader.getContext().Allocate(
         sizeof(Capture) * Lambda.NumCaptures);
diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp
index 6bfa7b0e7d6d..40900af6f9e0 100644
--- a/clang/lib/Serialization/ASTWriter.cpp
+++ b/clang/lib/Serialization/ASTWriter.cpp
@@ -5667,6 +5667,7 @@ void ASTRecordWriter::AddCXXDefinitionData(const CXXRecordDecl *D) {
     Record->push_back(Lambda.NumExplicitCaptures);
     Record->push_back(Lambda.HasKnownInternalLinkage);
     Record->push_back(Lambda.ManglingNumber);
+    Record->push_back(D->getDeviceLambdaManglingNumber());
     AddDeclRef(D->getLambdaContextDecl());
     AddTypeSourceInfo(Lambda.MethodTyInfo);
     for (unsigned I = 0, N = Lambda.NumCaptures; I != N; ++I) {
diff --git a/clang/test/CodeGenCUDA/unnamed-types.cu b/clang/test/CodeGenCUDA/unnamed-types.cu
index 59bfa6d7a18f..f598117d969d 100644
--- a/clang/test/CodeGenCUDA/unnamed-types.cu
+++ b/clang/test/CodeGenCUDA/unnamed-types.cu
@@ -1,12 +1,17 @@
 // RUN: %clang_cc1 -std=c++11 -x hip -triple x86_64-linux-gnu -aux-triple amdgcn-amd-amdhsa -emit-llvm %s -o - | FileCheck %s --check-prefix=HOST
+// RUN: %clang_cc1 -std=c++11 -x hip -triple x86_64-pc-windows-msvc -aux-triple amdgcn-amd-amdhsa -emit-llvm %s -o - | FileCheck %s --check-prefix=MSVC
 // RUN: %clang_cc1 -std=c++11 -x hip -triple amdgcn-amd-amdhsa -fcuda-is-device -emit-llvm %s -o - | FileCheck %s --check-prefix=DEVICE
 
 #include "Inputs/cuda.h"
 
 // HOST: @0 = private unnamed_addr constant [43 x i8] c"_Z2k0IZZ2f1PfENKUlS0_E_clES0_EUlfE_EvS0_T_\00", align 1
+// HOST: @1 = private unnamed_addr constant [60 x i8] c"_Z2k1IZ2f1PfEUlfE_Z2f1S0_EUlffE_Z2f1S0_EUlfE0_EvS0_T_T0_T1_\00", align 1
+// Check that, on MSVC, the same device kernel mangling name is generated.
+// MSVC: @0 = private unnamed_addr constant [43 x i8] c"_Z2k0IZZ2f1PfENKUlS0_E_clES0_EUlfE_EvS0_T_\00", align 1
+// MSVC: @1 = private unnamed_addr constant [60 x i8] c"_Z2k1IZ2f1PfEUlfE_Z2f1S0_EUlffE_Z2f1S0_EUlfE0_EvS0_T_T0_T1_\00", align 1
 
 __device__ float d0(float x) {
-  return [](float x) { return x + 2.f; }(x);
+  return [](float x) { return x + 1.f; }(x);
 }
 
 __device__ float d1(float x) {
@@ -14,11 +19,21 @@ __device__ float d1(float x) {
 }
 
 // DEVICE: amdgpu_kernel void @_Z2k0IZZ2f1PfENKUlS0_E_clES0_EUlfE_EvS0_T_(
+// DEVICE: define internal float @_ZZZ2f1PfENKUlS_E_clES_ENKUlfE_clEf(
 template <typename F>
 __global__ void k0(float *p, F f) {
   p[0] = f(p[0]) + d0(p[1]) + d1(p[2]);
 }
 
+// DEVICE: amdgpu_kernel void @_Z2k1IZ2f1PfEUlfE_Z2f1S0_EUlffE_Z2f1S0_EUlfE0_EvS0_T_T0_T1_(
+// DEVICE: define internal float @_ZZ2f1PfENKUlfE_clEf(
+// DEVICE: define internal float @_ZZ2f1PfENKUlffE_clEff(
+// DEVICE: define internal float @_ZZ2f1PfENKUlfE0_clEf(
+template <typename F0, typename F1, typename F2>
+__global__ void k1(float *p, F0 f0, F1 f1, F2 f2) {
+  p[0] = f0(p[0]) + f1(p[1], p[2]) + f2(p[3]);
+}
+
 void f0(float *p) {
   [](float *p) {
     *p = 1.f;
@@ -29,11 +44,17 @@ void f0(float *p) {
 // linkages are still required to keep the original `internal` linkage.
 
 // HOST: define internal void @_ZZ2f1PfENKUlS_E_clES_(
-// DEVICE: define internal float @_ZZZ2f1PfENKUlS_E_clES_ENKUlfE_clEf(
 void f1(float *p) {
   [](float *p) {
-    k0<<<1,1>>>(p, [] __device__ (float x) { return x + 1.f; });
+    k0<<<1,1>>>(p, [] __device__ (float x) { return x + 3.f; });
   }(p);
+  k1<<<1,1>>>(p,
+              [] __device__ (float x) { return x + 4.f; },
+              [] __device__ (float x, float y) { return x * y; },
+              [] __device__ (float x) { return x + 5.f; });
 }
 // HOST: @__hip_register_globals
 // HOST: __hipRegisterFunction{{.*}}@_Z17__device_stub__k0IZZ2f1PfENKUlS0_E_clES0_EUlfE_EvS0_T_{{.*}}@0
+// HOST: __hipRegisterFunction{{.*}}@_Z17__device_stub__k1IZ2f1PfEUlfE_Z2f1S0_EUlffE_Z2f1S0_EUlfE0_EvS0_T_T0_T1_{{.*}}@1
+// MSVC: __hipRegisterFunction{{.*}}@"??$k0@V<lambda_1>@?0???R1?0??f1@@YAXPEAM@Z@QEBA@0@Z@@@YAXPEAMV<lambda_1>@?0???R0?0??f1@@YAX0@Z@QEBA@0@Z@@Z{{.*}}@0
+// MSVC: __hipRegisterFunction{{.*}}@"??$k1@V<lambda_2>@?0??f1@@YAXPEAM@Z@V<lambda_3>@?0??2@YAX0@Z@V<lambda_4>@?0??2@YAX0@Z@@@YAXPEAMV<lambda_2>@?0??f1@@YAX0@Z@V<lambda_3>@?0??1@YAX0@Z@V<lambda_4>@?0??1@YAX0@Z@@Z{{.*}}@1

From 3979099a9b71dbf5e4d67fb3a5ae50c7afe707fa Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@sifive.com>
Date: Tue, 9 Feb 2021 09:28:06 -0800
Subject: [PATCH 105/318] [RISCV] Remove SRO* and SLO* instructions from
 bitmanip.

As of the current draft these are no longer being considered
for the bitmanip spec. It wasn't clear what sub extension they
belonged in in the 0.93 spec.

So remove them. They can always be added back if something changes.

Reviewed By: frasercrmck

Differential Revision: https://reviews.llvm.org/D96157

(cherry picked from commit fd5adae02cafe388673d3b3f92ef791af3c73cfe)
---
 llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp |  87 ----
 llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h   |   3 -
 llvm/lib/Target/RISCV/RISCVInstrInfoB.td    |  67 ---
 llvm/test/CodeGen/RISCV/rv32Zbp.ll          | 504 --------------------
 llvm/test/CodeGen/RISCV/rv64Zbp.ll          | 306 ------------
 llvm/test/MC/RISCV/rv32zbp-invalid.s        |  18 -
 llvm/test/MC/RISCV/rv32zbp-valid.s          |  12 -
 llvm/test/MC/RISCV/rv64zbp-invalid.s        |  14 -
 llvm/test/MC/RISCV/rv64zbp-valid.s          |  12 -
 9 files changed, 1023 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
index 2121cc38f661..43bf16c53a62 100644
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
@@ -826,93 +826,6 @@ bool RISCVDAGToDAGISel::MatchSRLIW(SDNode *N) const {
   return (Mask | maskTrailingOnes<uint64_t>(ShAmt)) == 0xffffffff;
 }
 
-// Check that it is a SLOI (Shift Left Ones Immediate). A PatFrag has already
-// determined it has the right structure:
-//
-//  (OR (SHL RS1, VC2), VC1)
-//
-// Check that VC1, the mask used to fill with ones, is compatible
-// with VC2, the shamt:
-//
-//  VC1 == maskTrailingOnes(VC2)
-//
-bool RISCVDAGToDAGISel::MatchSLOI(SDNode *N) const {
-  assert(N->getOpcode() == ISD::OR);
-  assert(N->getOperand(0).getOpcode() == ISD::SHL);
-  assert(isa<ConstantSDNode>(N->getOperand(1)));
-  assert(isa<ConstantSDNode>(N->getOperand(0).getOperand(1)));
-
-  SDValue Shl = N->getOperand(0);
-  if (Subtarget->is64Bit()) {
-    uint64_t VC1 = N->getConstantOperandVal(1);
-    uint64_t VC2 = Shl.getConstantOperandVal(1);
-    return VC1 == maskTrailingOnes<uint64_t>(VC2);
-  }
-
-  uint32_t VC1 = N->getConstantOperandVal(1);
-  uint32_t VC2 = Shl.getConstantOperandVal(1);
-  return VC1 == maskTrailingOnes<uint32_t>(VC2);
-}
-
-// Check that it is a SROI (Shift Right Ones Immediate). A PatFrag has already
-// determined it has the right structure:
-//
-//  (OR (SRL RS1, VC2), VC1)
-//
-// Check that VC1, the mask used to fill with ones, is compatible
-// with VC2, the shamt:
-//
-//  VC1 == maskLeadingOnes(VC2)
-//
-bool RISCVDAGToDAGISel::MatchSROI(SDNode *N) const {
-  assert(N->getOpcode() == ISD::OR);
-  assert(N->getOperand(0).getOpcode() == ISD::SRL);
-  assert(isa<ConstantSDNode>(N->getOperand(1)));
-  assert(isa<ConstantSDNode>(N->getOperand(0).getOperand(1)));
-
-  SDValue Srl = N->getOperand(0);
-  if (Subtarget->is64Bit()) {
-    uint64_t VC1 = N->getConstantOperandVal(1);
-    uint64_t VC2 = Srl.getConstantOperandVal(1);
-    return VC1 == maskLeadingOnes<uint64_t>(VC2);
-  }
-
-  uint32_t VC1 = N->getConstantOperandVal(1);
-  uint32_t VC2 = Srl.getConstantOperandVal(1);
-  return VC1 == maskLeadingOnes<uint32_t>(VC2);
-}
-
-// Check that it is a SROIW (Shift Right Ones Immediate i32 on RV64). A PatFrag
-// has already determined it has the right structure:
-//
-//  (OR (SRL RS1, VC2), VC1)
-//
-// and then we check that VC1, the mask used to fill with ones, is compatible
-// with VC2, the shamt:
-//
-//  VC2 < 32
-//  VC1 == maskTrailingZeros<uint64_t>(32 - VC2)
-//
-bool RISCVDAGToDAGISel::MatchSROIW(SDNode *N) const {
-  assert(N->getOpcode() == ISD::OR);
-  assert(N->getOperand(0).getOpcode() == ISD::SRL);
-  assert(isa<ConstantSDNode>(N->getOperand(1)));
-  assert(isa<ConstantSDNode>(N->getOperand(0).getOperand(1)));
-
-  // The IsRV64 predicate is checked after PatFrag predicates so we can get
-  // here even on RV32.
-  if (!Subtarget->is64Bit())
-    return false;
-
-  SDValue Srl = N->getOperand(0);
-  uint64_t VC1 = N->getConstantOperandVal(1);
-  uint64_t VC2 = Srl.getConstantOperandVal(1);
-
-  // Immediate range should be enforced by uimm5 predicate.
-  assert(VC2 < 32 && "Unexpected immediate");
-  return VC1 == maskTrailingZeros<uint64_t>(32 - VC2);
-}
-
 // Check that it is a SLLIUW (Shift Logical Left Immediate Unsigned i32
 // on RV64).
 // SLLIUW is the same as SLLI except for the fact that it clears the bits
diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h
index 0c58c5379e13..6099586d049d 100644
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h
@@ -46,9 +46,6 @@ class RISCVDAGToDAGISel : public SelectionDAGISel {
   bool SelectAddrFI(SDValue Addr, SDValue &Base);
 
   bool MatchSRLIW(SDNode *N) const;
-  bool MatchSLOI(SDNode *N) const;
-  bool MatchSROI(SDNode *N) const;
-  bool MatchSROIW(SDNode *N) const;
   bool MatchSLLIUW(SDNode *N) const;
 
   bool selectVLOp(SDValue N, SDValue &VL);
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoB.td b/llvm/lib/Target/RISCV/RISCVInstrInfoB.td
index 1bc288b5177c..7888ac7bac8e 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoB.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoB.td
@@ -45,25 +45,6 @@ def shfl_uimm : Operand<XLenVT>, ImmLeaf<XLenVT, [{
   }];
 }
 
-
-// Check that it is a SLOI (Shift Left Ones Immediate).
-def SLOIPat : PatFrag<(ops node:$A, node:$B),
-                      (or (shl node:$A, node:$B), imm), [{
-  return MatchSLOI(N);
-}]>;
-
-// Check that it is a SROI (Shift Right Ones Immediate).
-def SROIPat : PatFrag<(ops node:$A, node:$B),
-                      (or (srl node:$A, node:$B), imm), [{
-  return MatchSROI(N);
-}]>;
-
-// Check that it is a SROIW (Shift Right Ones Immediate i32 on RV64).
-def SROIWPat : PatFrag<(ops node:$A, node:$B),
-                       (or (srl node:$A, node:$B), imm), [{
-  return MatchSROIW(N);
-}]>;
-
 // Checks if this mask has a single 0 bit and cannot be used with ANDI.
 def BCLRMask : ImmLeaf<XLenVT, [{
   if (Subtarget->is64Bit())
@@ -210,11 +191,6 @@ def SH2ADD : ALU_rr<0b0010000, 0b100, "sh2add">, Sched<[]>;
 def SH3ADD : ALU_rr<0b0010000, 0b110, "sh3add">, Sched<[]>;
 } // Predicates = [HasStdExtZba]
 
-let Predicates = [HasStdExtZbp] in {
-def SLO  : ALU_rr<0b0010000, 0b001, "slo">, Sched<[]>;
-def SRO  : ALU_rr<0b0010000, 0b101, "sro">, Sched<[]>;
-} // Predicates = [HasStdExtZbp]
-
 let Predicates = [HasStdExtZbbOrZbp] in {
 def ROL   : ALU_rr<0b0110000, 0b001, "rol">, Sched<[]>;
 def ROR   : ALU_rr<0b0110000, 0b101, "ror">, Sched<[]>;
@@ -238,11 +214,6 @@ def XPERMB : ALU_rr<0b0010100, 0b100, "xperm.b">, Sched<[]>;
 def XPERMH : ALU_rr<0b0010100, 0b110, "xperm.h">, Sched<[]>;
 } // Predicates = [HasStdExtZbp]
 
-let Predicates = [HasStdExtZbp] in {
-def SLOI : RVBShift_ri<0b00100, 0b001, OPC_OP_IMM, "sloi">, Sched<[]>;
-def SROI : RVBShift_ri<0b00100, 0b101, OPC_OP_IMM, "sroi">, Sched<[]>;
-} // Predicates = [HasStdExtZbp]
-
 let Predicates = [HasStdExtZbbOrZbp] in
 def RORI  : RVBShift_ri<0b01100, 0b101, OPC_OP_IMM, "rori">, Sched<[]>;
 
@@ -369,11 +340,6 @@ def SH2ADDUW : ALUW_rr<0b0010000, 0b100, "sh2add.uw">, Sched<[]>;
 def SH3ADDUW : ALUW_rr<0b0010000, 0b110, "sh3add.uw">, Sched<[]>;
 } // Predicates = [HasStdExtZbb, IsRV64]
 
-let Predicates = [HasStdExtZbp, IsRV64] in {
-def SLOW   : ALUW_rr<0b0010000, 0b001, "slow">, Sched<[]>;
-def SROW   : ALUW_rr<0b0010000, 0b101, "srow">, Sched<[]>;
-} // Predicates = [HasStdExtZbp, IsRV64]
-
 let Predicates = [HasStdExtZbbOrZbp, IsRV64] in {
 def ROLW  : ALUW_rr<0b0110000, 0b001, "rolw">, Sched<[]>;
 def RORW  : ALUW_rr<0b0110000, 0b101, "rorw">, Sched<[]>;
@@ -395,11 +361,6 @@ let Predicates = [HasStdExtZbp, IsRV64] in {
 def XPERMW : ALU_rr<0b0010100, 0b000, "xperm.w">, Sched<[]>;
 } // Predicates = [HasStdExtZbp, IsRV64]
 
-let Predicates = [HasStdExtZbp, IsRV64] in {
-def SLOIW  : RVBShiftW_ri<0b0010000, 0b001, OPC_OP_IMM_32, "sloiw">, Sched<[]>;
-def SROIW  : RVBShiftW_ri<0b0010000, 0b101, OPC_OP_IMM_32, "sroiw">, Sched<[]>;
-} // Predicates = [HasStdExtZbp, IsRV64]
-
 let Predicates = [HasStdExtZbbOrZbp, IsRV64] in
 def RORIW : RVBShiftW_ri<0b0110000, 0b101, OPC_OP_IMM_32, "roriw">, Sched<[]>;
 
@@ -673,13 +634,6 @@ def : Pat<(or  GPR:$rs1, (not GPR:$rs2)), (ORN  GPR:$rs1, GPR:$rs2)>;
 def : Pat<(xor GPR:$rs1, (not GPR:$rs2)), (XNOR GPR:$rs1, GPR:$rs2)>;
 } // Predicates = [HasStdExtZbbOrZbp]
 
-let Predicates = [HasStdExtZbp] in {
-def : Pat<(not (shiftop<shl> (not GPR:$rs1), GPR:$rs2)),
-          (SLO GPR:$rs1, GPR:$rs2)>;
-def : Pat<(not (shiftop<srl> (not GPR:$rs1), GPR:$rs2)),
-          (SRO GPR:$rs1, GPR:$rs2)>;
-} // Predicates = [HasStdExtZbp]
-
 let Predicates = [HasStdExtZbbOrZbp] in {
 def : Pat<(rotl GPR:$rs1, GPR:$rs2), (ROL GPR:$rs1, GPR:$rs2)>;
 def : Pat<(rotr GPR:$rs1, GPR:$rs2), (ROR GPR:$rs1, GPR:$rs2)>;
@@ -710,13 +664,6 @@ def : Pat<(and (srl GPR:$rs1, uimmlog2xlen:$shamt), (XLenVT 1)),
           (BEXTI GPR:$rs1, uimmlog2xlen:$shamt)>;
 }
 
-let Predicates = [HasStdExtZbp] in {
-def : Pat<(SLOIPat GPR:$rs1, uimmlog2xlen:$shamt),
-          (SLOI GPR:$rs1, uimmlog2xlen:$shamt)>;
-def : Pat<(SROIPat GPR:$rs1, uimmlog2xlen:$shamt),
-          (SROI GPR:$rs1, uimmlog2xlen:$shamt)>;
-} // Predicates = [HasStdExtZbp]
-
 // There's no encoding for roli in the the 'B' extension as it can be
 // implemented with rori by negating the immediate.
 let Predicates = [HasStdExtZbbOrZbp] in {
@@ -936,13 +883,6 @@ def : Pat<(add (SLLIUWPat GPR:$rs1, (XLenVT 3)), GPR:$rs2),
           (SH3ADDUW GPR:$rs1, GPR:$rs2)>;
 } // Predicates = [HasStdExtZba, IsRV64]
 
-let Predicates = [HasStdExtZbp, IsRV64] in {
-def : Pat<(not (shiftopw<riscv_sllw> (not GPR:$rs1), GPR:$rs2)),
-          (SLOW GPR:$rs1, GPR:$rs2)>;
-def : Pat<(not (shiftopw<riscv_srlw> (not GPR:$rs1), GPR:$rs2)),
-          (SROW GPR:$rs1, GPR:$rs2)>;
-} // Predicates = [HasStdExtZbp, IsRV64]
-
 let Predicates = [HasStdExtZbbOrZbp, IsRV64] in {
 def : Pat<(riscv_rolw GPR:$rs1, GPR:$rs2),
           (ROLW GPR:$rs1, GPR:$rs2)>;
@@ -982,13 +922,6 @@ def : Pat<(xor (assertsexti32 GPR:$rs1), BSETINVWMask:$mask),
 
 } // Predicates = [HasStdExtZbs, IsRV64]
 
-let Predicates = [HasStdExtZbp, IsRV64] in {
-def : Pat<(sext_inreg (SLOIPat GPR:$rs1, uimm5:$shamt), i32),
-          (SLOIW GPR:$rs1, uimm5:$shamt)>;
-def : Pat<(SROIWPat GPR:$rs1, uimm5:$shamt),
-          (SROIW GPR:$rs1, uimm5:$shamt)>;
-} // Predicates = [HasStdExtZbp, IsRV64]
-
 let Predicates = [HasStdExtZbp, IsRV64] in {
 def : Pat<(riscv_rorw (riscv_greviw GPR:$rs1, 24), (i64 16)), (GREVIW GPR:$rs1, 8)>;
 def : Pat<(riscv_rolw (riscv_greviw GPR:$rs1, 24), (i64 16)), (GREVIW GPR:$rs1, 8)>;
diff --git a/llvm/test/CodeGen/RISCV/rv32Zbp.ll b/llvm/test/CodeGen/RISCV/rv32Zbp.ll
index de315dfb2d5a..ec1720337dda 100644
--- a/llvm/test/CodeGen/RISCV/rv32Zbp.ll
+++ b/llvm/test/CodeGen/RISCV/rv32Zbp.ll
@@ -6,510 +6,6 @@
 ; RUN: llc -mtriple=riscv32 -mattr=+experimental-zbp -verify-machineinstrs < %s \
 ; RUN:   | FileCheck %s -check-prefix=RV32IBP
 
-define i32 @slo_i32(i32 %a, i32 %b) nounwind {
-; RV32I-LABEL: slo_i32:
-; RV32I:       # %bb.0:
-; RV32I-NEXT:    not a0, a0
-; RV32I-NEXT:    sll a0, a0, a1
-; RV32I-NEXT:    not a0, a0
-; RV32I-NEXT:    ret
-;
-; RV32IB-LABEL: slo_i32:
-; RV32IB:       # %bb.0:
-; RV32IB-NEXT:    slo a0, a0, a1
-; RV32IB-NEXT:    ret
-;
-; RV32IBP-LABEL: slo_i32:
-; RV32IBP:       # %bb.0:
-; RV32IBP-NEXT:    slo a0, a0, a1
-; RV32IBP-NEXT:    ret
-  %neg = xor i32 %a, -1
-  %shl = shl i32 %neg, %b
-  %neg1 = xor i32 %shl, -1
-  ret i32 %neg1
-}
-
-define i32 @slo_i32_mask(i32 %a, i32 %b) nounwind {
-; RV32I-LABEL: slo_i32_mask:
-; RV32I:       # %bb.0:
-; RV32I-NEXT:    not a0, a0
-; RV32I-NEXT:    sll a0, a0, a1
-; RV32I-NEXT:    not a0, a0
-; RV32I-NEXT:    ret
-;
-; RV32IB-LABEL: slo_i32_mask:
-; RV32IB:       # %bb.0:
-; RV32IB-NEXT:    slo a0, a0, a1
-; RV32IB-NEXT:    ret
-;
-; RV32IBP-LABEL: slo_i32_mask:
-; RV32IBP:       # %bb.0:
-; RV32IBP-NEXT:    slo a0, a0, a1
-; RV32IBP-NEXT:    ret
-  %neg = xor i32 %a, -1
-  %and = and i32 %b, 31
-  %shl = shl i32 %neg, %and
-  %neg1 = xor i32 %shl, -1
-  ret i32 %neg1
-}
-
-; As we are not matching directly i64 code patterns on RV32 some i64 patterns
-; don't have yet any matching bit manipulation instructions on RV32.
-; This test is presented here in case future expansions of the experimental-b
-; extension introduce instructions suitable for this pattern.
-
-define i64 @slo_i64(i64 %a, i64 %b) nounwind {
-; RV32I-LABEL: slo_i64:
-; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi a3, a2, -32
-; RV32I-NEXT:    not a0, a0
-; RV32I-NEXT:    bltz a3, .LBB2_2
-; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    mv a2, zero
-; RV32I-NEXT:    sll a1, a0, a3
-; RV32I-NEXT:    j .LBB2_3
-; RV32I-NEXT:  .LBB2_2:
-; RV32I-NEXT:    not a1, a1
-; RV32I-NEXT:    sll a1, a1, a2
-; RV32I-NEXT:    addi a3, zero, 31
-; RV32I-NEXT:    sub a3, a3, a2
-; RV32I-NEXT:    srli a4, a0, 1
-; RV32I-NEXT:    srl a3, a4, a3
-; RV32I-NEXT:    or a1, a1, a3
-; RV32I-NEXT:    sll a2, a0, a2
-; RV32I-NEXT:  .LBB2_3:
-; RV32I-NEXT:    not a1, a1
-; RV32I-NEXT:    not a0, a2
-; RV32I-NEXT:    ret
-;
-; RV32IB-LABEL: slo_i64:
-; RV32IB:       # %bb.0:
-; RV32IB-NEXT:    not a0, a0
-; RV32IB-NEXT:    not a1, a1
-; RV32IB-NEXT:    sll a1, a1, a2
-; RV32IB-NEXT:    addi a3, zero, 31
-; RV32IB-NEXT:    sub a3, a3, a2
-; RV32IB-NEXT:    srli a4, a0, 1
-; RV32IB-NEXT:    srl a3, a4, a3
-; RV32IB-NEXT:    or a1, a1, a3
-; RV32IB-NEXT:    addi a3, a2, -32
-; RV32IB-NEXT:    sll a4, a0, a3
-; RV32IB-NEXT:    slti a5, a3, 0
-; RV32IB-NEXT:    cmov a1, a5, a1, a4
-; RV32IB-NEXT:    sll a0, a0, a2
-; RV32IB-NEXT:    srai a2, a3, 31
-; RV32IB-NEXT:    and a0, a2, a0
-; RV32IB-NEXT:    not a1, a1
-; RV32IB-NEXT:    not a0, a0
-; RV32IB-NEXT:    ret
-;
-; RV32IBP-LABEL: slo_i64:
-; RV32IBP:       # %bb.0:
-; RV32IBP-NEXT:    addi a3, a2, -32
-; RV32IBP-NEXT:    not a0, a0
-; RV32IBP-NEXT:    bltz a3, .LBB2_2
-; RV32IBP-NEXT:  # %bb.1:
-; RV32IBP-NEXT:    mv a2, zero
-; RV32IBP-NEXT:    sll a1, a0, a3
-; RV32IBP-NEXT:    j .LBB2_3
-; RV32IBP-NEXT:  .LBB2_2:
-; RV32IBP-NEXT:    not a1, a1
-; RV32IBP-NEXT:    sll a1, a1, a2
-; RV32IBP-NEXT:    addi a3, zero, 31
-; RV32IBP-NEXT:    sub a3, a3, a2
-; RV32IBP-NEXT:    srli a4, a0, 1
-; RV32IBP-NEXT:    srl a3, a4, a3
-; RV32IBP-NEXT:    or a1, a1, a3
-; RV32IBP-NEXT:    sll a2, a0, a2
-; RV32IBP-NEXT:  .LBB2_3:
-; RV32IBP-NEXT:    not a1, a1
-; RV32IBP-NEXT:    not a0, a2
-; RV32IBP-NEXT:    ret
-  %neg = xor i64 %a, -1
-  %shl = shl i64 %neg, %b
-  %neg1 = xor i64 %shl, -1
-  ret i64 %neg1
-}
-
-define i64 @slo_i64_mask(i64 %a, i64 %b) nounwind {
-; RV32I-LABEL: slo_i64_mask:
-; RV32I:       # %bb.0:
-; RV32I-NEXT:    andi a3, a2, 63
-; RV32I-NEXT:    addi a4, a3, -32
-; RV32I-NEXT:    not a0, a0
-; RV32I-NEXT:    bltz a4, .LBB3_2
-; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    mv a2, zero
-; RV32I-NEXT:    sll a1, a0, a4
-; RV32I-NEXT:    j .LBB3_3
-; RV32I-NEXT:  .LBB3_2:
-; RV32I-NEXT:    not a1, a1
-; RV32I-NEXT:    sll a1, a1, a2
-; RV32I-NEXT:    addi a4, zero, 31
-; RV32I-NEXT:    sub a3, a4, a3
-; RV32I-NEXT:    srli a4, a0, 1
-; RV32I-NEXT:    srl a3, a4, a3
-; RV32I-NEXT:    or a1, a1, a3
-; RV32I-NEXT:    sll a2, a0, a2
-; RV32I-NEXT:  .LBB3_3:
-; RV32I-NEXT:    not a1, a1
-; RV32I-NEXT:    not a0, a2
-; RV32I-NEXT:    ret
-;
-; RV32IB-LABEL: slo_i64_mask:
-; RV32IB:       # %bb.0:
-; RV32IB-NEXT:    not a0, a0
-; RV32IB-NEXT:    not a1, a1
-; RV32IB-NEXT:    sll a1, a1, a2
-; RV32IB-NEXT:    andi a3, a2, 63
-; RV32IB-NEXT:    addi a4, zero, 31
-; RV32IB-NEXT:    sub a4, a4, a3
-; RV32IB-NEXT:    srli a5, a0, 1
-; RV32IB-NEXT:    srl a4, a5, a4
-; RV32IB-NEXT:    or a1, a1, a4
-; RV32IB-NEXT:    addi a3, a3, -32
-; RV32IB-NEXT:    sll a4, a0, a3
-; RV32IB-NEXT:    slti a5, a3, 0
-; RV32IB-NEXT:    cmov a1, a5, a1, a4
-; RV32IB-NEXT:    sll a0, a0, a2
-; RV32IB-NEXT:    srai a2, a3, 31
-; RV32IB-NEXT:    and a0, a2, a0
-; RV32IB-NEXT:    not a1, a1
-; RV32IB-NEXT:    not a0, a0
-; RV32IB-NEXT:    ret
-;
-; RV32IBP-LABEL: slo_i64_mask:
-; RV32IBP:       # %bb.0:
-; RV32IBP-NEXT:    andi a3, a2, 63
-; RV32IBP-NEXT:    addi a4, a3, -32
-; RV32IBP-NEXT:    not a0, a0
-; RV32IBP-NEXT:    bltz a4, .LBB3_2
-; RV32IBP-NEXT:  # %bb.1:
-; RV32IBP-NEXT:    mv a2, zero
-; RV32IBP-NEXT:    sll a1, a0, a4
-; RV32IBP-NEXT:    j .LBB3_3
-; RV32IBP-NEXT:  .LBB3_2:
-; RV32IBP-NEXT:    not a1, a1
-; RV32IBP-NEXT:    sll a1, a1, a2
-; RV32IBP-NEXT:    addi a4, zero, 31
-; RV32IBP-NEXT:    sub a3, a4, a3
-; RV32IBP-NEXT:    srli a4, a0, 1
-; RV32IBP-NEXT:    srl a3, a4, a3
-; RV32IBP-NEXT:    or a1, a1, a3
-; RV32IBP-NEXT:    sll a2, a0, a2
-; RV32IBP-NEXT:  .LBB3_3:
-; RV32IBP-NEXT:    not a1, a1
-; RV32IBP-NEXT:    not a0, a2
-; RV32IBP-NEXT:    ret
-  %neg = xor i64 %a, -1
-  %and = and i64 %b, 63
-  %shl = shl i64 %neg, %and
-  %neg1 = xor i64 %shl, -1
-  ret i64 %neg1
-}
-
-define i32 @sro_i32(i32 %a, i32 %b) nounwind {
-; RV32I-LABEL: sro_i32:
-; RV32I:       # %bb.0:
-; RV32I-NEXT:    not a0, a0
-; RV32I-NEXT:    srl a0, a0, a1
-; RV32I-NEXT:    not a0, a0
-; RV32I-NEXT:    ret
-;
-; RV32IB-LABEL: sro_i32:
-; RV32IB:       # %bb.0:
-; RV32IB-NEXT:    sro a0, a0, a1
-; RV32IB-NEXT:    ret
-;
-; RV32IBP-LABEL: sro_i32:
-; RV32IBP:       # %bb.0:
-; RV32IBP-NEXT:    sro a0, a0, a1
-; RV32IBP-NEXT:    ret
-  %neg = xor i32 %a, -1
-  %shr = lshr i32 %neg, %b
-  %neg1 = xor i32 %shr, -1
-  ret i32 %neg1
-}
-
-define i32 @sro_i32_mask(i32 %a, i32 %b) nounwind {
-; RV32I-LABEL: sro_i32_mask:
-; RV32I:       # %bb.0:
-; RV32I-NEXT:    not a0, a0
-; RV32I-NEXT:    srl a0, a0, a1
-; RV32I-NEXT:    not a0, a0
-; RV32I-NEXT:    ret
-;
-; RV32IB-LABEL: sro_i32_mask:
-; RV32IB:       # %bb.0:
-; RV32IB-NEXT:    sro a0, a0, a1
-; RV32IB-NEXT:    ret
-;
-; RV32IBP-LABEL: sro_i32_mask:
-; RV32IBP:       # %bb.0:
-; RV32IBP-NEXT:    sro a0, a0, a1
-; RV32IBP-NEXT:    ret
-  %neg = xor i32 %a, -1
-  %and = and i32 %b, 31
-  %shr = lshr i32 %neg, %and
-  %neg1 = xor i32 %shr, -1
-  ret i32 %neg1
-}
-
-; As we are not matching directly i64 code patterns on RV32 some i64 patterns
-; don't have yet any matching bit manipulation instructions on RV32.
-; This test is presented here in case future expansions of the experimental-b
-; extension introduce instructions suitable for this pattern.
-
-define i64 @sro_i64(i64 %a, i64 %b) nounwind {
-; RV32I-LABEL: sro_i64:
-; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi a3, a2, -32
-; RV32I-NEXT:    not a1, a1
-; RV32I-NEXT:    bltz a3, .LBB6_2
-; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    mv a2, zero
-; RV32I-NEXT:    srl a0, a1, a3
-; RV32I-NEXT:    j .LBB6_3
-; RV32I-NEXT:  .LBB6_2:
-; RV32I-NEXT:    not a0, a0
-; RV32I-NEXT:    srl a0, a0, a2
-; RV32I-NEXT:    addi a3, zero, 31
-; RV32I-NEXT:    sub a3, a3, a2
-; RV32I-NEXT:    slli a4, a1, 1
-; RV32I-NEXT:    sll a3, a4, a3
-; RV32I-NEXT:    or a0, a0, a3
-; RV32I-NEXT:    srl a2, a1, a2
-; RV32I-NEXT:  .LBB6_3:
-; RV32I-NEXT:    not a0, a0
-; RV32I-NEXT:    not a1, a2
-; RV32I-NEXT:    ret
-;
-; RV32IB-LABEL: sro_i64:
-; RV32IB:       # %bb.0:
-; RV32IB-NEXT:    not a1, a1
-; RV32IB-NEXT:    not a0, a0
-; RV32IB-NEXT:    srl a0, a0, a2
-; RV32IB-NEXT:    addi a3, zero, 31
-; RV32IB-NEXT:    sub a3, a3, a2
-; RV32IB-NEXT:    slli a4, a1, 1
-; RV32IB-NEXT:    sll a3, a4, a3
-; RV32IB-NEXT:    or a0, a0, a3
-; RV32IB-NEXT:    addi a3, a2, -32
-; RV32IB-NEXT:    srl a4, a1, a3
-; RV32IB-NEXT:    slti a5, a3, 0
-; RV32IB-NEXT:    cmov a0, a5, a0, a4
-; RV32IB-NEXT:    srl a1, a1, a2
-; RV32IB-NEXT:    srai a2, a3, 31
-; RV32IB-NEXT:    and a1, a2, a1
-; RV32IB-NEXT:    not a0, a0
-; RV32IB-NEXT:    not a1, a1
-; RV32IB-NEXT:    ret
-;
-; RV32IBP-LABEL: sro_i64:
-; RV32IBP:       # %bb.0:
-; RV32IBP-NEXT:    addi a3, a2, -32
-; RV32IBP-NEXT:    not a1, a1
-; RV32IBP-NEXT:    bltz a3, .LBB6_2
-; RV32IBP-NEXT:  # %bb.1:
-; RV32IBP-NEXT:    mv a2, zero
-; RV32IBP-NEXT:    srl a0, a1, a3
-; RV32IBP-NEXT:    j .LBB6_3
-; RV32IBP-NEXT:  .LBB6_2:
-; RV32IBP-NEXT:    not a0, a0
-; RV32IBP-NEXT:    srl a0, a0, a2
-; RV32IBP-NEXT:    addi a3, zero, 31
-; RV32IBP-NEXT:    sub a3, a3, a2
-; RV32IBP-NEXT:    slli a4, a1, 1
-; RV32IBP-NEXT:    sll a3, a4, a3
-; RV32IBP-NEXT:    or a0, a0, a3
-; RV32IBP-NEXT:    srl a2, a1, a2
-; RV32IBP-NEXT:  .LBB6_3:
-; RV32IBP-NEXT:    not a0, a0
-; RV32IBP-NEXT:    not a1, a2
-; RV32IBP-NEXT:    ret
-  %neg = xor i64 %a, -1
-  %shr = lshr i64 %neg, %b
-  %neg1 = xor i64 %shr, -1
-  ret i64 %neg1
-}
-
-define i64 @sro_i64_mask(i64 %a, i64 %b) nounwind {
-; RV32I-LABEL: sro_i64_mask:
-; RV32I:       # %bb.0:
-; RV32I-NEXT:    andi a3, a2, 63
-; RV32I-NEXT:    addi a4, a3, -32
-; RV32I-NEXT:    not a1, a1
-; RV32I-NEXT:    bltz a4, .LBB7_2
-; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    mv a2, zero
-; RV32I-NEXT:    srl a0, a1, a4
-; RV32I-NEXT:    j .LBB7_3
-; RV32I-NEXT:  .LBB7_2:
-; RV32I-NEXT:    not a0, a0
-; RV32I-NEXT:    srl a0, a0, a2
-; RV32I-NEXT:    addi a4, zero, 31
-; RV32I-NEXT:    sub a3, a4, a3
-; RV32I-NEXT:    slli a4, a1, 1
-; RV32I-NEXT:    sll a3, a4, a3
-; RV32I-NEXT:    or a0, a0, a3
-; RV32I-NEXT:    srl a2, a1, a2
-; RV32I-NEXT:  .LBB7_3:
-; RV32I-NEXT:    not a0, a0
-; RV32I-NEXT:    not a1, a2
-; RV32I-NEXT:    ret
-;
-; RV32IB-LABEL: sro_i64_mask:
-; RV32IB:       # %bb.0:
-; RV32IB-NEXT:    not a1, a1
-; RV32IB-NEXT:    not a0, a0
-; RV32IB-NEXT:    srl a0, a0, a2
-; RV32IB-NEXT:    andi a3, a2, 63
-; RV32IB-NEXT:    addi a4, zero, 31
-; RV32IB-NEXT:    sub a4, a4, a3
-; RV32IB-NEXT:    slli a5, a1, 1
-; RV32IB-NEXT:    sll a4, a5, a4
-; RV32IB-NEXT:    or a0, a0, a4
-; RV32IB-NEXT:    addi a3, a3, -32
-; RV32IB-NEXT:    srl a4, a1, a3
-; RV32IB-NEXT:    slti a5, a3, 0
-; RV32IB-NEXT:    cmov a0, a5, a0, a4
-; RV32IB-NEXT:    srl a1, a1, a2
-; RV32IB-NEXT:    srai a2, a3, 31
-; RV32IB-NEXT:    and a1, a2, a1
-; RV32IB-NEXT:    not a0, a0
-; RV32IB-NEXT:    not a1, a1
-; RV32IB-NEXT:    ret
-;
-; RV32IBP-LABEL: sro_i64_mask:
-; RV32IBP:       # %bb.0:
-; RV32IBP-NEXT:    andi a3, a2, 63
-; RV32IBP-NEXT:    addi a4, a3, -32
-; RV32IBP-NEXT:    not a1, a1
-; RV32IBP-NEXT:    bltz a4, .LBB7_2
-; RV32IBP-NEXT:  # %bb.1:
-; RV32IBP-NEXT:    mv a2, zero
-; RV32IBP-NEXT:    srl a0, a1, a4
-; RV32IBP-NEXT:    j .LBB7_3
-; RV32IBP-NEXT:  .LBB7_2:
-; RV32IBP-NEXT:    not a0, a0
-; RV32IBP-NEXT:    srl a0, a0, a2
-; RV32IBP-NEXT:    addi a4, zero, 31
-; RV32IBP-NEXT:    sub a3, a4, a3
-; RV32IBP-NEXT:    slli a4, a1, 1
-; RV32IBP-NEXT:    sll a3, a4, a3
-; RV32IBP-NEXT:    or a0, a0, a3
-; RV32IBP-NEXT:    srl a2, a1, a2
-; RV32IBP-NEXT:  .LBB7_3:
-; RV32IBP-NEXT:    not a0, a0
-; RV32IBP-NEXT:    not a1, a2
-; RV32IBP-NEXT:    ret
-  %neg = xor i64 %a, -1
-  %and = and i64 %b, 63
-  %shr = lshr i64 %neg, %and
-  %neg1 = xor i64 %shr, -1
-  ret i64 %neg1
-}
-
-define i32 @sloi_i32(i32 %a) nounwind {
-; RV32I-LABEL: sloi_i32:
-; RV32I:       # %bb.0:
-; RV32I-NEXT:    slli a0, a0, 1
-; RV32I-NEXT:    ori a0, a0, 1
-; RV32I-NEXT:    ret
-;
-; RV32IB-LABEL: sloi_i32:
-; RV32IB:       # %bb.0:
-; RV32IB-NEXT:    sloi a0, a0, 1
-; RV32IB-NEXT:    ret
-;
-; RV32IBP-LABEL: sloi_i32:
-; RV32IBP:       # %bb.0:
-; RV32IBP-NEXT:    sloi a0, a0, 1
-; RV32IBP-NEXT:    ret
-  %neg = shl i32 %a, 1
-  %neg12 = or i32 %neg, 1
-  ret i32 %neg12
-}
-
-define i64 @sloi_i64(i64 %a) nounwind {
-; RV32I-LABEL: sloi_i64:
-; RV32I:       # %bb.0:
-; RV32I-NEXT:    srli a2, a0, 31
-; RV32I-NEXT:    slli a1, a1, 1
-; RV32I-NEXT:    or a1, a1, a2
-; RV32I-NEXT:    slli a0, a0, 1
-; RV32I-NEXT:    ori a0, a0, 1
-; RV32I-NEXT:    ret
-;
-; RV32IB-LABEL: sloi_i64:
-; RV32IB:       # %bb.0:
-; RV32IB-NEXT:    fsri a1, a0, a1, 31
-; RV32IB-NEXT:    sloi a0, a0, 1
-; RV32IB-NEXT:    ret
-;
-; RV32IBP-LABEL: sloi_i64:
-; RV32IBP:       # %bb.0:
-; RV32IBP-NEXT:    srli a2, a0, 31
-; RV32IBP-NEXT:    slli a1, a1, 1
-; RV32IBP-NEXT:    or a1, a1, a2
-; RV32IBP-NEXT:    sloi a0, a0, 1
-; RV32IBP-NEXT:    ret
-  %neg = shl i64 %a, 1
-  %neg12 = or i64 %neg, 1
-  ret i64 %neg12
-}
-
-define i32 @sroi_i32(i32 %a) nounwind {
-; RV32I-LABEL: sroi_i32:
-; RV32I:       # %bb.0:
-; RV32I-NEXT:    srli a0, a0, 1
-; RV32I-NEXT:    lui a1, 524288
-; RV32I-NEXT:    or a0, a0, a1
-; RV32I-NEXT:    ret
-;
-; RV32IB-LABEL: sroi_i32:
-; RV32IB:       # %bb.0:
-; RV32IB-NEXT:    sroi a0, a0, 1
-; RV32IB-NEXT:    ret
-;
-; RV32IBP-LABEL: sroi_i32:
-; RV32IBP:       # %bb.0:
-; RV32IBP-NEXT:    sroi a0, a0, 1
-; RV32IBP-NEXT:    ret
-  %neg = lshr i32 %a, 1
-  %neg12 = or i32 %neg, -2147483648
-  ret i32 %neg12
-}
-
-define i64 @sroi_i64(i64 %a) nounwind {
-; RV32I-LABEL: sroi_i64:
-; RV32I:       # %bb.0:
-; RV32I-NEXT:    slli a2, a1, 31
-; RV32I-NEXT:    srli a0, a0, 1
-; RV32I-NEXT:    or a0, a0, a2
-; RV32I-NEXT:    srli a1, a1, 1
-; RV32I-NEXT:    lui a2, 524288
-; RV32I-NEXT:    or a1, a1, a2
-; RV32I-NEXT:    ret
-;
-; RV32IB-LABEL: sroi_i64:
-; RV32IB:       # %bb.0:
-; RV32IB-NEXT:    fsri a0, a0, a1, 1
-; RV32IB-NEXT:    sroi a1, a1, 1
-; RV32IB-NEXT:    ret
-;
-; RV32IBP-LABEL: sroi_i64:
-; RV32IBP:       # %bb.0:
-; RV32IBP-NEXT:    slli a2, a1, 31
-; RV32IBP-NEXT:    srli a0, a0, 1
-; RV32IBP-NEXT:    or a0, a0, a2
-; RV32IBP-NEXT:    sroi a1, a1, 1
-; RV32IBP-NEXT:    ret
-  %neg = lshr i64 %a, 1
-  %neg12 = or i64 %neg, -9223372036854775808
-  ret i64 %neg12
-}
-
 define i32 @gorc1_i32(i32 %a) nounwind {
 ; RV32I-LABEL: gorc1_i32:
 ; RV32I:       # %bb.0:
diff --git a/llvm/test/CodeGen/RISCV/rv64Zbp.ll b/llvm/test/CodeGen/RISCV/rv64Zbp.ll
index d2191afd5b79..685c1a0225ac 100644
--- a/llvm/test/CodeGen/RISCV/rv64Zbp.ll
+++ b/llvm/test/CodeGen/RISCV/rv64Zbp.ll
@@ -6,312 +6,6 @@
 ; RUN: llc -mtriple=riscv64 -mattr=+experimental-zbp -verify-machineinstrs < %s \
 ; RUN:   | FileCheck %s -check-prefix=RV64IBP
 
-define signext i32 @slo_i32(i32 signext %a, i32 signext %b) nounwind {
-; RV64I-LABEL: slo_i32:
-; RV64I:       # %bb.0:
-; RV64I-NEXT:    not a0, a0
-; RV64I-NEXT:    sllw a0, a0, a1
-; RV64I-NEXT:    not a0, a0
-; RV64I-NEXT:    ret
-;
-; RV64IB-LABEL: slo_i32:
-; RV64IB:       # %bb.0:
-; RV64IB-NEXT:    slow a0, a0, a1
-; RV64IB-NEXT:    ret
-;
-; RV64IBP-LABEL: slo_i32:
-; RV64IBP:       # %bb.0:
-; RV64IBP-NEXT:    slow a0, a0, a1
-; RV64IBP-NEXT:    ret
-  %neg = xor i32 %a, -1
-  %shl = shl i32 %neg, %b
-  %neg1 = xor i32 %shl, -1
-  ret i32 %neg1
-}
-
-define signext i32 @slo_i32_mask(i32 signext %a, i32 signext %b) nounwind {
-; RV64I-LABEL: slo_i32_mask:
-; RV64I:       # %bb.0:
-; RV64I-NEXT:    not a0, a0
-; RV64I-NEXT:    sllw a0, a0, a1
-; RV64I-NEXT:    not a0, a0
-; RV64I-NEXT:    ret
-;
-; RV64IB-LABEL: slo_i32_mask:
-; RV64IB:       # %bb.0:
-; RV64IB-NEXT:    slow a0, a0, a1
-; RV64IB-NEXT:    ret
-;
-; RV64IBP-LABEL: slo_i32_mask:
-; RV64IBP:       # %bb.0:
-; RV64IBP-NEXT:    slow a0, a0, a1
-; RV64IBP-NEXT:    ret
-  %neg = xor i32 %a, -1
-  %and = and i32 %b, 31
-  %shl = shl i32 %neg, %and
-  %neg1 = xor i32 %shl, -1
-  ret i32 %neg1
-}
-
-define i64 @slo_i64(i64 %a, i64 %b) nounwind {
-; RV64I-LABEL: slo_i64:
-; RV64I:       # %bb.0:
-; RV64I-NEXT:    not a0, a0
-; RV64I-NEXT:    sll a0, a0, a1
-; RV64I-NEXT:    not a0, a0
-; RV64I-NEXT:    ret
-;
-; RV64IB-LABEL: slo_i64:
-; RV64IB:       # %bb.0:
-; RV64IB-NEXT:    slo a0, a0, a1
-; RV64IB-NEXT:    ret
-;
-; RV64IBP-LABEL: slo_i64:
-; RV64IBP:       # %bb.0:
-; RV64IBP-NEXT:    slo a0, a0, a1
-; RV64IBP-NEXT:    ret
-  %neg = xor i64 %a, -1
-  %shl = shl i64 %neg, %b
-  %neg1 = xor i64 %shl, -1
-  ret i64 %neg1
-}
-
-define i64 @slo_i64_mask(i64 %a, i64 %b) nounwind {
-; RV64I-LABEL: slo_i64_mask:
-; RV64I:       # %bb.0:
-; RV64I-NEXT:    not a0, a0
-; RV64I-NEXT:    sll a0, a0, a1
-; RV64I-NEXT:    not a0, a0
-; RV64I-NEXT:    ret
-;
-; RV64IB-LABEL: slo_i64_mask:
-; RV64IB:       # %bb.0:
-; RV64IB-NEXT:    slo a0, a0, a1
-; RV64IB-NEXT:    ret
-;
-; RV64IBP-LABEL: slo_i64_mask:
-; RV64IBP:       # %bb.0:
-; RV64IBP-NEXT:    slo a0, a0, a1
-; RV64IBP-NEXT:    ret
-  %neg = xor i64 %a, -1
-  %and = and i64 %b, 63
-  %shl = shl i64 %neg, %and
-  %neg1 = xor i64 %shl, -1
-  ret i64 %neg1
-}
-
-define signext i32 @sro_i32(i32 signext %a, i32 signext %b) nounwind {
-; RV64I-LABEL: sro_i32:
-; RV64I:       # %bb.0:
-; RV64I-NEXT:    not a0, a0
-; RV64I-NEXT:    srlw a0, a0, a1
-; RV64I-NEXT:    not a0, a0
-; RV64I-NEXT:    ret
-;
-; RV64IB-LABEL: sro_i32:
-; RV64IB:       # %bb.0:
-; RV64IB-NEXT:    srow a0, a0, a1
-; RV64IB-NEXT:    ret
-;
-; RV64IBP-LABEL: sro_i32:
-; RV64IBP:       # %bb.0:
-; RV64IBP-NEXT:    srow a0, a0, a1
-; RV64IBP-NEXT:    ret
-  %neg = xor i32 %a, -1
-  %shr = lshr i32 %neg, %b
-  %neg1 = xor i32 %shr, -1
-  ret i32 %neg1
-}
-
-define signext i32 @sro_i32_mask(i32 signext %a, i32 signext %b) nounwind {
-; RV64I-LABEL: sro_i32_mask:
-; RV64I:       # %bb.0:
-; RV64I-NEXT:    not a0, a0
-; RV64I-NEXT:    srlw a0, a0, a1
-; RV64I-NEXT:    not a0, a0
-; RV64I-NEXT:    ret
-;
-; RV64IB-LABEL: sro_i32_mask:
-; RV64IB:       # %bb.0:
-; RV64IB-NEXT:    srow a0, a0, a1
-; RV64IB-NEXT:    ret
-;
-; RV64IBP-LABEL: sro_i32_mask:
-; RV64IBP:       # %bb.0:
-; RV64IBP-NEXT:    srow a0, a0, a1
-; RV64IBP-NEXT:    ret
-  %neg = xor i32 %a, -1
-  %and = and i32 %b, 31
-  %shr = lshr i32 %neg, %and
-  %neg1 = xor i32 %shr, -1
-  ret i32 %neg1
-}
-
-define i64 @sro_i64(i64 %a, i64 %b) nounwind {
-; RV64I-LABEL: sro_i64:
-; RV64I:       # %bb.0:
-; RV64I-NEXT:    not a0, a0
-; RV64I-NEXT:    srl a0, a0, a1
-; RV64I-NEXT:    not a0, a0
-; RV64I-NEXT:    ret
-;
-; RV64IB-LABEL: sro_i64:
-; RV64IB:       # %bb.0:
-; RV64IB-NEXT:    sro a0, a0, a1
-; RV64IB-NEXT:    ret
-;
-; RV64IBP-LABEL: sro_i64:
-; RV64IBP:       # %bb.0:
-; RV64IBP-NEXT:    sro a0, a0, a1
-; RV64IBP-NEXT:    ret
-  %neg = xor i64 %a, -1
-  %shr = lshr i64 %neg, %b
-  %neg1 = xor i64 %shr, -1
-  ret i64 %neg1
-}
-
-define i64 @sro_i64_mask(i64 %a, i64 %b) nounwind {
-; RV64I-LABEL: sro_i64_mask:
-; RV64I:       # %bb.0:
-; RV64I-NEXT:    not a0, a0
-; RV64I-NEXT:    srl a0, a0, a1
-; RV64I-NEXT:    not a0, a0
-; RV64I-NEXT:    ret
-;
-; RV64IB-LABEL: sro_i64_mask:
-; RV64IB:       # %bb.0:
-; RV64IB-NEXT:    sro a0, a0, a1
-; RV64IB-NEXT:    ret
-;
-; RV64IBP-LABEL: sro_i64_mask:
-; RV64IBP:       # %bb.0:
-; RV64IBP-NEXT:    sro a0, a0, a1
-; RV64IBP-NEXT:    ret
-  %neg = xor i64 %a, -1
-  %and = and i64 %b, 63
-  %shr = lshr i64 %neg, %and
-  %neg1 = xor i64 %shr, -1
-  ret i64 %neg1
-}
-
-define signext i32 @sloi_i32(i32 signext %a) nounwind {
-; RV64I-LABEL: sloi_i32:
-; RV64I:       # %bb.0:
-; RV64I-NEXT:    slli a0, a0, 1
-; RV64I-NEXT:    ori a0, a0, 1
-; RV64I-NEXT:    sext.w a0, a0
-; RV64I-NEXT:    ret
-;
-; RV64IB-LABEL: sloi_i32:
-; RV64IB:       # %bb.0:
-; RV64IB-NEXT:    sloiw a0, a0, 1
-; RV64IB-NEXT:    ret
-;
-; RV64IBP-LABEL: sloi_i32:
-; RV64IBP:       # %bb.0:
-; RV64IBP-NEXT:    sloiw a0, a0, 1
-; RV64IBP-NEXT:    ret
-  %neg = shl i32 %a, 1
-  %neg12 = or i32 %neg, 1
-  ret i32 %neg12
-}
-
-define i64 @sloi_i64(i64 %a) nounwind {
-; RV64I-LABEL: sloi_i64:
-; RV64I:       # %bb.0:
-; RV64I-NEXT:    slli a0, a0, 1
-; RV64I-NEXT:    ori a0, a0, 1
-; RV64I-NEXT:    ret
-;
-; RV64IB-LABEL: sloi_i64:
-; RV64IB:       # %bb.0:
-; RV64IB-NEXT:    sloi a0, a0, 1
-; RV64IB-NEXT:    ret
-;
-; RV64IBP-LABEL: sloi_i64:
-; RV64IBP:       # %bb.0:
-; RV64IBP-NEXT:    sloi a0, a0, 1
-; RV64IBP-NEXT:    ret
-  %neg = shl i64 %a, 1
-  %neg12 = or i64 %neg, 1
-  ret i64 %neg12
-}
-
-define signext i32 @sroi_i32(i32 signext %a) nounwind {
-; RV64I-LABEL: sroi_i32:
-; RV64I:       # %bb.0:
-; RV64I-NEXT:    srli a0, a0, 1
-; RV64I-NEXT:    lui a1, 524288
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    ret
-;
-; RV64IB-LABEL: sroi_i32:
-; RV64IB:       # %bb.0:
-; RV64IB-NEXT:    sroiw a0, a0, 1
-; RV64IB-NEXT:    ret
-;
-; RV64IBP-LABEL: sroi_i32:
-; RV64IBP:       # %bb.0:
-; RV64IBP-NEXT:    sroiw a0, a0, 1
-; RV64IBP-NEXT:    ret
-  %neg = lshr i32 %a, 1
-  %neg12 = or i32 %neg, -2147483648
-  ret i32 %neg12
-}
-
-; This is similar to the type legalized version of sroiw but the mask is 0 in
-; the upper bits instead of 1 so the result is not sign extended. Make sure we
-; don't match it to sroiw.
-define i64 @sroiw_bug(i64 %a) nounwind {
-; RV64I-LABEL: sroiw_bug:
-; RV64I:       # %bb.0:
-; RV64I-NEXT:    srli a0, a0, 1
-; RV64I-NEXT:    addi a1, zero, 1
-; RV64I-NEXT:    slli a1, a1, 31
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    ret
-;
-; RV64IB-LABEL: sroiw_bug:
-; RV64IB:       # %bb.0:
-; RV64IB-NEXT:    srli a0, a0, 1
-; RV64IB-NEXT:    bseti a0, a0, 31
-; RV64IB-NEXT:    ret
-;
-; RV64IBP-LABEL: sroiw_bug:
-; RV64IBP:       # %bb.0:
-; RV64IBP-NEXT:    srli a0, a0, 1
-; RV64IBP-NEXT:    addi a1, zero, 1
-; RV64IBP-NEXT:    slli a1, a1, 31
-; RV64IBP-NEXT:    or a0, a0, a1
-; RV64IBP-NEXT:    ret
-  %neg = lshr i64 %a, 1
-  %neg12 = or i64 %neg, 2147483648
-  ret i64 %neg12
-}
-
-define i64 @sroi_i64(i64 %a) nounwind {
-; RV64I-LABEL: sroi_i64:
-; RV64I:       # %bb.0:
-; RV64I-NEXT:    srli a0, a0, 1
-; RV64I-NEXT:    addi a1, zero, -1
-; RV64I-NEXT:    slli a1, a1, 63
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    ret
-;
-; RV64IB-LABEL: sroi_i64:
-; RV64IB:       # %bb.0:
-; RV64IB-NEXT:    sroi a0, a0, 1
-; RV64IB-NEXT:    ret
-;
-; RV64IBP-LABEL: sroi_i64:
-; RV64IBP:       # %bb.0:
-; RV64IBP-NEXT:    sroi a0, a0, 1
-; RV64IBP-NEXT:    ret
-  %neg = lshr i64 %a, 1
-  %neg12 = or i64 %neg, -9223372036854775808
-  ret i64 %neg12
-}
-
 define signext i32 @gorc1_i32(i32 signext %a) nounwind {
 ; RV64I-LABEL: gorc1_i32:
 ; RV64I:       # %bb.0:
diff --git a/llvm/test/MC/RISCV/rv32zbp-invalid.s b/llvm/test/MC/RISCV/rv32zbp-invalid.s
index 6de719a250dd..11e7e8338377 100644
--- a/llvm/test/MC/RISCV/rv32zbp-invalid.s
+++ b/llvm/test/MC/RISCV/rv32zbp-invalid.s
@@ -1,19 +1,5 @@
 # RUN: not llvm-mc -triple riscv32 -mattr=+experimental-b,experimental-zbp < %s 2>&1 | FileCheck %s
 
-# Too few operands
-slo t0, t1 # CHECK: :[[@LINE]]:1: error: too few operands for instruction
-# Too few operands
-sro t0, t1 # CHECK: :[[@LINE]]:1: error: too few operands for instruction
-# Too few operands
-sloi t0, t1 # CHECK: :[[@LINE]]:1: error: too few operands for instruction
-# Immediate operand out of range
-sloi t0, t1, 32 # CHECK: :[[@LINE]]:14: error: immediate must be an integer in the range [0, 31]
-sloi t0, t1, -1 # CHECK: :[[@LINE]]:14: error: immediate must be an integer in the range [0, 31]
-# Too few operands
-sroi t0, t1 # CHECK: :[[@LINE]]:1: error: too few operands for instruction
-# Immediate operand out of range
-sroi t0, t1, 32 # CHECK: :[[@LINE]]:14: error: immediate must be an integer in the range [0, 31]
-sroi t0, t1, -1 # CHECK: :[[@LINE]]:14: error: immediate must be an integer in the range [0, 31]
 # Too few operands
 gorc t0, t1 # CHECK: :[[@LINE]]:1: error: too few operands for instruction
 # Too few operands
@@ -54,10 +40,6 @@ xperm.n t0, t1 # CHECK: :[[@LINE]]:1: error: too few operands for instruction
 xperm.b t0, t1 # CHECK: :[[@LINE]]:1: error: too few operands for instruction
 # Too few operands
 xperm.h t0, t1 # CHECK: :[[@LINE]]:1: error: too few operands for instruction
-slow t0, t1, t2 # CHECK: :[[@LINE]]:1: error: instruction requires the following: RV64I Base Instruction Set
-srow t0, t1, t2 # CHECK: :[[@LINE]]:1: error: instruction requires the following: RV64I Base Instruction Set
-sloiw t0, t1, 0 # CHECK: :[[@LINE]]:1: error: instruction requires the following: RV64I Base Instruction Set
-sroiw t0, t1, 0 # CHECK: :[[@LINE]]:1: error: instruction requires the following: RV64I Base Instruction Set
 gorcw t0, t1, t2 # CHECK: :[[@LINE]]:1: error: instruction requires the following: RV64I Base Instruction Set
 grevw t0, t1, t2 # CHECK: :[[@LINE]]:1: error: instruction requires the following: RV64I Base Instruction Set
 gorciw t0, t1, 0 # CHECK: :[[@LINE]]:1: error: instruction requires the following: RV64I Base Instruction Set
diff --git a/llvm/test/MC/RISCV/rv32zbp-valid.s b/llvm/test/MC/RISCV/rv32zbp-valid.s
index 2e531343912b..0d7fc033c6a8 100644
--- a/llvm/test/MC/RISCV/rv32zbp-valid.s
+++ b/llvm/test/MC/RISCV/rv32zbp-valid.s
@@ -22,18 +22,6 @@
 # RUN:     | llvm-objdump --mattr=+experimental-zbp -d -r - \
 # RUN:     | FileCheck --check-prefixes=CHECK-OBJ,CHECK-ASM-AND-OBJ %s
 
-# CHECK-ASM-AND-OBJ: slo t0, t1, t2
-# CHECK-ASM: encoding: [0xb3,0x12,0x73,0x20]
-slo t0, t1, t2
-# CHECK-ASM-AND-OBJ: sro t0, t1, t2
-# CHECK-ASM: encoding: [0xb3,0x52,0x73,0x20]
-sro t0, t1, t2
-# CHECK-ASM-AND-OBJ: sloi t0, t1, 0
-# CHECK-ASM: encoding: [0x93,0x12,0x03,0x20]
-sloi t0, t1, 0
-# CHECK-ASM-AND-OBJ: sroi t0, t1, 0
-# CHECK-ASM: encoding: [0x93,0x52,0x03,0x20]
-sroi t0, t1, 0
 # CHECK-ASM-AND-OBJ: gorc t0, t1, t2
 # CHECK-ASM: encoding: [0xb3,0x52,0x73,0x28]
 gorc t0, t1, t2
diff --git a/llvm/test/MC/RISCV/rv64zbp-invalid.s b/llvm/test/MC/RISCV/rv64zbp-invalid.s
index 88adf2d47779..d5b37b2f8dab 100644
--- a/llvm/test/MC/RISCV/rv64zbp-invalid.s
+++ b/llvm/test/MC/RISCV/rv64zbp-invalid.s
@@ -1,19 +1,5 @@
 # RUN: not llvm-mc -triple riscv64 -mattr=+experimental-b,experimental-zbp < %s 2>&1 | FileCheck %s
 
-# Too few operands
-slow t0, t1 # CHECK: :[[@LINE]]:1: error: too few operands for instruction
-# Too few operands
-srow t0, t1 # CHECK: :[[@LINE]]:1: error: too few operands for instruction
-# Too few operands
-sloiw t0, t1 # CHECK: :[[@LINE]]:1: error: too few operands for instruction
-# Immediate operand out of range
-sloiw t0, t1, 32 # CHECK: :[[@LINE]]:15: error: immediate must be an integer in the range [0, 31]
-sloiw t0, t1, -1 # CHECK: :[[@LINE]]:15: error: immediate must be an integer in the range [0, 31]
-# Too few operands
-sroiw t0, t1 # CHECK: :[[@LINE]]:1: error: too few operands for instruction
-# Immediate operand out of range
-sroiw t0, t1, 32 # CHECK: :[[@LINE]]:15: error: immediate must be an integer in the range [0, 31]
-sroiw t0, t1, -1 # CHECK: :[[@LINE]]:15: error: immediate must be an integer in the range [0, 31]
 # Too few operands
 gorcw t0, t1 # CHECK: :[[@LINE]]:1: error: too few operands for instruction
 # Too few operands
diff --git a/llvm/test/MC/RISCV/rv64zbp-valid.s b/llvm/test/MC/RISCV/rv64zbp-valid.s
index 300464da325b..6647927c7150 100644
--- a/llvm/test/MC/RISCV/rv64zbp-valid.s
+++ b/llvm/test/MC/RISCV/rv64zbp-valid.s
@@ -12,18 +12,6 @@
 # RUN:     | llvm-objdump --mattr=+experimental-zbp -d -r - \
 # RUN:     | FileCheck --check-prefixes=CHECK-OBJ,CHECK-ASM-AND-OBJ %s
 
-# CHECK-ASM-AND-OBJ: slow t0, t1, t2
-# CHECK-ASM: encoding: [0xbb,0x12,0x73,0x20]
-slow t0, t1, t2
-# CHECK-ASM-AND-OBJ: srow t0, t1, t2
-# CHECK-ASM: encoding: [0xbb,0x52,0x73,0x20]
-srow t0, t1, t2
-# CHECK-ASM-AND-OBJ: sloiw t0, t1, 0
-# CHECK-ASM: encoding: [0x9b,0x12,0x03,0x20]
-sloiw t0, t1, 0
-# CHECK-ASM-AND-OBJ: sroiw t0, t1, 0
-# CHECK-ASM: encoding: [0x9b,0x52,0x03,0x20]
-sroiw t0, t1, 0
 # CHECK-ASM-AND-OBJ: gorcw t0, t1, t2
 # CHECK-ASM: encoding: [0xbb,0x52,0x73,0x28]
 gorcw t0, t1, t2

From 6aff13f9b05df8ab2ecebf6479204b17d8d8eed6 Mon Sep 17 00:00:00 2001
From: "Wang, Pengfei" <pengfei.wang@intel.com>
Date: Tue, 9 Feb 2021 21:12:59 +0800
Subject: [PATCH 106/318] [X86] Always assign reassoc flag for intrinsics
 *reduce_add/mul_ps/pd.

Intrinsics *reduce_add/mul_ps/pd have assumption that the elements in
the vector are reassociable. So we need to always assign the reassoc
flag when we call _mm_reduce_* intrinsics.

Reviewed By: spatel

Differential Revision: https://reviews.llvm.org/D96231

(cherry picked from commit dd2460ed5d77d908327ce29a15630cd3268bd76e)
---
 clang/lib/CodeGen/CGBuiltin.cpp              |  2 +
 clang/lib/Headers/avx512fintrin.h            | 16 +++--
 clang/test/CodeGen/X86/avx512-reduceIntrin.c | 68 +++++++++++---------
 3 files changed, 49 insertions(+), 37 deletions(-)

diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 113541bd5024..10e3820d9657 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -13794,12 +13794,14 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
   case X86::BI__builtin_ia32_reduce_fadd_ps512: {
     Function *F =
         CGM.getIntrinsic(Intrinsic::vector_reduce_fadd, Ops[1]->getType());
+    Builder.getFastMathFlags().setAllowReassoc(true);
     return Builder.CreateCall(F, {Ops[0], Ops[1]});
   }
   case X86::BI__builtin_ia32_reduce_fmul_pd512:
   case X86::BI__builtin_ia32_reduce_fmul_ps512: {
     Function *F =
         CGM.getIntrinsic(Intrinsic::vector_reduce_fmul, Ops[1]->getType());
+    Builder.getFastMathFlags().setAllowReassoc(true);
     return Builder.CreateCall(F, {Ops[0], Ops[1]});
   }
   case X86::BI__builtin_ia32_reduce_mul_d512:
diff --git a/clang/lib/Headers/avx512fintrin.h b/clang/lib/Headers/avx512fintrin.h
index 2ee4350b14d4..f226382cbb2c 100644
--- a/clang/lib/Headers/avx512fintrin.h
+++ b/clang/lib/Headers/avx512fintrin.h
@@ -9297,9 +9297,12 @@ _mm512_mask_abs_pd(__m512d __W, __mmask8 __K, __m512d __A)
 
 /* Vector-reduction arithmetic accepts vectors as inputs and produces scalars as
  * outputs. This class of vector operation forms the basis of many scientific
- * computations. In vector-reduction arithmetic, the evaluation off is
+ * computations. In vector-reduction arithmetic, the evaluation order is
  * independent of the order of the input elements of V.
 
+ * For floating point types, we always assume the elements are reassociable even
+ * if -fast-math is off.
+
  * Used bisection method. At each step, we partition the vector with previous
  * step in half, and the operation is performed on its two halves.
  * This takes log2(n) steps where n is the number of elements in the vector.
@@ -9345,8 +9348,11 @@ _mm512_mask_reduce_or_epi64(__mmask8 __M, __m512i __W) {
   return __builtin_ia32_reduce_or_q512(__W);
 }
 
+// -0.0 is used to ignore the start value since it is the neutral value of
+// floating point addition. For more information, please refer to
+// https://llvm.org/docs/LangRef.html#llvm-vector-reduce-fadd-intrinsic
 static __inline__ double __DEFAULT_FN_ATTRS512 _mm512_reduce_add_pd(__m512d __W) {
-  return __builtin_ia32_reduce_fadd_pd512(0.0, __W);
+  return __builtin_ia32_reduce_fadd_pd512(-0.0, __W);
 }
 
 static __inline__ double __DEFAULT_FN_ATTRS512 _mm512_reduce_mul_pd(__m512d __W) {
@@ -9356,7 +9362,7 @@ static __inline__ double __DEFAULT_FN_ATTRS512 _mm512_reduce_mul_pd(__m512d __W)
 static __inline__ double __DEFAULT_FN_ATTRS512
 _mm512_mask_reduce_add_pd(__mmask8 __M, __m512d __W) {
   __W = _mm512_maskz_mov_pd(__M, __W);
-  return __builtin_ia32_reduce_fadd_pd512(0.0, __W);
+  return __builtin_ia32_reduce_fadd_pd512(-0.0, __W);
 }
 
 static __inline__ double __DEFAULT_FN_ATTRS512
@@ -9411,7 +9417,7 @@ _mm512_mask_reduce_or_epi32(__mmask16 __M, __m512i __W) {
 
 static __inline__ float __DEFAULT_FN_ATTRS512
 _mm512_reduce_add_ps(__m512 __W) {
-  return __builtin_ia32_reduce_fadd_ps512(0.0f, __W);
+  return __builtin_ia32_reduce_fadd_ps512(-0.0f, __W);
 }
 
 static __inline__ float __DEFAULT_FN_ATTRS512
@@ -9422,7 +9428,7 @@ _mm512_reduce_mul_ps(__m512 __W) {
 static __inline__ float __DEFAULT_FN_ATTRS512
 _mm512_mask_reduce_add_ps(__mmask16 __M, __m512 __W) {
   __W = _mm512_maskz_mov_ps(__M, __W);
-  return __builtin_ia32_reduce_fadd_ps512(0.0f, __W);
+  return __builtin_ia32_reduce_fadd_ps512(-0.0f, __W);
 }
 
 static __inline__ float __DEFAULT_FN_ATTRS512
diff --git a/clang/test/CodeGen/X86/avx512-reduceIntrin.c b/clang/test/CodeGen/X86/avx512-reduceIntrin.c
index d8a1130f3cef..62580ca1914e 100644
--- a/clang/test/CodeGen/X86/avx512-reduceIntrin.c
+++ b/clang/test/CodeGen/X86/avx512-reduceIntrin.c
@@ -11,13 +11,13 @@ long long test_mm512_reduce_add_epi64(__m512i __W){
 long long test_mm512_reduce_mul_epi64(__m512i __W){
 // CHECK-LABEL: @test_mm512_reduce_mul_epi64(
 // CHECK:    call i64 @llvm.vector.reduce.mul.v8i64(<8 x i64> %{{.*}})
-  return _mm512_reduce_mul_epi64(__W); 
+  return _mm512_reduce_mul_epi64(__W);
 }
 
 long long test_mm512_reduce_or_epi64(__m512i __W){
 // CHECK-LABEL: @test_mm512_reduce_or_epi64(
 // CHECK:    call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> %{{.*}})
-  return _mm512_reduce_or_epi64(__W); 
+  return _mm512_reduce_or_epi64(__W);
 }
 
 long long test_mm512_reduce_and_epi64(__m512i __W){
@@ -31,7 +31,7 @@ long long test_mm512_mask_reduce_add_epi64(__mmask8 __M, __m512i __W){
 // CHECK:    bitcast i8 %{{.*}} to <8 x i1>
 // CHECK:    select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
 // CHECK:    call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %{{.*}})
-  return _mm512_mask_reduce_add_epi64(__M, __W); 
+  return _mm512_mask_reduce_add_epi64(__M, __W);
 }
 
 long long test_mm512_mask_reduce_mul_epi64(__mmask8 __M, __m512i __W){
@@ -39,7 +39,7 @@ long long test_mm512_mask_reduce_mul_epi64(__mmask8 __M, __m512i __W){
 // CHECK:    bitcast i8 %{{.*}} to <8 x i1>
 // CHECK:    select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
 // CHECK:    call i64 @llvm.vector.reduce.mul.v8i64(<8 x i64> %{{.*}})
-  return _mm512_mask_reduce_mul_epi64(__M, __W); 
+  return _mm512_mask_reduce_mul_epi64(__M, __W);
 }
 
 long long test_mm512_mask_reduce_and_epi64(__mmask8 __M, __m512i __W){
@@ -47,7 +47,7 @@ long long test_mm512_mask_reduce_and_epi64(__mmask8 __M, __m512i __W){
 // CHECK:    bitcast i8 %{{.*}} to <8 x i1>
 // CHECK:    select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
 // CHECK:    call i64 @llvm.vector.reduce.and.v8i64(<8 x i64> %{{.*}})
-  return _mm512_mask_reduce_and_epi64(__M, __W); 
+  return _mm512_mask_reduce_and_epi64(__M, __W);
 }
 
 long long test_mm512_mask_reduce_or_epi64(__mmask8 __M, __m512i __W){
@@ -55,30 +55,30 @@ long long test_mm512_mask_reduce_or_epi64(__mmask8 __M, __m512i __W){
 // CHECK:    bitcast i8 %{{.*}} to <8 x i1>
 // CHECK:    select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
 // CHECK:    call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> %{{.*}})
-  return _mm512_mask_reduce_or_epi64(__M, __W); 
+  return _mm512_mask_reduce_or_epi64(__M, __W);
 }
 
 int test_mm512_reduce_add_epi32(__m512i __W){
 // CHECK-LABEL: @test_mm512_reduce_add_epi32(
 // CHECK:    call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %{{.*}})
-  return _mm512_reduce_add_epi32(__W); 
+  return _mm512_reduce_add_epi32(__W);
 }
 
 int test_mm512_reduce_mul_epi32(__m512i __W){
 // CHECK-LABEL: @test_mm512_reduce_mul_epi32(
 // CHECK:    call i32 @llvm.vector.reduce.mul.v16i32(<16 x i32> %{{.*}})
-  return _mm512_reduce_mul_epi32(__W); 
+  return _mm512_reduce_mul_epi32(__W);
 }
 
 int test_mm512_reduce_or_epi32(__m512i __W){
 // CHECK:    call i32 @llvm.vector.reduce.or.v16i32(<16 x i32> %{{.*}})
-  return _mm512_reduce_or_epi32(__W); 
+  return _mm512_reduce_or_epi32(__W);
 }
 
 int test_mm512_reduce_and_epi32(__m512i __W){
 // CHECK-LABEL: @test_mm512_reduce_and_epi32(
 // CHECK:    call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> %{{.*}})
-  return _mm512_reduce_and_epi32(__W); 
+  return _mm512_reduce_and_epi32(__W);
 }
 
 int test_mm512_mask_reduce_add_epi32(__mmask16 __M, __m512i __W){
@@ -86,7 +86,7 @@ int test_mm512_mask_reduce_add_epi32(__mmask16 __M, __m512i __W){
 // CHECK:    bitcast i16 %{{.*}} to <16 x i1>
 // CHECK:    select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
 // CHECK:    call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %{{.*}})
-  return _mm512_mask_reduce_add_epi32(__M, __W); 
+  return _mm512_mask_reduce_add_epi32(__M, __W);
 }
 
 int test_mm512_mask_reduce_mul_epi32(__mmask16 __M, __m512i __W){
@@ -94,7 +94,7 @@ int test_mm512_mask_reduce_mul_epi32(__mmask16 __M, __m512i __W){
 // CHECK:    bitcast i16 %{{.*}} to <16 x i1>
 // CHECK:    select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
 // CHECK:    call i32 @llvm.vector.reduce.mul.v16i32(<16 x i32> %{{.*}})
-  return _mm512_mask_reduce_mul_epi32(__M, __W); 
+  return _mm512_mask_reduce_mul_epi32(__M, __W);
 }
 
 int test_mm512_mask_reduce_and_epi32(__mmask16 __M, __m512i __W){
@@ -102,7 +102,7 @@ int test_mm512_mask_reduce_and_epi32(__mmask16 __M, __m512i __W){
 // CHECK:    bitcast i16 %{{.*}} to <16 x i1>
 // CHECK:    select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
 // CHECK:    call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> %{{.*}})
-  return _mm512_mask_reduce_and_epi32(__M, __W); 
+  return _mm512_mask_reduce_and_epi32(__M, __W);
 }
 
 int test_mm512_mask_reduce_or_epi32(__mmask16 __M, __m512i __W){
@@ -110,61 +110,65 @@ int test_mm512_mask_reduce_or_epi32(__mmask16 __M, __m512i __W){
 // CHECK:    bitcast i16 %{{.*}} to <16 x i1>
 // CHECK:    select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
 // CHECK:    call i32 @llvm.vector.reduce.or.v16i32(<16 x i32> %{{.*}})
-  return _mm512_mask_reduce_or_epi32(__M, __W); 
+  return _mm512_mask_reduce_or_epi32(__M, __W);
 }
 
-double test_mm512_reduce_add_pd(__m512d __W){
+double test_mm512_reduce_add_pd(__m512d __W, double ExtraAddOp){
 // CHECK-LABEL: @test_mm512_reduce_add_pd(
-// CHECK:    call double @llvm.vector.reduce.fadd.v8f64(double 0.000000e+00, <8 x double> %{{.*}})
-  return _mm512_reduce_add_pd(__W); 
+// CHECK-NOT: reassoc
+// CHECK:    call reassoc double @llvm.vector.reduce.fadd.v8f64(double -0.000000e+00, <8 x double> %{{.*}})
+// CHECK-NOT: reassoc
+  return _mm512_reduce_add_pd(__W) + ExtraAddOp;
 }
 
-double test_mm512_reduce_mul_pd(__m512d __W){
+double test_mm512_reduce_mul_pd(__m512d __W, double ExtraMulOp){
 // CHECK-LABEL: @test_mm512_reduce_mul_pd(
-// CHECK:    call double @llvm.vector.reduce.fmul.v8f64(double 1.000000e+00, <8 x double> %{{.*}})
-  return _mm512_reduce_mul_pd(__W); 
+// CHECK-NOT: reassoc
+// CHECK:    call reassoc double @llvm.vector.reduce.fmul.v8f64(double 1.000000e+00, <8 x double> %{{.*}})
+// CHECK-NOT: reassoc
+  return _mm512_reduce_mul_pd(__W) * ExtraMulOp;
 }
 
 float test_mm512_reduce_add_ps(__m512 __W){
 // CHECK-LABEL: @test_mm512_reduce_add_ps(
-// CHECK:    call float @llvm.vector.reduce.fadd.v16f32(float 0.000000e+00, <16 x float> %{{.*}})
-  return _mm512_reduce_add_ps(__W); 
+// CHECK:    call reassoc float @llvm.vector.reduce.fadd.v16f32(float -0.000000e+00, <16 x float> %{{.*}})
+  return _mm512_reduce_add_ps(__W);
 }
 
 float test_mm512_reduce_mul_ps(__m512 __W){
 // CHECK-LABEL: @test_mm512_reduce_mul_ps(
-// CHECK:    call float @llvm.vector.reduce.fmul.v16f32(float 1.000000e+00, <16 x float> %{{.*}})
-  return _mm512_reduce_mul_ps(__W); 
+// CHECK:    call reassoc float @llvm.vector.reduce.fmul.v16f32(float 1.000000e+00, <16 x float> %{{.*}})
+  return _mm512_reduce_mul_ps(__W);
 }
 
 double test_mm512_mask_reduce_add_pd(__mmask8 __M, __m512d __W){
 // CHECK-LABEL: @test_mm512_mask_reduce_add_pd(
 // CHECK:    bitcast i8 %{{.*}} to <8 x i1>
 // CHECK:    select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
-// CHECK:    call double @llvm.vector.reduce.fadd.v8f64(double 0.000000e+00, <8 x double> %{{.*}})
-  return _mm512_mask_reduce_add_pd(__M, __W); 
+// CHECK:    call reassoc double @llvm.vector.reduce.fadd.v8f64(double -0.000000e+00, <8 x double> %{{.*}})
+  return _mm512_mask_reduce_add_pd(__M, __W);
 }
 
 double test_mm512_mask_reduce_mul_pd(__mmask8 __M, __m512d __W){
 // CHECK-LABEL: @test_mm512_mask_reduce_mul_pd(
 // CHECK:    bitcast i8 %{{.*}} to <8 x i1>
 // CHECK:    select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
-// CHECK:    call double @llvm.vector.reduce.fmul.v8f64(double 1.000000e+00, <8 x double> %{{.*}})
-  return _mm512_mask_reduce_mul_pd(__M, __W); 
+// CHECK:    call reassoc double @llvm.vector.reduce.fmul.v8f64(double 1.000000e+00, <8 x double> %{{.*}})
+  return _mm512_mask_reduce_mul_pd(__M, __W);
 }
 
 float test_mm512_mask_reduce_add_ps(__mmask16 __M, __m512 __W){
 // CHECK-LABEL: @test_mm512_mask_reduce_add_ps(
 // CHECK:    bitcast i16 %{{.*}} to <16 x i1>
 // CHECK:    select <16 x i1> %{{.*}}, <16 x float> {{.*}}, <16 x float> {{.*}}
-// CHECK:    call float @llvm.vector.reduce.fadd.v16f32(float 0.000000e+00, <16 x float> %{{.*}})
-  return _mm512_mask_reduce_add_ps(__M, __W); 
+// CHECK:    call reassoc float @llvm.vector.reduce.fadd.v16f32(float -0.000000e+00, <16 x float> %{{.*}})
+  return _mm512_mask_reduce_add_ps(__M, __W);
 }
 
 float test_mm512_mask_reduce_mul_ps(__mmask16 __M, __m512 __W){
 // CHECK-LABEL: @test_mm512_mask_reduce_mul_ps(
 // CHECK:    bitcast i16 %{{.*}} to <16 x i1>
 // CHECK:    select <16 x i1> %{{.*}}, <16 x float> {{.*}}, <16 x float> %{{.*}}
-// CHECK:    call float @llvm.vector.reduce.fmul.v16f32(float 1.000000e+00, <16 x float> %{{.*}})
-  return _mm512_mask_reduce_mul_ps(__M, __W); 
+// CHECK:    call reassoc float @llvm.vector.reduce.fmul.v16f32(float 1.000000e+00, <16 x float> %{{.*}})
+  return _mm512_mask_reduce_mul_ps(__M, __W);
 }

From 23a40f7a595d5f07cb08d1d28987d7b4ca3ed766 Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Mon, 8 Feb 2021 13:31:05 -0800
Subject: [PATCH 107/318] [Verifier] Allow DW_TAG_class_type/DW_TAG_union_type
 to have no filename

`clang/lib/CodeGen/CGOpenMPRuntime.cpp` synthesized union
(`distinct !DICompositeType(tag: DW_TAG_union_type, name: "kmp_cmplrdata_t", size: 64, elements: <0x62b690>)`)
does not have meaningful filename/line number.

D94735 dropped the previously arbitrary and untested filename/line from the union and caused a verifier error here.

This fixes `check-libarcher` failures.

Differential Revision: https://reviews.llvm.org/D96212

(cherry picked from commit ad60802a7187aa39b0374536be3fa176fe3d6256)
---
 llvm/lib/IR/Verifier.cpp | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index 100e881c8fa8..6dd299ee9845 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -1070,12 +1070,6 @@ void Verifier::visitDICompositeType(const DICompositeType &N) {
   if (auto *Params = N.getRawTemplateParams())
     visitTemplateParams(N, *Params);
 
-  if (N.getTag() == dwarf::DW_TAG_class_type ||
-      N.getTag() == dwarf::DW_TAG_union_type) {
-    AssertDI(N.getFile() && !N.getFile()->getFilename().empty(),
-             "class/union requires a filename", &N, N.getFile());
-  }
-
   if (auto *D = N.getRawDiscriminator()) {
     AssertDI(isa<DIDerivedType>(D) && N.getTag() == dwarf::DW_TAG_variant_part,
              "discriminator can only appear on variant part");

From 0d6859eb70c08704e71c844ea94c987f7b90f8d0 Mon Sep 17 00:00:00 2001
From: Qiu Chaofan <qiucofan@cn.ibm.com>
Date: Fri, 5 Feb 2021 20:33:56 +0800
Subject: [PATCH 108/318] Revert "[PowerPC] [Clang] Enable float128 feature on
 P9 by default"

Commit 6bf29dbb enables float128 feature by default for Power9 targets.
But float128 may cause build failure in libcxx testing. Revert this
commit first to unblock LLVM 12 release.

(cherry picked from commit 447dc856b243b99ce70019ba1187c39746f4e0e9)
---
 clang/lib/Basic/Targets/PPC.cpp            | 3 ---
 clang/test/Driver/ppc-f128-support-check.c | 4 ++--
 2 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/clang/lib/Basic/Targets/PPC.cpp b/clang/lib/Basic/Targets/PPC.cpp
index cfede6e6e756..ff09c0fa2a23 100644
--- a/clang/lib/Basic/Targets/PPC.cpp
+++ b/clang/lib/Basic/Targets/PPC.cpp
@@ -318,9 +318,6 @@ bool PPCTargetInfo::initFeatureMap(
                         .Case("pwr9", true)
                         .Case("pwr8", true)
                         .Default(false);
-  Features["float128"] = llvm::StringSwitch<bool>(CPU)
-                        .Case("pwr9", true)
-                        .Default(false);
 
   Features["spe"] = llvm::StringSwitch<bool>(CPU)
                         .Case("8548", true)
diff --git a/clang/test/Driver/ppc-f128-support-check.c b/clang/test/Driver/ppc-f128-support-check.c
index 2e4b7a7ae09c..24748905612f 100644
--- a/clang/test/Driver/ppc-f128-support-check.c
+++ b/clang/test/Driver/ppc-f128-support-check.c
@@ -1,7 +1,7 @@
 // RUN: not %clang -target powerpc64le-unknown-linux-gnu -fsyntax-only \
-// RUN:   -mcpu=pwr9 %s 2>&1 | FileCheck %s --check-prefix=HASF128
+// RUN:   -mcpu=pwr9 -mfloat128 %s 2>&1 | FileCheck %s --check-prefix=HASF128
 // RUN: not %clang -target powerpc64le-unknown-linux-gnu -fsyntax-only \
-// RUN:   -mcpu=power9 %s 2>&1 | FileCheck %s --check-prefix=HASF128
+// RUN:   -mcpu=power9 -mfloat128 %s 2>&1 | FileCheck %s --check-prefix=HASF128
 
 // RUN: not %clang -target powerpc64le-unknown-linux-gnu -fsyntax-only \
 // RUN:   -mcpu=pwr8 -mfloat128 %s 2>&1 | FileCheck %s --check-prefix=NOF128

From 075e2629b02d194313a069ce1ee9a8d40c7bd66f Mon Sep 17 00:00:00 2001
From: Stephen Kelly <steveire@gmail.com>
Date: Wed, 27 Jan 2021 23:47:05 +0000
Subject: [PATCH 109/318] [ASTMatchers] Avoid pathological traversal over
 nested lambdas

Differential Revision: https://reviews.llvm.org/D95573

(cherry picked from commit 6f0df3cddb3e3f38df1baa7aa4d743a74bb46688)
---
 clang/include/clang/AST/RecursiveASTVisitor.h | 11 +++
 clang/lib/ASTMatchers/ASTMatchFinder.cpp      |  8 ++-
 .../ASTMatchers/ASTMatchersTraversalTest.cpp  | 72 +++++++++++++++++++
 3 files changed, 89 insertions(+), 2 deletions(-)

diff --git a/clang/include/clang/AST/RecursiveASTVisitor.h b/clang/include/clang/AST/RecursiveASTVisitor.h
index 505ea700fd0e..db2ef21f4364 100644
--- a/clang/include/clang/AST/RecursiveASTVisitor.h
+++ b/clang/include/clang/AST/RecursiveASTVisitor.h
@@ -186,6 +186,9 @@ template <typename Derived> class RecursiveASTVisitor {
   /// code, e.g., implicit constructors and destructors.
   bool shouldVisitImplicitCode() const { return false; }
 
+  /// Return whether this visitor should recurse into lambda body
+  bool shouldVisitLambdaBody() const { return true; }
+
   /// Return whether this visitor should traverse post-order.
   bool shouldTraversePostOrder() const { return false; }
 
@@ -2057,6 +2060,14 @@ bool RecursiveASTVisitor<Derived>::TraverseFunctionHelper(FunctionDecl *D) {
       // by clang.
       (!D->isDefaulted() || getDerived().shouldVisitImplicitCode());
 
+  if (const auto *MD = dyn_cast<CXXMethodDecl>(D)) {
+    if (const CXXRecordDecl *RD = MD->getParent()) {
+      if (RD->isLambda()) {
+        VisitBody = VisitBody && getDerived().shouldVisitLambdaBody();
+      }
+    }
+  }
+
   if (VisitBody) {
     TRY_TO(TraverseStmt(D->getBody())); // Function body.
   }
diff --git a/clang/lib/ASTMatchers/ASTMatchFinder.cpp b/clang/lib/ASTMatchers/ASTMatchFinder.cpp
index 8ddd3c87e09d..5034203840fc 100644
--- a/clang/lib/ASTMatchers/ASTMatchFinder.cpp
+++ b/clang/lib/ASTMatchers/ASTMatchFinder.cpp
@@ -556,9 +556,9 @@ class MatchASTVisitor : public RecursiveASTVisitor<MatchASTVisitor>,
         if (LE->hasExplicitResultType())
           TraverseTypeLoc(Proto.getReturnLoc());
         TraverseStmt(LE->getTrailingRequiresClause());
-
-        TraverseStmt(LE->getBody());
       }
+
+      TraverseStmt(LE->getBody());
       return true;
     }
     return RecursiveASTVisitor<MatchASTVisitor>::dataTraverseNode(S, Queue);
@@ -697,6 +697,10 @@ class MatchASTVisitor : public RecursiveASTVisitor<MatchASTVisitor>,
   bool shouldVisitTemplateInstantiations() const { return true; }
   bool shouldVisitImplicitCode() const { return true; }
 
+  // We visit the lambda body explicitly, so instruct the RAV
+  // to not visit it on our behalf too.
+  bool shouldVisitLambdaBody() const { return false; }
+
   bool IsMatchingInASTNodeNotSpelledInSource() const override {
     return TraversingASTNodeNotSpelledInSource;
   }
diff --git a/clang/unittests/ASTMatchers/ASTMatchersTraversalTest.cpp b/clang/unittests/ASTMatchers/ASTMatchersTraversalTest.cpp
index 92bf244b0e4a..8004599e01a2 100644
--- a/clang/unittests/ASTMatchers/ASTMatchersTraversalTest.cpp
+++ b/clang/unittests/ASTMatchers/ASTMatchersTraversalTest.cpp
@@ -3853,6 +3853,78 @@ void binop()
   }
 }
 
+TEST(IgnoringImpCasts, PathologicalLambda) {
+
+  // Test that deeply nested lambdas are not a performance penalty
+  StringRef Code = R"cpp(
+void f() {
+  [] {
+  [] {
+  [] {
+  [] {
+  [] {
+  [] {
+  [] {
+  [] {
+  [] {
+  [] {
+  [] {
+  [] {
+  [] {
+  [] {
+  [] {
+  [] {
+  [] {
+  [] {
+  [] {
+  [] {
+  [] {
+  [] {
+  [] {
+  [] {
+  [] {
+  [] {
+  [] {
+  [] {
+  [] {
+    int i = 42;
+    (void)i;
+  }();
+  }();
+  }();
+  }();
+  }();
+  }();
+  }();
+  }();
+  }();
+  }();
+  }();
+  }();
+  }();
+  }();
+  }();
+  }();
+  }();
+  }();
+  }();
+  }();
+  }();
+  }();
+  }();
+  }();
+  }();
+  }();
+  }();
+  }();
+  }();
+}
+  )cpp";
+
+  EXPECT_TRUE(matches(Code, integerLiteral(equals(42))));
+  EXPECT_TRUE(matches(Code, functionDecl(hasDescendant(integerLiteral(equals(42))))));
+}
+
 TEST(IgnoringImpCasts, MatchesImpCasts) {
   // This test checks that ignoringImpCasts matches when implicit casts are
   // present and its inner matcher alone does not match.

From 94607512357da5c727d210cc34e642156429d19c Mon Sep 17 00:00:00 2001
From: Stephen Kelly <steveire@gmail.com>
Date: Thu, 28 Jan 2021 23:40:16 +0000
Subject: [PATCH 110/318] Ensure that we traverse non-op() method bodys of
 lambdas

Differential Revision: https://reviews.llvm.org/D95644

(cherry picked from commit 43cc4f15008f8c700497d3d2b7020bfd29f5750f)
---
 clang/include/clang/AST/RecursiveASTVisitor.h |  3 +-
 .../ASTMatchers/ASTMatchersTraversalTest.cpp  | 36 +++++++++++++++++++
 2 files changed, 38 insertions(+), 1 deletion(-)

diff --git a/clang/include/clang/AST/RecursiveASTVisitor.h b/clang/include/clang/AST/RecursiveASTVisitor.h
index db2ef21f4364..7870cea198a7 100644
--- a/clang/include/clang/AST/RecursiveASTVisitor.h
+++ b/clang/include/clang/AST/RecursiveASTVisitor.h
@@ -2062,7 +2062,8 @@ bool RecursiveASTVisitor<Derived>::TraverseFunctionHelper(FunctionDecl *D) {
 
   if (const auto *MD = dyn_cast<CXXMethodDecl>(D)) {
     if (const CXXRecordDecl *RD = MD->getParent()) {
-      if (RD->isLambda()) {
+      if (RD->isLambda() &&
+          declaresSameEntity(RD->getLambdaCallOperator(), MD)) {
         VisitBody = VisitBody && getDerived().shouldVisitLambdaBody();
       }
     }
diff --git a/clang/unittests/ASTMatchers/ASTMatchersTraversalTest.cpp b/clang/unittests/ASTMatchers/ASTMatchersTraversalTest.cpp
index 8004599e01a2..a3a09c426673 100644
--- a/clang/unittests/ASTMatchers/ASTMatchersTraversalTest.cpp
+++ b/clang/unittests/ASTMatchers/ASTMatchersTraversalTest.cpp
@@ -474,6 +474,42 @@ TEST(Matcher, CapturesThis) {
   EXPECT_TRUE(notMatches("void f() { int z = 3; [&z](){}; }", HasCaptureThis));
 }
 
+TEST(Matcher, MatchesMethodsOnLambda) {
+  StringRef Code = R"cpp(
+struct A {
+  ~A() {}
+};
+void foo()
+{
+  A a;
+  auto l = [a] { };
+  auto lCopy = l;
+  auto lPtrDecay = +[] { };
+  (void)lPtrDecay;
+}
+)cpp";
+
+  EXPECT_TRUE(matches(
+      Code, cxxConstructorDecl(
+                hasBody(compoundStmt()),
+                hasAncestor(lambdaExpr(hasAncestor(varDecl(hasName("l"))))),
+                isCopyConstructor())));
+  EXPECT_TRUE(matches(
+      Code, cxxConstructorDecl(
+                hasBody(compoundStmt()),
+                hasAncestor(lambdaExpr(hasAncestor(varDecl(hasName("l"))))),
+                isMoveConstructor())));
+  EXPECT_TRUE(matches(
+      Code, cxxDestructorDecl(
+                hasBody(compoundStmt()),
+                hasAncestor(lambdaExpr(hasAncestor(varDecl(hasName("l"))))))));
+  EXPECT_TRUE(matches(
+      Code, cxxConversionDecl(hasBody(compoundStmt(has(returnStmt(
+                                  hasReturnValue(implicitCastExpr()))))),
+                              hasAncestor(lambdaExpr(hasAncestor(
+                                  varDecl(hasName("lPtrDecay"))))))));
+}
+
 TEST(Matcher, isClassMessage) {
   EXPECT_TRUE(matchesObjC(
       "@interface NSString +(NSString *) stringWithFormat; @end "

From 8c24a88dee6426ffa98cd820f1c8f4803bec1d86 Mon Sep 17 00:00:00 2001
From: Stephen Kelly <steveire@gmail.com>
Date: Wed, 27 Jan 2021 22:03:23 +0000
Subject: [PATCH 111/318] [ASTMatchers] Fix traversal below range-for elements

Differential Revision: https://reviews.llvm.org/D95562

(cherry picked from commit 79125085f16540579d27c7e4987f63eef9c4aa23)
---
 clang/lib/ASTMatchers/ASTMatchFinder.cpp      | 30 ++++++---
 .../ASTMatchers/ASTMatchersTraversalTest.cpp  | 64 +++++++++++++++++++
 2 files changed, 84 insertions(+), 10 deletions(-)

diff --git a/clang/lib/ASTMatchers/ASTMatchFinder.cpp b/clang/lib/ASTMatchers/ASTMatchFinder.cpp
index 5034203840fc..89e83ee61574 100644
--- a/clang/lib/ASTMatchers/ASTMatchFinder.cpp
+++ b/clang/lib/ASTMatchers/ASTMatchFinder.cpp
@@ -243,10 +243,14 @@ class MatchChildASTVisitor
       return true;
     ScopedIncrement ScopedDepth(&CurrentDepth);
     if (auto *Init = Node->getInit())
-      if (!match(*Init))
+      if (!traverse(*Init))
         return false;
-    if (!match(*Node->getLoopVariable()) || !match(*Node->getRangeInit()) ||
-        !match(*Node->getBody()))
+    if (!match(*Node->getLoopVariable()))
+      return false;
+    if (match(*Node->getRangeInit()))
+      if (!VisitorBase::TraverseStmt(Node->getRangeInit()))
+        return false;
+    if (!match(*Node->getBody()))
       return false;
     return VisitorBase::TraverseStmt(Node->getBody());
   }
@@ -488,15 +492,21 @@ class MatchASTVisitor : public RecursiveASTVisitor<MatchASTVisitor>,
 
   bool dataTraverseNode(Stmt *S, DataRecursionQueue *Queue) {
     if (auto *RF = dyn_cast<CXXForRangeStmt>(S)) {
-      for (auto *SubStmt : RF->children()) {
-        if (SubStmt == RF->getInit() || SubStmt == RF->getLoopVarStmt() ||
-            SubStmt == RF->getRangeInit() || SubStmt == RF->getBody()) {
-          TraverseStmt(SubStmt, Queue);
-        } else {
-          ASTNodeNotSpelledInSourceScope RAII(this, true);
-          TraverseStmt(SubStmt, Queue);
+      {
+        ASTNodeNotAsIsSourceScope RAII(this, true);
+        TraverseStmt(RF->getInit());
+        // Don't traverse under the loop variable
+        match(*RF->getLoopVariable());
+        TraverseStmt(RF->getRangeInit());
+      }
+      {
+        ASTNodeNotSpelledInSourceScope RAII(this, true);
+        for (auto *SubStmt : RF->children()) {
+          if (SubStmt != RF->getBody())
+            TraverseStmt(SubStmt);
         }
       }
+      TraverseStmt(RF->getBody());
       return true;
     } else if (auto *RBO = dyn_cast<CXXRewrittenBinaryOperator>(S)) {
       {
diff --git a/clang/unittests/ASTMatchers/ASTMatchersTraversalTest.cpp b/clang/unittests/ASTMatchers/ASTMatchersTraversalTest.cpp
index a3a09c426673..cbea274cecc9 100644
--- a/clang/unittests/ASTMatchers/ASTMatchersTraversalTest.cpp
+++ b/clang/unittests/ASTMatchers/ASTMatchersTraversalTest.cpp
@@ -2820,6 +2820,36 @@ struct CtorInitsNonTrivial : NonTrivial
     EXPECT_FALSE(matches(Code, traverse(TK_IgnoreUnlessSpelledInSource, M)));
   }
 
+  Code = R"cpp(
+  struct Range {
+    int* begin() const;
+    int* end() const;
+  };
+  Range getRange(int);
+
+  void rangeFor()
+  {
+    for (auto i : getRange(42))
+    {
+    }
+  }
+  )cpp";
+  {
+    auto M = integerLiteral(equals(42));
+    EXPECT_TRUE(matches(Code, traverse(TK_AsIs, M)));
+    EXPECT_TRUE(matches(Code, traverse(TK_IgnoreUnlessSpelledInSource, M)));
+  }
+  {
+    auto M = callExpr(hasDescendant(integerLiteral(equals(42))));
+    EXPECT_TRUE(matches(Code, traverse(TK_AsIs, M)));
+    EXPECT_TRUE(matches(Code, traverse(TK_IgnoreUnlessSpelledInSource, M)));
+  }
+  {
+    auto M = compoundStmt(hasDescendant(integerLiteral(equals(42))));
+    EXPECT_TRUE(matches(Code, traverse(TK_AsIs, M)));
+    EXPECT_TRUE(matches(Code, traverse(TK_IgnoreUnlessSpelledInSource, M)));
+  }
+
   Code = R"cpp(
   void rangeFor()
   {
@@ -2891,6 +2921,40 @@ struct CtorInitsNonTrivial : NonTrivial
         matchesConditionally(Code, traverse(TK_IgnoreUnlessSpelledInSource, M),
                              true, {"-std=c++20"}));
   }
+
+  Code = R"cpp(
+  struct Range {
+    int* begin() const;
+    int* end() const;
+  };
+  Range getRange(int);
+
+  int getNum(int);
+
+  void rangeFor()
+  {
+    for (auto j = getNum(42); auto i : getRange(j))
+    {
+    }
+  }
+  )cpp";
+  {
+    auto M = integerLiteral(equals(42));
+    EXPECT_TRUE(
+        matchesConditionally(Code, traverse(TK_AsIs, M), true, {"-std=c++20"}));
+    EXPECT_TRUE(
+        matchesConditionally(Code, traverse(TK_IgnoreUnlessSpelledInSource, M),
+                             true, {"-std=c++20"}));
+  }
+  {
+    auto M = compoundStmt(hasDescendant(integerLiteral(equals(42))));
+    EXPECT_TRUE(
+        matchesConditionally(Code, traverse(TK_AsIs, M), true, {"-std=c++20"}));
+    EXPECT_TRUE(
+        matchesConditionally(Code, traverse(TK_IgnoreUnlessSpelledInSource, M),
+                             true, {"-std=c++20"}));
+  }
+
   Code = R"cpp(
 void hasDefaultArg(int i, int j = 0)
 {

From aaf23abe9d57af638644cedbc1ca6132f140a57d Mon Sep 17 00:00:00 2001
From: Stephen Kelly <steveire@gmail.com>
Date: Thu, 28 Jan 2021 13:12:43 +0000
Subject: [PATCH 112/318] Fix traversal with hasDescendant into lambdas

Differential Revision: https://reviews.llvm.org/D95607

(cherry picked from commit bb57a3422a09dcdd572ccb42767a0dabb5f966dd)
---
 clang/lib/ASTMatchers/ASTMatchFinder.cpp          |  2 +-
 .../ASTMatchers/ASTMatchersTraversalTest.cpp      | 15 +++++++++++++++
 2 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/clang/lib/ASTMatchers/ASTMatchFinder.cpp b/clang/lib/ASTMatchers/ASTMatchFinder.cpp
index 89e83ee61574..41be3738e707 100644
--- a/clang/lib/ASTMatchers/ASTMatchFinder.cpp
+++ b/clang/lib/ASTMatchers/ASTMatchFinder.cpp
@@ -295,7 +295,7 @@ class MatchChildASTVisitor
     if (!match(*Node->getBody()))
       return false;
 
-    return true;
+    return VisitorBase::TraverseStmt(Node->getBody());
   }
 
   bool shouldVisitTemplateInstantiations() const { return true; }
diff --git a/clang/unittests/ASTMatchers/ASTMatchersTraversalTest.cpp b/clang/unittests/ASTMatchers/ASTMatchersTraversalTest.cpp
index cbea274cecc9..c67c40ed960a 100644
--- a/clang/unittests/ASTMatchers/ASTMatchersTraversalTest.cpp
+++ b/clang/unittests/ASTMatchers/ASTMatchersTraversalTest.cpp
@@ -3220,6 +3220,12 @@ void func14() {
   float i = 42.0;
 }
 
+void func15() {
+  int count = 0;
+  auto l = [&] { ++count; };
+  (void)l;
+}
+
 )cpp";
 
   EXPECT_TRUE(
@@ -3404,6 +3410,15 @@ void func14() {
                functionDecl(hasName("func14"), hasDescendant(floatLiteral()))),
       langCxx20OrLater()));
 
+  EXPECT_TRUE(matches(
+      Code,
+      traverse(TK_IgnoreUnlessSpelledInSource,
+               compoundStmt(
+                   hasDescendant(varDecl(hasName("count")).bind("countVar")),
+                   hasDescendant(
+                       declRefExpr(to(varDecl(equalsBoundNode("countVar"))))))),
+      langCxx20OrLater()));
+
   Code = R"cpp(
 void foo() {
     int explicit_captured = 0;

From cd25aa9e409e961d64f5fb26bff3882f3a4db2d3 Mon Sep 17 00:00:00 2001
From: Stephen Kelly <steveire@gmail.com>
Date: Sat, 30 Jan 2021 15:46:08 +0000
Subject: [PATCH 113/318] [ASTMatchers] Fix definition of decompositionDecl

(cherry picked from commit b10d445307a0f3c7e5522836b4331090aacaf349)
---
 clang/include/clang/ASTMatchers/ASTMatchers.h | 2 +-
 clang/lib/ASTMatchers/ASTMatchersInternal.cpp | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/clang/include/clang/ASTMatchers/ASTMatchers.h b/clang/include/clang/ASTMatchers/ASTMatchers.h
index 6f6dfab59a39..031fa4682c3a 100644
--- a/clang/include/clang/ASTMatchers/ASTMatchers.h
+++ b/clang/include/clang/ASTMatchers/ASTMatchers.h
@@ -344,7 +344,7 @@ extern const internal::VariadicAllOfMatcher<Decl> decl;
 ///   int number = 42;
 ///   auto [foo, bar] = std::make_pair{42, 42};
 /// \endcode
-extern const internal::VariadicAllOfMatcher<DecompositionDecl>
+extern const internal::VariadicDynCastAllOfMatcher<Decl, DecompositionDecl>
     decompositionDecl;
 
 /// Matches a declaration of a linkage specification.
diff --git a/clang/lib/ASTMatchers/ASTMatchersInternal.cpp b/clang/lib/ASTMatchers/ASTMatchersInternal.cpp
index 6c7e14e3499a..705f1cdf3153 100644
--- a/clang/lib/ASTMatchers/ASTMatchersInternal.cpp
+++ b/clang/lib/ASTMatchers/ASTMatchersInternal.cpp
@@ -732,7 +732,7 @@ const internal::VariadicDynCastAllOfMatcher<Decl, TypeAliasDecl> typeAliasDecl;
 const internal::VariadicDynCastAllOfMatcher<Decl, TypeAliasTemplateDecl>
     typeAliasTemplateDecl;
 const internal::VariadicAllOfMatcher<Decl> decl;
-const internal::VariadicAllOfMatcher<DecompositionDecl> decompositionDecl;
+const internal::VariadicDynCastAllOfMatcher<Decl, DecompositionDecl> decompositionDecl;
 const internal::VariadicDynCastAllOfMatcher<Decl, LinkageSpecDecl>
     linkageSpecDecl;
 const internal::VariadicDynCastAllOfMatcher<Decl, NamedDecl> namedDecl;

From e76f4385c2e1f95781584e0ac12a586dece8223a Mon Sep 17 00:00:00 2001
From: Stephen Kelly <steveire@gmail.com>
Date: Sat, 30 Jan 2021 01:36:40 +0000
Subject: [PATCH 114/318] [ASTMatchers] Fix matching after generic top-level
 matcher

With a matcher like

  expr(anyOf(integerLiteral(equals(42)), unless(expr())))

and code such as

  struct B {
    B(int);
  };

  B func1() { return 42; }

the top-level expr() would match each of the nodes which are not spelled
in the source and then ignore-traverse to match the integerLiteral node.
This would result in multiple results reported for the integerLiteral.

Fix that by only running matching logic on nodes which are not skipped
with the top-level matcher.

Differential Revision: https://reviews.llvm.org/D95735

(cherry picked from commit d6a06365cf12bebe20a7d65cf3894608efc089b4)
---
 clang/lib/ASTMatchers/ASTMatchFinder.cpp      |  8 +++
 .../ASTMatchers/ASTMatchersTraversalTest.cpp  | 72 +++++++++++++++++++
 2 files changed, 80 insertions(+)

diff --git a/clang/lib/ASTMatchers/ASTMatchFinder.cpp b/clang/lib/ASTMatchers/ASTMatchFinder.cpp
index 41be3738e707..69957a952d17 100644
--- a/clang/lib/ASTMatchers/ASTMatchFinder.cpp
+++ b/clang/lib/ASTMatchers/ASTMatchFinder.cpp
@@ -837,6 +837,14 @@ class MatchASTVisitor : public RecursiveASTVisitor<MatchASTVisitor>,
       if (EnableCheckProfiling)
         Timer.setBucket(&TimeByBucket[MP.second->getID()]);
       BoundNodesTreeBuilder Builder;
+
+      {
+        TraversalKindScope RAII(getASTContext(), MP.first.getTraversalKind());
+        if (getASTContext().getParentMapContext().traverseIgnored(DynNode) !=
+            DynNode)
+          continue;
+      }
+
       if (MP.first.matches(DynNode, this, &Builder)) {
         MatchVisitor Visitor(ActiveASTContext, MP.second);
         Builder.visitMatches(&Visitor);
diff --git a/clang/unittests/ASTMatchers/ASTMatchersTraversalTest.cpp b/clang/unittests/ASTMatchers/ASTMatchersTraversalTest.cpp
index c67c40ed960a..06c2bbc29e5c 100644
--- a/clang/unittests/ASTMatchers/ASTMatchersTraversalTest.cpp
+++ b/clang/unittests/ASTMatchers/ASTMatchersTraversalTest.cpp
@@ -2519,6 +2519,78 @@ template<> bool timesTwo<bool>(bool){
     EXPECT_TRUE(matches(Code, traverse(TK_AsIs, M)));
     EXPECT_TRUE(matches(Code, traverse(TK_IgnoreUnlessSpelledInSource, M)));
   }
+
+  Code = R"cpp(
+struct B {
+  B(int);
+};
+
+B func1() { return 42; }
+  )cpp";
+  {
+    auto M = expr(ignoringImplicit(integerLiteral(equals(42)).bind("intLit")));
+    EXPECT_TRUE(matchAndVerifyResultTrue(
+        Code, traverse(TK_AsIs, M),
+        std::make_unique<VerifyIdIsBoundTo<Expr>>("intLit", 1)));
+    EXPECT_TRUE(matchAndVerifyResultTrue(
+        Code, traverse(TK_IgnoreUnlessSpelledInSource, M),
+        std::make_unique<VerifyIdIsBoundTo<Expr>>("intLit", 1)));
+  }
+  {
+    auto M = expr(unless(integerLiteral(equals(24)))).bind("intLit");
+    EXPECT_TRUE(matchAndVerifyResultTrue(
+        Code, traverse(TK_AsIs, M),
+        std::make_unique<VerifyIdIsBoundTo<Expr>>("intLit", 7)));
+    EXPECT_TRUE(matchAndVerifyResultTrue(
+        Code, traverse(TK_IgnoreUnlessSpelledInSource, M),
+        std::make_unique<VerifyIdIsBoundTo<Expr>>("intLit", 1)));
+  }
+  {
+    auto M =
+        expr(anyOf(integerLiteral(equals(42)).bind("intLit"), unless(expr())));
+    EXPECT_TRUE(matchAndVerifyResultTrue(
+        Code, traverse(TK_AsIs, M),
+        std::make_unique<VerifyIdIsBoundTo<Expr>>("intLit", 1)));
+    EXPECT_TRUE(matchAndVerifyResultTrue(
+        Code, traverse(TK_IgnoreUnlessSpelledInSource, M),
+        std::make_unique<VerifyIdIsBoundTo<Expr>>("intLit", 1)));
+  }
+  {
+    auto M = expr(allOf(integerLiteral(equals(42)).bind("intLit"), expr()));
+    EXPECT_TRUE(matchAndVerifyResultTrue(
+        Code, traverse(TK_AsIs, M),
+        std::make_unique<VerifyIdIsBoundTo<Expr>>("intLit", 1)));
+    EXPECT_TRUE(matchAndVerifyResultTrue(
+        Code, traverse(TK_IgnoreUnlessSpelledInSource, M),
+        std::make_unique<VerifyIdIsBoundTo<Expr>>("intLit", 1)));
+  }
+  {
+    auto M = expr(integerLiteral(equals(42)).bind("intLit"), expr());
+    EXPECT_TRUE(matchAndVerifyResultTrue(
+        Code, traverse(TK_AsIs, M),
+        std::make_unique<VerifyIdIsBoundTo<Expr>>("intLit", 1)));
+    EXPECT_TRUE(matchAndVerifyResultTrue(
+        Code, traverse(TK_IgnoreUnlessSpelledInSource, M),
+        std::make_unique<VerifyIdIsBoundTo<Expr>>("intLit", 1)));
+  }
+  {
+    auto M = expr(optionally(integerLiteral(equals(42)).bind("intLit")));
+    EXPECT_TRUE(matchAndVerifyResultTrue(
+        Code, traverse(TK_AsIs, M),
+        std::make_unique<VerifyIdIsBoundTo<Expr>>("intLit", 1)));
+    EXPECT_TRUE(matchAndVerifyResultTrue(
+        Code, traverse(TK_IgnoreUnlessSpelledInSource, M),
+        std::make_unique<VerifyIdIsBoundTo<Expr>>("intLit", 1)));
+  }
+  {
+    auto M = expr().bind("allExprs");
+    EXPECT_TRUE(matchAndVerifyResultTrue(
+        Code, traverse(TK_AsIs, M),
+        std::make_unique<VerifyIdIsBoundTo<Expr>>("allExprs", 7)));
+    EXPECT_TRUE(matchAndVerifyResultTrue(
+        Code, traverse(TK_IgnoreUnlessSpelledInSource, M),
+        std::make_unique<VerifyIdIsBoundTo<Expr>>("allExprs", 1)));
+  }
 }
 
 TEST(Traversal, traverseNoImplicit) {

From 0c01bbf4e9d517f4c6422647e6d7c362b621ea42 Mon Sep 17 00:00:00 2001
From: Nico Weber <thakis@chromium.org>
Date: Fri, 29 Jan 2021 11:20:04 -0500
Subject: [PATCH 115/318] Revert "Disable rosegment for old Android versions."

This reverts commit fae16fc0eed7cf60207901818cfe040116f2ef00.
Breaks building compiler-rt android runtimes with trunk clang
but older NDK, see discussion on https://reviews.llvm.org/D95166

(cherry picked from commit 1608ba09462d877111230e9461b895f696f8fcb1)
---
 clang/lib/Driver/ToolChains/Linux.cpp |  9 ---------
 clang/test/Driver/linux-ld.c          | 14 --------------
 2 files changed, 23 deletions(-)

diff --git a/clang/lib/Driver/ToolChains/Linux.cpp b/clang/lib/Driver/ToolChains/Linux.cpp
index e17a6bd4bdd2..9663a7390ada 100644
--- a/clang/lib/Driver/ToolChains/Linux.cpp
+++ b/clang/lib/Driver/ToolChains/Linux.cpp
@@ -236,15 +236,6 @@ Linux::Linux(const Driver &D, const llvm::Triple &Triple, const ArgList &Args)
     ExtraOpts.push_back("relro");
   }
 
-  if (Triple.isAndroid() && Triple.isAndroidVersionLT(29)) {
-    // https://github.com/android/ndk/issues/1196
-    // The unwinder used by the crash handler on versions of Android prior to
-    // API 29 did not correctly handle binaries built with rosegment, which is
-    // enabled by default for LLD. Android only supports LLD, so it's not an
-    // issue that this flag is not accepted by other linkers.
-    ExtraOpts.push_back("--no-rosegment");
-  }
-
   // Android ARM/AArch64 use max-page-size=4096 to reduce VMA usage. Note, lld
   // from 11 onwards default max-page-size to 65536 for both ARM and AArch64.
   if ((Triple.isARM() || Triple.isAArch64()) && Triple.isAndroid()) {
diff --git a/clang/test/Driver/linux-ld.c b/clang/test/Driver/linux-ld.c
index 0b788ffcb852..24d3c78643f8 100644
--- a/clang/test/Driver/linux-ld.c
+++ b/clang/test/Driver/linux-ld.c
@@ -1089,20 +1089,6 @@
 // CHECK-ANDROID-HASH-STYLE-M: "{{.*}}ld{{(.exe)?}}"
 // CHECK-ANDROID-HASH-STYLE-M: "--hash-style=gnu"
 
-// Check that we pass --no-rosegment for pre-29 Android versions and do not for
-// 29+.
-// RUN: %clang %s -### -o %t.o 2>&1 \
-// RUN:     --target=armv7-linux-android28 \
-// RUN:   | FileCheck --check-prefix=CHECK-ANDROID-ROSEGMENT-28 %s
-// CHECK-ANDROID-ROSEGMENT-28: "{{.*}}ld{{(.exe)?}}"
-// CHECK-ANDROID-ROSEGMENT-28: "--no-rosegment"
-//
-// RUN: %clang %s -### -o %t.o 2>&1 \
-// RUN:     --target=armv7-linux-android29 \
-// RUN:   | FileCheck --check-prefix=CHECK-ANDROID-ROSEGMENT-29 %s
-// CHECK-ANDROID-ROSEGMENT-29: "{{.*}}ld{{(.exe)?}}"
-// CHECK-ANDROID-ROSEGMENT-29-NOT: "--no-rosegment"
-
 // RUN: %clang %s -### -o %t.o 2>&1 \
 // RUN:     --target=armv7-linux-android21 \
 // RUN:   | FileCheck --check-prefix=CHECK-ANDROID-NOEXECSTACK %s

From f290f3bfc5c900bd46992c9419c53a76f08a58cc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20Storsj=C3=B6?= <martin@martin.st>
Date: Tue, 16 Feb 2021 15:00:54 +0200
Subject: [PATCH 116/318] doc: Add a release note for the changed comment char
 for aarch64-msvc targets

This was backported in a6ea391b832573830b011f26013ebaa946032250.
---
 llvm/docs/ReleaseNotes.rst | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst
index f2eb53778406..c1bda3339a9e 100644
--- a/llvm/docs/ReleaseNotes.rst
+++ b/llvm/docs/ReleaseNotes.rst
@@ -111,6 +111,10 @@ During this release ...
   ``:lo12:`` relocation specifiers, to allow the assembly output
   to actually be assembled.
 
+* Changed the assembly comment string for MSVC targets to ``//`` (consistent
+  with the MinGW and ELF targets), freeing up ``;`` to be used as
+  statement separator.
+
 Changes to the ARM Backend
 --------------------------
 

From d5d089bf08c9181134d00e74d61a12f808c3e0c3 Mon Sep 17 00:00:00 2001
From: Maxim Kuvyrkov <maxim.kuvyrkov@linaro.org>
Date: Fri, 12 Feb 2021 09:47:37 +0000
Subject: [PATCH 117/318] Fix exegesis build on aarch64-windows-msvc host

Include x86 intrinsics only when compiling for x86_64
or i386.  _MSC_VER no longer implies x86.

Reviewed By: gchatelet

Differential Revision: https://reviews.llvm.org/D96498

Fixes: https://bugs.llvm.org/show_bug.cgi?id=49149

(cherry picked from commit 06f53f2f095c45c93d269b5dc010af506f4b0ff4)
---
 llvm/tools/llvm-exegesis/lib/X86/Target.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/tools/llvm-exegesis/lib/X86/Target.cpp b/llvm/tools/llvm-exegesis/lib/X86/Target.cpp
index e8fb025f9611..15fa54e2f6a2 100644
--- a/llvm/tools/llvm-exegesis/lib/X86/Target.cpp
+++ b/llvm/tools/llvm-exegesis/lib/X86/Target.cpp
@@ -26,7 +26,7 @@
 #include <memory>
 #include <string>
 #include <vector>
-#if defined(_MSC_VER)
+#if defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))
 #include <immintrin.h>
 #include <intrin.h>
 #endif

From d44bf3332b314b8f325e6a04c268ae77691f8454 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Sat, 13 Feb 2021 11:33:14 +0000
Subject: [PATCH 118/318] [X86] Add reduced test case for PR49162

(cherry picked from commit 5ca3ef98a71598d368f6f4aaf0b385b50b67ce4a)
---
 llvm/test/CodeGen/X86/pr49162.ll | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)
 create mode 100644 llvm/test/CodeGen/X86/pr49162.ll

diff --git a/llvm/test/CodeGen/X86/pr49162.ll b/llvm/test/CodeGen/X86/pr49162.ll
new file mode 100644
index 000000000000..f186dc7dbe0b
--- /dev/null
+++ b/llvm/test/CodeGen/X86/pr49162.ll
@@ -0,0 +1,29 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown-unknown | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=X64
+
+define i32* @PR49162(i32* %base, i160* %ptr160) {
+; X86-LABEL: PR49162:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl 8(%eax), %ecx
+; X86-NEXT:    shll $16, %ecx
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    sarl $31, %eax
+; X86-NEXT:    shldl $16, %ecx, %eax
+; X86-NEXT:    shll $2, %eax
+; X86-NEXT:    addl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: PR49162:
+; X64:       # %bb.0:
+; X64-NEXT:    leaq -4(%rdi), %rax
+; X64-NEXT:    retq
+  %load160 = load i160, i160* %ptr160, align 4
+  %shl = shl i160 %load160, 80
+  %ashr160 = ashr i160 %shl, 112
+  %trunc = trunc i160 %ashr160 to i64
+  %ashr64 = ashr i64 %trunc, 32
+  %gep = getelementptr inbounds i32, i32* %base, i64 %ashr64
+  ret i32* %gep
+}

From d9910c24fe195c69c68ed2d9ec18cf17a7d60dc7 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Sat, 13 Feb 2021 11:59:52 +0000
Subject: [PATCH 119/318] [DAG] Fix shift amount limit in SimplifyDemandedBits
 trunc(shift(x,c)) to truncated bitwidth

We lost this in D56387/rG69bc0990a9181e6eb86228276d2f59435a7fae67 - where I got the src/dst bitwidths mixed up and assumed getValidShiftAmountConstant would catch it.

Patch by @craig.topper - confirmed by @Carrot that it fixes PR49162

(cherry picked from commit 7ad0c573bd4a68dc81886037457d47daa3d6aa24)
---
 llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp | 2 +-
 llvm/test/CodeGen/X86/pr49162.ll                 | 6 +++++-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 5760132e44a0..7145fc91d5f3 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -2012,7 +2012,7 @@ bool TargetLowering::SimplifyDemandedBits(
 
         const APInt *ShAmtC =
             TLO.DAG.getValidShiftAmountConstant(Src, DemandedElts);
-        if (!ShAmtC)
+        if (!ShAmtC || ShAmtC->uge(BitWidth))
           break;
         uint64_t ShVal = ShAmtC->getZExtValue();
 
diff --git a/llvm/test/CodeGen/X86/pr49162.ll b/llvm/test/CodeGen/X86/pr49162.ll
index f186dc7dbe0b..d3c187883b12 100644
--- a/llvm/test/CodeGen/X86/pr49162.ll
+++ b/llvm/test/CodeGen/X86/pr49162.ll
@@ -17,7 +17,11 @@ define i32* @PR49162(i32* %base, i160* %ptr160) {
 ;
 ; X64-LABEL: PR49162:
 ; X64:       # %bb.0:
-; X64-NEXT:    leaq -4(%rdi), %rax
+; X64-NEXT:    movl 8(%rsi), %eax
+; X64-NEXT:    shll $16, %eax
+; X64-NEXT:    cltq
+; X64-NEXT:    sarq $16, %rax
+; X64-NEXT:    leaq (%rdi,%rax,4), %rax
 ; X64-NEXT:    retq
   %load160 = load i160, i160* %ptr160, align 4
   %shl = shl i160 %load160, 80

From 439fd4bd6a757d2e18b60e502da523b8492f51ab Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Mon, 1 Feb 2021 18:17:25 +0000
Subject: [PATCH 120/318] [X86][AVX] Add 'OK' tests cases for PR48877

(cherry picked from commit e9514429a02b1e4f8b9d54b28a934bfa9bd246ec)
---
 llvm/test/MC/Disassembler/X86/x86-64-avx.txt  | 19 +++++++++++++++++++
 llvm/test/MC/Disassembler/X86/x86-64-avx2.txt |  7 +++++++
 2 files changed, 26 insertions(+)
 create mode 100644 llvm/test/MC/Disassembler/X86/x86-64-avx.txt
 create mode 100644 llvm/test/MC/Disassembler/X86/x86-64-avx2.txt

diff --git a/llvm/test/MC/Disassembler/X86/x86-64-avx.txt b/llvm/test/MC/Disassembler/X86/x86-64-avx.txt
new file mode 100644
index 000000000000..9ebb5335a22d
--- /dev/null
+++ b/llvm/test/MC/Disassembler/X86/x86-64-avx.txt
@@ -0,0 +1,19 @@
+# RUN: llvm-mc --disassemble %s -triple=x86_64-apple-darwin9 | FileCheck %s
+
+# CHECK: vpackusdw (%rax), %xmm2, %xmm1
+0xc4 0xe2 0x69 0x2b 0x08
+
+# CHECK: vphsubd %xmm3, %xmm2, %xmm11
+0xc4 0x62 0x69 0x06 0xdb
+
+# CHECK: vpcmpestri $100, %xmm3, %xmm11
+0xc4 0x63 0x79 0x61 0xdb 0x64
+
+# CHECK: vpcmpestrm $100, %xmm3, %xmm11
+0xc4 0x63 0x79 0x60 0xdb 0x64
+
+# CHECK: vpcmpistri $100, %xmm3, %xmm11
+0xc4 0x63 0x79 0x63 0xdb 0x64
+
+# CHECK: vpcmpistrm $100, %xmm3, %xmm11
+0xc4 0x63 0x79 0x62 0xdb 0x64
diff --git a/llvm/test/MC/Disassembler/X86/x86-64-avx2.txt b/llvm/test/MC/Disassembler/X86/x86-64-avx2.txt
new file mode 100644
index 000000000000..a6a87f100633
--- /dev/null
+++ b/llvm/test/MC/Disassembler/X86/x86-64-avx2.txt
@@ -0,0 +1,7 @@
+# RUN: llvm-mc --disassemble %s -triple=x86_64-apple-darwin9 | FileCheck %s
+
+# CHECK: vpackusdw %ymm0, %ymm2, %ymm1
+0xc4 0xe2 0x6d 0x2b 0xc8
+
+# CHECK: vphsubd %ymm0, %ymm2, %ymm1
+0xc4 0xe2 0x6d 0x06 0xc8

From fa9dc0c60cbc25b9fc4c6c5ff5f4ba81c7a612b4 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Tue, 2 Feb 2021 10:53:28 +0000
Subject: [PATCH 121/318] [X86][AVX] Add missing VEX_WIG tags from
 VPACKUSDW/VPHSUBD/VPCMPISTRI/VPCMPISTRM/VPCMPESTRI/VPCMPESTRM

Fixes PR48877

Differential Revision: https://reviews.llvm.org/D95801

(cherry picked from commit 4d904776a77aa80342c65cf72a962920cc9d1fa9)
---
 llvm/lib/Target/X86/X86InstrSSE.td            | 16 ++++++++--------
 llvm/test/MC/Disassembler/X86/x86-64-avx.txt  | 18 ++++++++++++++++++
 llvm/test/MC/Disassembler/X86/x86-64-avx2.txt |  6 ++++++
 3 files changed, 32 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index 7cf555748c46..a185a2007b72 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -3778,7 +3778,7 @@ let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
                              VEX_4V, VEX_WIG;
   defm VPACKUSDW : sse4_pack<0x2B, "vpackusdw", v8i16, v4i32, X86Packus, VR128,
                              i128mem, SchedWriteShuffle.XMM, load, 0>,
-                             VEX_4V;
+                             VEX_4V, VEX_WIG;
 }
 
 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
@@ -3794,7 +3794,7 @@ let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
                               VEX_4V, VEX_L, VEX_WIG;
   defm VPACKUSDWY : sse4_pack<0x2B, "vpackusdw", v16i16, v8i32, X86Packus, VR256,
                               i256mem, SchedWriteShuffle.YMM, load, 0>,
-                              VEX_4V, VEX_L;
+                              VEX_4V, VEX_L, VEX_WIG;
 }
 
 let Constraints = "$src1 = $dst" in {
@@ -4756,7 +4756,7 @@ let isCommutable = 0 in {
                                   SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG;
   defm VPHSUBD    : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v4i32, v4i32, VR128,
                                   load, i128mem,
-                                  SchedWritePHAdd.XMM, 0>, VEX_4V;
+                                  SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG;
   defm VPSIGNB    : SS3I_binop_rm_int<0x08, "vpsignb",
                                       int_x86_ssse3_psign_b_128,
                                       SchedWriteVecALU.XMM, load, 0>, VEX_4V, VEX_WIG;
@@ -4802,7 +4802,7 @@ let isCommutable = 0 in {
                                   SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
   defm VPHSUBDY   : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v8i32, v8i32, VR256,
                                   load, i256mem,
-                                  SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L;
+                                  SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
   defm VPSIGNB   : SS3I_binop_rm_int_y<0x08, "vpsignb", int_x86_avx2_psign_b,
                                        SchedWriteVecALU.YMM>, VEX_4V, VEX_L, VEX_WIG;
   defm VPSIGNW   : SS3I_binop_rm_int_y<0x09, "vpsignw", int_x86_avx2_psign_w,
@@ -6503,7 +6503,7 @@ multiclass pcmpistrm_SS42AI<string asm> {
 
 let Defs = [XMM0, EFLAGS], hasSideEffects = 0 in {
   let Predicates = [HasAVX] in
-  defm VPCMPISTRM : pcmpistrm_SS42AI<"vpcmpistrm">, VEX;
+  defm VPCMPISTRM : pcmpistrm_SS42AI<"vpcmpistrm">, VEX, VEX_WIG;
   defm PCMPISTRM  : pcmpistrm_SS42AI<"pcmpistrm"> ;
 }
 
@@ -6521,7 +6521,7 @@ multiclass SS42AI_pcmpestrm<string asm> {
 
 let Defs = [XMM0, EFLAGS], Uses = [EAX, EDX], hasSideEffects = 0 in {
   let Predicates = [HasAVX] in
-  defm VPCMPESTRM : SS42AI_pcmpestrm<"vpcmpestrm">, VEX;
+  defm VPCMPESTRM : SS42AI_pcmpestrm<"vpcmpestrm">, VEX, VEX_WIG;
   defm PCMPESTRM :  SS42AI_pcmpestrm<"pcmpestrm">;
 }
 
@@ -6539,7 +6539,7 @@ multiclass SS42AI_pcmpistri<string asm> {
 
 let Defs = [ECX, EFLAGS], hasSideEffects = 0 in {
   let Predicates = [HasAVX] in
-  defm VPCMPISTRI : SS42AI_pcmpistri<"vpcmpistri">, VEX;
+  defm VPCMPISTRI : SS42AI_pcmpistri<"vpcmpistri">, VEX, VEX_WIG;
   defm PCMPISTRI  : SS42AI_pcmpistri<"pcmpistri">;
 }
 
@@ -6557,7 +6557,7 @@ multiclass SS42AI_pcmpestri<string asm> {
 
 let Defs = [ECX, EFLAGS], Uses = [EAX, EDX], hasSideEffects = 0 in {
   let Predicates = [HasAVX] in
-  defm VPCMPESTRI : SS42AI_pcmpestri<"vpcmpestri">, VEX;
+  defm VPCMPESTRI : SS42AI_pcmpestri<"vpcmpestri">, VEX, VEX_WIG;
   defm PCMPESTRI  : SS42AI_pcmpestri<"pcmpestri">;
 }
 
diff --git a/llvm/test/MC/Disassembler/X86/x86-64-avx.txt b/llvm/test/MC/Disassembler/X86/x86-64-avx.txt
index 9ebb5335a22d..b7ab0b1a7dc6 100644
--- a/llvm/test/MC/Disassembler/X86/x86-64-avx.txt
+++ b/llvm/test/MC/Disassembler/X86/x86-64-avx.txt
@@ -3,17 +3,35 @@
 # CHECK: vpackusdw (%rax), %xmm2, %xmm1
 0xc4 0xe2 0x69 0x2b 0x08
 
+# CHECK: vpackusdw (%rax), %xmm2, %xmm1
+0xc4 0xe2 0xe9 0x2b 0x08
+
 # CHECK: vphsubd %xmm3, %xmm2, %xmm11
 0xc4 0x62 0x69 0x06 0xdb
 
+# CHECK: vphsubd %xmm3, %xmm2, %xmm11
+0xc4 0x62 0xe9 0x06 0xdb
+
 # CHECK: vpcmpestri $100, %xmm3, %xmm11
 0xc4 0x63 0x79 0x61 0xdb 0x64
 
+# CHECK: vpcmpestri $100, %xmm3, %xmm11
+0xc4 0x63 0xf9 0x61 0xdb 0x64
+
 # CHECK: vpcmpestrm $100, %xmm3, %xmm11
 0xc4 0x63 0x79 0x60 0xdb 0x64
 
+# CHECK: vpcmpestrm $100, %xmm3, %xmm11
+0xc4 0x63 0xf9 0x60 0xdb 0x64
+
 # CHECK: vpcmpistri $100, %xmm3, %xmm11
 0xc4 0x63 0x79 0x63 0xdb 0x64
 
+# CHECK: vpcmpistri $100, %xmm3, %xmm11
+0xc4 0x63 0xf9 0x63 0xdb 0x64
+
 # CHECK: vpcmpistrm $100, %xmm3, %xmm11
 0xc4 0x63 0x79 0x62 0xdb 0x64
+
+# CHECK: vpcmpistrm $100, %xmm3, %xmm11
+0xc4 0x63 0xf9 0x62 0xdb 0x64
diff --git a/llvm/test/MC/Disassembler/X86/x86-64-avx2.txt b/llvm/test/MC/Disassembler/X86/x86-64-avx2.txt
index a6a87f100633..d876067654fe 100644
--- a/llvm/test/MC/Disassembler/X86/x86-64-avx2.txt
+++ b/llvm/test/MC/Disassembler/X86/x86-64-avx2.txt
@@ -3,5 +3,11 @@
 # CHECK: vpackusdw %ymm0, %ymm2, %ymm1
 0xc4 0xe2 0x6d 0x2b 0xc8
 
+# CHECK: vpackusdw %ymm0, %ymm2, %ymm1
+0xc4 0xe2 0xed 0x2b 0xc8
+
 # CHECK: vphsubd %ymm0, %ymm2, %ymm1
 0xc4 0xe2 0x6d 0x06 0xc8
+
+# CHECK: vphsubd %ymm0, %ymm2, %ymm1
+0xc4 0xe2 0xed 0x06 0xc8

From f23ee06ec27e991f9f1fbb646f3a2288aeedee71 Mon Sep 17 00:00:00 2001
From: Sam McCall <sam.mccall@gmail.com>
Date: Tue, 2 Feb 2021 15:20:18 +0100
Subject: [PATCH 122/318] [clangd] Fix race in Global CDB shutdown

I believe the atomic write can be reordered after the notify, and that
seems to be happening on mac m1: http://45.33.8.238/macm1/2654/step_8.txt
In practice maybe seq_cst is enough? But no reason not to lock here.

https://bugs.llvm.org/show_bug.cgi?id=48998
(cherry picked from commit 6ac3fd9706047304c52a678884122a3a6bc55432)
---
 clang-tools-extra/clangd/GlobalCompilationDatabase.cpp | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/clang-tools-extra/clangd/GlobalCompilationDatabase.cpp b/clang-tools-extra/clangd/GlobalCompilationDatabase.cpp
index 1a5379acfe7d..542d0c3e4dbc 100644
--- a/clang-tools-extra/clangd/GlobalCompilationDatabase.cpp
+++ b/clang-tools-extra/clangd/GlobalCompilationDatabase.cpp
@@ -34,6 +34,7 @@
 #include <atomic>
 #include <chrono>
 #include <condition_variable>
+#include <mutex>
 #include <string>
 #include <tuple>
 #include <vector>
@@ -567,7 +568,10 @@ class DirectoryBasedGlobalCompilationDatabase::BroadcastThread {
   }
 
   ~BroadcastThread() {
-    ShouldStop.store(true, std::memory_order_release);
+    {
+      std::lock_guard<std::mutex> Lock(Mu);
+      ShouldStop.store(true, std::memory_order_release);
+    }
     CV.notify_all();
     Thread.join();
   }

From 440b16a4fc04a4bb614a60c26f42ab2ec27049a4 Mon Sep 17 00:00:00 2001
From: Jez Ng <jezng@fb.com>
Date: Tue, 2 Feb 2021 18:18:07 -0500
Subject: [PATCH 123/318] [lld-macho] Fill out release notes for 12.x

Differential Revision: https://reviews.llvm.org/D95900
---
 lld/docs/ReleaseNotes.rst | 20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/lld/docs/ReleaseNotes.rst b/lld/docs/ReleaseNotes.rst
index ea1403888eba..7c1cbc4a4c4b 100644
--- a/lld/docs/ReleaseNotes.rst
+++ b/lld/docs/ReleaseNotes.rst
@@ -58,10 +58,26 @@ MinGW Improvements
   (`D93950 <https://reviews.llvm.org/D93950>`_)
 
 
-MachO Improvements
+Mach-O Improvements
 ------------------
 
-* Item 1.
+We've gotten the new implementation of LLD for Mach-O to the point where it is
+able to link large x86_64 programs, and we'd love to get some alpha testing on
+it. The new Darwin back-end can be invoked as follows:
+
+.. code-block::
+   clang -fuse-ld=lld.darwinnew /path/to/file.c
+
+To reach this point, we implemented numerous features, and it's easier to list
+the major features we *haven't* yet completed:
+
+* LTO support
+* Stack unwinding for exceptions
+* Support for arm64, arm, and i386 architectures
+
+If you stumble upon an issue and it doesn't fall into one of these categories,
+please file a bug report!
+
 
 WebAssembly Improvements
 ------------------------

From 6584a9a4c55e10c055f9f450798b826a9624d82f Mon Sep 17 00:00:00 2001
From: David Sherwood <david.sherwood@arm.com>
Date: Mon, 8 Feb 2021 16:33:46 +0000
Subject: [PATCH 124/318] [release][docs] Update contributions to LLVM 12 for
 scalable vectors.

Differential Revision: https://reviews.llvm.org/D96270
---
 clang/docs/ReleaseNotes.rst | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index f4ca8a855142..a43cc33988ab 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -144,6 +144,18 @@ New Pragmas in Clang
 
 - ...
 
+Modified Pragmas in Clang
+-------------------------
+
+- The "#pragma clang loop vectorize_width" has been extended to support an
+  optional 'fixed|scalable' argument, which can be used to indicate that the
+  compiler should use fixed-width or scalable vectorization.  Fixed-width is
+  assumed by default.
+
+  Scalable or vector length agnostic vectorization is an experimental feature
+  for targets that support scalable vectors. For more information please refer
+  to the Clang Language Extensions documentation.
+
 Attribute Changes in Clang
 --------------------------
 

From dda7ef025bc66ea326f5a8bda8c5b8534d21c2dd Mon Sep 17 00:00:00 2001
From: Lei Huang <lei@ca.ibm.com>
Date: Fri, 19 Feb 2021 19:24:05 +0000
Subject: [PATCH 125/318] [PowerPC] Update release notes for changes to PowerPC
 for V12.0

---
 llvm/docs/ReleaseNotes.rst | 70 +++++++++++++++++++++++++++++++++++++-
 1 file changed, 69 insertions(+), 1 deletion(-)

diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst
index c1bda3339a9e..542a505bfd2e 100644
--- a/llvm/docs/ReleaseNotes.rst
+++ b/llvm/docs/ReleaseNotes.rst
@@ -129,7 +129,75 @@ During this release ...
 Changes to the PowerPC Target
 -----------------------------
 
-During this release ...
+Optimization:
+
+* Made improvements to loop unroll-and-jam including fix to respect user
+  provided #pragma unroll-and-jam for loops on targets other than ARM.
+* Improved PartialInliner allowing it to handle code regions in a switch
+  statements.
+* Improved PGO support on AIX by building and linking with compiler-rt profile
+  library.
+* Add support for Epilogue Vectorization and enabled it by default.
+
+CodeGen:
+
+* POWER10 support
+  * Implementation of PC Relative addressing in LLD including the associated
+    linker optimizations.
+  * Add support for the new matrix multiplication (MMA) instructions to Clang
+    and LLVM.
+  * Implementation of Power10 builtins.
+
+* Scheduling enhancements
+  * Add a new algorithm to cluster more loads/stores if the DAG is not too
+    complicated.
+  * Enable the PowerPC scheduling heuristic for Power10.
+
+* Target dependent passes tuning
+  * Enhance LoopStrengthReduce/PPCLoopInstrFormPrep pass for PowerPC,
+    especially for P10 intrinsics.
+  * Enhance machine combiner pass to reduce register pressure for PowerPC.
+  * Improve MachineSink to do more sinking based on register pressure and alias
+    analysis.
+
+* General improvements
+  * Complete the constrained floating point operations support.
+  * Improve the llvm-exegesis support.
+  * Improve the stack clash protection to probe the gap between stackptr and
+    realigned stackptr.
+  * Improve the IEEE long double support for Power8.
+  * Enable MemorySSA for LoopSink.
+  * Enhance LLVM debugging functionality via options such as -print-changed and
+    -print-before-changed.
+  * Add builtins for Power9 (i.e. darn, xvtdiv, xvtsqrt etc).
+  * Add options to disable all or part of LoopIdiomRecognizePass.
+  * Add support for printing the DDG in DOT form allowing for visual inspection
+    of the Data Dependence Graph.
+  * Remove the QPX support.
+  * Significant number of bug fixes including all the fixes necessary to
+    achieve a clean test run for Julia.
+
+AIX Support:
+
+* Compiler-rt support
+  * Add support for building compiler-rt for AIX and 32-bit Power targets.
+  * Made compiler-rt the default rtlib for AIX.
+
+* General Improvements
+  * Enable the AIX extended AltiVec ABI under option -mabi=vec-extabi.
+  * Add partial C99 complex type support.
+  * Implemente traceback table for functions (encodes vector information,
+    emits exception handling).
+  * Implemente code generation for C++ dynamic initialization and finalization.
+    of non-local variables for use with the -bcdtors option of the AIX linker.
+  * Add new option -mignore-xcoff-visibility.
+  * Enable explicit sections on AIX.
+  * Enable -f[no-]data-sections on AIX and set -fdata-sections to be the default
+    on AIX.
+  * Enable -f[no-]function-sections.
+  * Add support for relocation generation using the large code model.
+  * Add pragma align natural and sorted out pragma pack stack effect.
+
 
 Changes to the X86 Target
 -------------------------

From c2a0b0810a40199ec94c90539b601ba72bcb3523 Mon Sep 17 00:00:00 2001
From: Nikita Popov <nikita.ppv@gmail.com>
Date: Thu, 18 Feb 2021 21:25:14 +0100
Subject: [PATCH 126/318] [DCE] Add tests for non-willreturn function being
 removed (NFC)

(cherry picked from commit 4045ad6b0ccd35fe990d51b9bfdd9e7de109bdf5)
---
 llvm/test/Transforms/ADCE/willreturn.ll | 17 +++++++++++++++++
 llvm/test/Transforms/BDCE/willreturn.ll | 17 +++++++++++++++++
 2 files changed, 34 insertions(+)
 create mode 100644 llvm/test/Transforms/ADCE/willreturn.ll
 create mode 100644 llvm/test/Transforms/BDCE/willreturn.ll

diff --git a/llvm/test/Transforms/ADCE/willreturn.ll b/llvm/test/Transforms/ADCE/willreturn.ll
new file mode 100644
index 000000000000..c3482a417cb0
--- /dev/null
+++ b/llvm/test/Transforms/ADCE/willreturn.ll
@@ -0,0 +1,17 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -adce -S < %s | FileCheck %s
+
+declare void @may_not_return(i32) nounwind readnone
+declare void @will_return(i32) nounwind readnone willreturn
+
+; FIXME: This is a miscompile.
+define void @test(i32 %a) {
+; CHECK-LABEL: @test(
+; CHECK-NEXT:    ret void
+;
+  %b = add i32 %a, 1
+  call void @may_not_return(i32 %b)
+  %c = add i32 %b, 1
+  call void @will_return(i32 %c)
+  ret void
+}
diff --git a/llvm/test/Transforms/BDCE/willreturn.ll b/llvm/test/Transforms/BDCE/willreturn.ll
new file mode 100644
index 000000000000..b87ab0050e7a
--- /dev/null
+++ b/llvm/test/Transforms/BDCE/willreturn.ll
@@ -0,0 +1,17 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -bdce -S < %s | FileCheck %s
+
+declare void @may_not_return(i32) nounwind readnone
+declare void @will_return(i32) nounwind readnone willreturn
+
+; FIXME: This is a miscompile.
+define void @test(i32 %a) {
+; CHECK-LABEL: @test(
+; CHECK-NEXT:    ret void
+;
+  %b = add i32 %a, 1
+  call void @may_not_return(i32 %b)
+  %c = add i32 %b, 1
+  call void @will_return(i32 %c)
+  ret void
+}

From d1d7dc779a296001568d855bba7843a9eb94a585 Mon Sep 17 00:00:00 2001
From: Nikita Popov <nikita.ppv@gmail.com>
Date: Thu, 18 Feb 2021 22:15:17 +0100
Subject: [PATCH 127/318] [IR] Move willReturn() to Instruction

This moves the willReturn() helper from CallBase to Instruction,
so that it can be used in a more generic manner. This will make
it easier to fix additional passes (ADCE and BDCE), and will give
us one place to change if additional instructions should become
non-willreturn (e.g. there has been talk about handling volatile
operations this way).

I have also included the IntrinsicInst workaround directly in
here, so that it gets applied consistently. (As such this change
is not entirely NFC -- FuncAttrs will now use this as well.)

Differential Revision: https://reviews.llvm.org/D96992

(cherry picked from commit 370addb996138a9e3634899cf264c7621307617a)
---
 llvm/include/llvm/IR/InstrTypes.h         |  3 ---
 llvm/include/llvm/IR/Instruction.h        |  4 ++++
 llvm/lib/Analysis/ValueTracking.cpp       | 28 +++--------------------
 llvm/lib/IR/Instruction.cpp               | 10 ++++++++
 llvm/lib/Transforms/IPO/FunctionAttrs.cpp |  3 +--
 llvm/lib/Transforms/Utils/Local.cpp       |  9 ++------
 6 files changed, 20 insertions(+), 37 deletions(-)

diff --git a/llvm/include/llvm/IR/InstrTypes.h b/llvm/include/llvm/IR/InstrTypes.h
index f42ef48de6b3..955ac8e537fe 100644
--- a/llvm/include/llvm/IR/InstrTypes.h
+++ b/llvm/include/llvm/IR/InstrTypes.h
@@ -1757,9 +1757,6 @@ class CallBase : public Instruction {
     return doesNotAccessMemory() || hasFnAttr(Attribute::ReadOnly);
   }
 
-  /// Returns true if this function is guaranteed to return.
-  bool willReturn() const { return hasFnAttr(Attribute::WillReturn); }
-
   void setOnlyReadsMemory() {
     addAttribute(AttributeList::FunctionIndex, Attribute::ReadOnly);
   }
diff --git a/llvm/include/llvm/IR/Instruction.h b/llvm/include/llvm/IR/Instruction.h
index d2a55f89fac9..85afaed5225e 100644
--- a/llvm/include/llvm/IR/Instruction.h
+++ b/llvm/include/llvm/IR/Instruction.h
@@ -633,6 +633,10 @@ class Instruction : public User,
   /// generated program.
   bool isSafeToRemove() const;
 
+  /// Return true if the instruction will return (unwinding is considered as
+  /// a form of returning control flow here).
+  bool willReturn() const;
+
   /// Return true if the instruction is a variety of EH-block.
   bool isEHPad() const {
     switch (getOpcode()) {
diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp
index 5600a3b33750..e174c5efe424 100644
--- a/llvm/lib/Analysis/ValueTracking.cpp
+++ b/llvm/lib/Analysis/ValueTracking.cpp
@@ -5018,36 +5018,14 @@ bool llvm::isGuaranteedToTransferExecutionToSuccessor(const Instruction *I) {
   // arbitrary length of time, but programs aren't allowed to rely on that.
 
   // If there is no successor, then execution can't transfer to it.
-  if (const auto *CRI = dyn_cast<CleanupReturnInst>(I))
-    return !CRI->unwindsToCaller();
-  if (const auto *CatchSwitch = dyn_cast<CatchSwitchInst>(I))
-    return !CatchSwitch->unwindsToCaller();
-  if (isa<ResumeInst>(I))
-    return false;
   if (isa<ReturnInst>(I))
     return false;
   if (isa<UnreachableInst>(I))
     return false;
 
-  // Calls can throw, or contain an infinite loop, or kill the process.
-  if (const auto *CB = dyn_cast<CallBase>(I)) {
-    // Call sites that throw have implicit non-local control flow.
-    if (!CB->doesNotThrow())
-      return false;
-
-    // A function which doens't throw and has "willreturn" attribute will
-    // always return.
-    if (CB->hasFnAttr(Attribute::WillReturn))
-      return true;
-
-    // FIXME: Temporarily assume that all side-effect free intrinsics will
-    // return. Remove this workaround once all intrinsics are appropriately
-    // annotated.
-    return isa<IntrinsicInst>(CB) && CB->onlyReadsMemory();
-  }
-
-  // Other instructions return normally.
-  return true;
+  // An instruction that returns without throwing must transfer control flow
+  // to a successor.
+  return !I->mayThrow() && I->willReturn();
 }
 
 bool llvm::isGuaranteedToTransferExecutionToSuccessor(const BasicBlock *BB) {
diff --git a/llvm/lib/IR/Instruction.cpp b/llvm/lib/IR/Instruction.cpp
index 1e3fcd672a43..246180e72172 100644
--- a/llvm/lib/IR/Instruction.cpp
+++ b/llvm/lib/IR/Instruction.cpp
@@ -633,6 +633,16 @@ bool Instruction::isSafeToRemove() const {
          !this->isTerminator();
 }
 
+bool Instruction::willReturn() const {
+  if (const auto *CB = dyn_cast<CallBase>(this))
+    // FIXME: Temporarily assume that all side-effect free intrinsics will
+    // return. Remove this workaround once all intrinsics are appropriately
+    // annotated.
+    return CB->hasFnAttr(Attribute::WillReturn) ||
+           (isa<IntrinsicInst>(CB) && CB->onlyReadsMemory());
+  return true;
+}
+
 bool Instruction::isLifetimeStartOrEnd() const {
   auto II = dyn_cast<IntrinsicInst>(this);
   if (!II)
diff --git a/llvm/lib/Transforms/IPO/FunctionAttrs.cpp b/llvm/lib/Transforms/IPO/FunctionAttrs.cpp
index 30a1f81ad0e1..256acd7e1d17 100644
--- a/llvm/lib/Transforms/IPO/FunctionAttrs.cpp
+++ b/llvm/lib/Transforms/IPO/FunctionAttrs.cpp
@@ -1445,8 +1445,7 @@ static bool functionWillReturn(const Function &F) {
   // If there are no loops, then the function is willreturn if all calls in
   // it are willreturn.
   return all_of(instructions(F), [](const Instruction &I) {
-    const auto *CB = dyn_cast<CallBase>(&I);
-    return !CB || CB->hasFnAttr(Attribute::WillReturn);
+    return I.willReturn();
   });
 }
 
diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp
index 477ea458c763..d055f3dd3084 100644
--- a/llvm/lib/Transforms/Utils/Local.cpp
+++ b/llvm/lib/Transforms/Utils/Local.cpp
@@ -420,13 +420,8 @@ bool llvm::wouldInstructionBeTriviallyDead(Instruction *I,
     return true;
   }
 
-  if (auto *CB = dyn_cast<CallBase>(I)) {
-    // Treat calls that may not return as alive.
-    // TODO: Remove the intrinsic escape hatch once all intrinsics set
-    // willreturn properly.
-    if (!CB->willReturn() && !isa<IntrinsicInst>(I))
-      return false;
-  }
+  if (!I->willReturn())
+    return false;
 
   if (!I->mayHaveSideEffects())
     return true;

From 8e9c2ad95eb5ab439b933d8c793957bc4d82e456 Mon Sep 17 00:00:00 2001
From: Nikita Popov <nikita.ppv@gmail.com>
Date: Thu, 18 Feb 2021 22:29:19 +0100
Subject: [PATCH 128/318] [DCE] Don't remove non-willreturn calls

In both ADCE and BDCE (via DemandedBits) we should not remove
instructions that are not guaranteed to return. This issue was
pointed out by fhahn in the recent llvm-dev thread.

Differential Revision: https://reviews.llvm.org/D96993

(cherry picked from commit 2f17ed294fcd8cde505b93c9c5bbab06ba59051c)
---
 llvm/lib/Analysis/DemandedBits.cpp         | 2 +-
 llvm/lib/Transforms/Scalar/ADCE.cpp        | 2 +-
 llvm/test/Feature/OperandBundles/adce.ll   | 4 ++--
 llvm/test/LTO/X86/parallel.ll              | 4 ++--
 llvm/test/Transforms/ADCE/dce_pure_call.ll | 2 +-
 llvm/test/Transforms/ADCE/willreturn.ll    | 3 ++-
 llvm/test/Transforms/BDCE/dce-pure.ll      | 2 +-
 llvm/test/Transforms/BDCE/dead-void-ro.ll  | 2 +-
 llvm/test/Transforms/BDCE/willreturn.ll    | 3 ++-
 llvm/test/tools/gold/X86/parallel.ll       | 4 ++--
 10 files changed, 15 insertions(+), 13 deletions(-)

diff --git a/llvm/lib/Analysis/DemandedBits.cpp b/llvm/lib/Analysis/DemandedBits.cpp
index 461fd7239905..dd11b0b02bf8 100644
--- a/llvm/lib/Analysis/DemandedBits.cpp
+++ b/llvm/lib/Analysis/DemandedBits.cpp
@@ -80,7 +80,7 @@ void DemandedBitsWrapperPass::print(raw_ostream &OS, const Module *M) const {
 
 static bool isAlwaysLive(Instruction *I) {
   return I->isTerminator() || isa<DbgInfoIntrinsic>(I) || I->isEHPad() ||
-         I->mayHaveSideEffects();
+         I->mayHaveSideEffects() || !I->willReturn();
 }
 
 void DemandedBits::determineLiveOperandBits(
diff --git a/llvm/lib/Transforms/Scalar/ADCE.cpp b/llvm/lib/Transforms/Scalar/ADCE.cpp
index 2b649732a799..ce4e5e575fbf 100644
--- a/llvm/lib/Transforms/Scalar/ADCE.cpp
+++ b/llvm/lib/Transforms/Scalar/ADCE.cpp
@@ -325,7 +325,7 @@ void AggressiveDeadCodeElimination::initialize() {
 
 bool AggressiveDeadCodeElimination::isAlwaysLive(Instruction &I) {
   // TODO -- use llvm::isInstructionTriviallyDead
-  if (I.isEHPad() || I.mayHaveSideEffects()) {
+  if (I.isEHPad() || I.mayHaveSideEffects() || !I.willReturn()) {
     // Skip any value profile instrumentation calls if they are
     // instrumenting constants.
     if (isInstrumentsConstant(I))
diff --git a/llvm/test/Feature/OperandBundles/adce.ll b/llvm/test/Feature/OperandBundles/adce.ll
index a729ba710689..fa4e045fdd1e 100644
--- a/llvm/test/Feature/OperandBundles/adce.ll
+++ b/llvm/test/Feature/OperandBundles/adce.ll
@@ -5,8 +5,8 @@
 ; bundles since the presence of unknown operand bundles implies
 ; arbitrary memory effects.
 
-declare void @readonly_function() readonly nounwind
-declare void @readnone_function() readnone nounwind
+declare void @readonly_function() readonly nounwind willreturn
+declare void @readnone_function() readnone nounwind willreturn
 
 define void @test0() {
 ; CHECK-LABEL: @test0(
diff --git a/llvm/test/LTO/X86/parallel.ll b/llvm/test/LTO/X86/parallel.ll
index b3c128193821..34235ec0202b 100644
--- a/llvm/test/LTO/X86/parallel.ll
+++ b/llvm/test/LTO/X86/parallel.ll
@@ -11,7 +11,7 @@ target triple = "x86_64-unknown-linux-gnu"
 ; CHECK0-NOT: bar
 ; CHECK0: T foo
 ; CHECK0-NOT: bar
-define void @foo() {
+define void @foo() mustprogress {
   call void @bar()
   ret void
 }
@@ -19,7 +19,7 @@ define void @foo() {
 ; CHECK1-NOT: foo
 ; CHECK1: T bar
 ; CHECK1-NOT: foo
-define void @bar() {
+define void @bar() mustprogress {
   call void @foo()
   ret void
 }
diff --git a/llvm/test/Transforms/ADCE/dce_pure_call.ll b/llvm/test/Transforms/ADCE/dce_pure_call.ll
index 66483abbc919..88e92bf13f49 100644
--- a/llvm/test/Transforms/ADCE/dce_pure_call.ll
+++ b/llvm/test/Transforms/ADCE/dce_pure_call.ll
@@ -1,6 +1,6 @@
 ; RUN: opt -adce -S < %s | not grep call
 
-declare i32 @strlen(i8*) readonly nounwind
+declare i32 @strlen(i8*) readonly nounwind willreturn
 
 define void @test() {
 	call i32 @strlen( i8* null )		; <i32>:1 [#uses=0]
diff --git a/llvm/test/Transforms/ADCE/willreturn.ll b/llvm/test/Transforms/ADCE/willreturn.ll
index c3482a417cb0..61bbbe0ae5fa 100644
--- a/llvm/test/Transforms/ADCE/willreturn.ll
+++ b/llvm/test/Transforms/ADCE/willreturn.ll
@@ -4,9 +4,10 @@
 declare void @may_not_return(i32) nounwind readnone
 declare void @will_return(i32) nounwind readnone willreturn
 
-; FIXME: This is a miscompile.
 define void @test(i32 %a) {
 ; CHECK-LABEL: @test(
+; CHECK-NEXT:    [[B:%.*]] = add i32 [[A:%.*]], 1
+; CHECK-NEXT:    call void @may_not_return(i32 [[B]])
 ; CHECK-NEXT:    ret void
 ;
   %b = add i32 %a, 1
diff --git a/llvm/test/Transforms/BDCE/dce-pure.ll b/llvm/test/Transforms/BDCE/dce-pure.ll
index a487a04db611..e00121d0c9e9 100644
--- a/llvm/test/Transforms/BDCE/dce-pure.ll
+++ b/llvm/test/Transforms/BDCE/dce-pure.ll
@@ -1,7 +1,7 @@
 ; RUN: opt -bdce -S < %s | FileCheck %s
 ; RUN: opt -passes=bdce -S < %s | FileCheck %s
 
-declare i32 @strlen(i8*) readonly nounwind
+declare i32 @strlen(i8*) readonly nounwind willreturn
 
 define void @test1() {
   call i32 @strlen( i8* null )
diff --git a/llvm/test/Transforms/BDCE/dead-void-ro.ll b/llvm/test/Transforms/BDCE/dead-void-ro.ll
index 36f09511503b..77f4e097f4bb 100644
--- a/llvm/test/Transforms/BDCE/dead-void-ro.ll
+++ b/llvm/test/Transforms/BDCE/dead-void-ro.ll
@@ -14,5 +14,5 @@ define void @PR34211(i16* %p) {
 
 declare void @no_side_effects_so_dead(i16) #0
 
-attributes #0 = { nounwind readnone }
+attributes #0 = { nounwind readnone willreturn }
 
diff --git a/llvm/test/Transforms/BDCE/willreturn.ll b/llvm/test/Transforms/BDCE/willreturn.ll
index b87ab0050e7a..5efd6ad6e0cf 100644
--- a/llvm/test/Transforms/BDCE/willreturn.ll
+++ b/llvm/test/Transforms/BDCE/willreturn.ll
@@ -4,9 +4,10 @@
 declare void @may_not_return(i32) nounwind readnone
 declare void @will_return(i32) nounwind readnone willreturn
 
-; FIXME: This is a miscompile.
 define void @test(i32 %a) {
 ; CHECK-LABEL: @test(
+; CHECK-NEXT:    [[B:%.*]] = add i32 [[A:%.*]], 1
+; CHECK-NEXT:    call void @may_not_return(i32 [[B]])
 ; CHECK-NEXT:    ret void
 ;
   %b = add i32 %a, 1
diff --git a/llvm/test/tools/gold/X86/parallel.ll b/llvm/test/tools/gold/X86/parallel.ll
index 6972efc652a3..b8072f01e5a2 100644
--- a/llvm/test/tools/gold/X86/parallel.ll
+++ b/llvm/test/tools/gold/X86/parallel.ll
@@ -14,7 +14,7 @@ target triple = "x86_64-unknown-linux-gnu"
 ; CHECK0-NOT: bar
 ; CHECK0: T foo
 ; CHECK0-NOT: bar
-define void @foo() {
+define void @foo() mustprogress {
   call void @bar()
   ret void
 }
@@ -24,7 +24,7 @@ define void @foo() {
 ; CHECK1-NOT: foo
 ; CHECK1: T bar
 ; CHECK1-NOT: foo
-define void @bar() {
+define void @bar() mustprogress {
   call void @foo()
   ret void
 }

From 17daef8bfdfd3a78465122f968a93df6db42dca6 Mon Sep 17 00:00:00 2001
From: Nikita Popov <nikita.ppv@gmail.com>
Date: Fri, 19 Feb 2021 13:06:45 +0100
Subject: [PATCH 129/318] [LLD] Fix tests after D96993

We now need mustprogress to eliminate these calls. The code doesn't
really make sense, but that's not the point of the test...

(cherry picked from commit ac065b7a37d6dd8daacd526f6c3a0d1563bc88ac)
---
 lld/test/ELF/lto/parallel.ll  | 4 ++--
 lld/test/wasm/lto/parallel.ll | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/lld/test/ELF/lto/parallel.ll b/lld/test/ELF/lto/parallel.ll
index d9cb4fed7bfa..d89431e8b4a1 100644
--- a/lld/test/ELF/lto/parallel.ll
+++ b/lld/test/ELF/lto/parallel.ll
@@ -14,7 +14,7 @@ target triple = "x86_64-unknown-linux-gnu"
 ; CHECK0-NOT: bar
 ; CHECK0: T foo
 ; CHECK0-NOT: bar
-define void @foo() {
+define void @foo() mustprogress {
   call void @bar()
   ret void
 }
@@ -22,7 +22,7 @@ define void @foo() {
 ; CHECK1-NOT: foo
 ; CHECK1: T bar
 ; CHECK1-NOT: foo
-define void @bar() {
+define void @bar() mustprogress {
   call void @foo()
   ret void
 }
diff --git a/lld/test/wasm/lto/parallel.ll b/lld/test/wasm/lto/parallel.ll
index a93c3558d969..261cf2ef7dae 100644
--- a/lld/test/wasm/lto/parallel.ll
+++ b/lld/test/wasm/lto/parallel.ll
@@ -10,7 +10,7 @@ target triple = "wasm32-unknown-unknown-wasm"
 ; CHECK0-NOT: bar
 ; CHECK0: T foo
 ; CHECK0-NOT: bar
-define void @foo() {
+define void @foo() mustprogress {
   call void @bar()
   ret void
 }
@@ -18,7 +18,7 @@ define void @foo() {
 ; CHECK1-NOT: foo
 ; CHECK1: T bar
 ; CHECK1-NOT: foo
-define void @bar() {
+define void @bar() mustprogress {
   call void @foo()
   ret void
 }

From a338d577bb4fbf9013cf0c22c211d25bf3c41a26 Mon Sep 17 00:00:00 2001
From: Jeroen Dobbelaere <jeroen.dobbelaere@synopsys.com>
Date: Thu, 18 Feb 2021 17:29:46 +0100
Subject: [PATCH 130/318] [clang] functions with the 'const' or 'pure'
 attribute must always return.

As described in
* https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-pure-function-attribute
* https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-const-function-attribute

An `__attribute__((pure))` function must always return, as well as an `__attribute__((const))` function.

Reviewed By: jdoerfert

Differential Revision: https://reviews.llvm.org/D96960

(cherry picked from commit 46757ccb49ab88da54ca8ddd43665d5255ee80f7)
---
 clang/lib/CodeGen/CGCall.cpp                           | 5 +++++
 clang/test/CodeGen/complex-builtins.c                  | 3 ++-
 clang/test/CodeGen/complex-libcalls.c                  | 3 ++-
 clang/test/CodeGen/function-attributes.c               | 2 +-
 clang/test/CodeGenCXX/2009-05-04-PureConstNounwind.cpp | 8 ++++----
 clang/test/Sema/libbuiltins-ctype-powerpc64.c          | 2 +-
 clang/test/Sema/libbuiltins-ctype-x86_64.c             | 2 +-
 7 files changed, 16 insertions(+), 9 deletions(-)

diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp
index 42801372189b..bc7582c67989 100644
--- a/clang/lib/CodeGen/CGCall.cpp
+++ b/clang/lib/CodeGen/CGCall.cpp
@@ -1995,9 +1995,14 @@ void CodeGenModule::ConstructAttributeList(
     if (TargetDecl->hasAttr<ConstAttr>()) {
       FuncAttrs.addAttribute(llvm::Attribute::ReadNone);
       FuncAttrs.addAttribute(llvm::Attribute::NoUnwind);
+      // gcc specifies that 'const' functions have greater restrictions than
+      // 'pure' functions, so they also cannot have infinite loops.
+      FuncAttrs.addAttribute(llvm::Attribute::WillReturn);
     } else if (TargetDecl->hasAttr<PureAttr>()) {
       FuncAttrs.addAttribute(llvm::Attribute::ReadOnly);
       FuncAttrs.addAttribute(llvm::Attribute::NoUnwind);
+      // gcc specifies that 'pure' functions cannot have infinite loops.
+      FuncAttrs.addAttribute(llvm::Attribute::WillReturn);
     } else if (TargetDecl->hasAttr<NoAliasAttr>()) {
       FuncAttrs.addAttribute(llvm::Attribute::ArgMemOnly);
       FuncAttrs.addAttribute(llvm::Attribute::NoUnwind);
diff --git a/clang/test/CodeGen/complex-builtins.c b/clang/test/CodeGen/complex-builtins.c
index 96c0e7117016..6fea8a9f028c 100644
--- a/clang/test/CodeGen/complex-builtins.c
+++ b/clang/test/CodeGen/complex-builtins.c
@@ -133,7 +133,7 @@ void foo(float f) {
 // NO__ERRNO: declare { x86_fp80, x86_fp80 } @cprojl({ x86_fp80, x86_fp80 }* byval({ x86_fp80, x86_fp80 }) align 16) [[NOT_READNONE]]
 // HAS_ERRNO: declare { double, double } @cproj(double, double) [[READNONE:#[0-9]+]]
 // HAS_ERRNO: declare <2 x float> @cprojf(<2 x float>) [[READNONE]]
-// HAS_ERRNO: declare { x86_fp80, x86_fp80 } @cprojl({ x86_fp80, x86_fp80 }* byval({ x86_fp80, x86_fp80 }) align 16) [[NOT_READNONE]]
+// HAS_ERRNO: declare { x86_fp80, x86_fp80 } @cprojl({ x86_fp80, x86_fp80 }* byval({ x86_fp80, x86_fp80 }) align 16) [[WILLRETURN_NOT_READNONE:#[0-9]+]]
 
   __builtin_cpow(f,f);       __builtin_cpowf(f,f);      __builtin_cpowl(f,f);
 
@@ -202,3 +202,4 @@ void foo(float f) {
 
 // HAS_ERRNO: attributes [[NOT_READNONE]] = { nounwind {{.*}} }
 // HAS_ERRNO: attributes [[READNONE]] = { {{.*}}readnone{{.*}} }
+// HAS_ERRNO: attributes [[WILLRETURN_NOT_READNONE]] = { nounwind willreturn {{.*}} }
diff --git a/clang/test/CodeGen/complex-libcalls.c b/clang/test/CodeGen/complex-libcalls.c
index 9bd419a83821..44d6849c0a71 100644
--- a/clang/test/CodeGen/complex-libcalls.c
+++ b/clang/test/CodeGen/complex-libcalls.c
@@ -133,7 +133,7 @@ void foo(float f) {
 // NO__ERRNO: declare { x86_fp80, x86_fp80 } @cprojl({ x86_fp80, x86_fp80 }* byval({ x86_fp80, x86_fp80 }) align 16) [[NOT_READNONE]]
 // HAS_ERRNO: declare { double, double } @cproj(double, double) [[READNONE:#[0-9]+]]
 // HAS_ERRNO: declare <2 x float> @cprojf(<2 x float>) [[READNONE]]
-// HAS_ERRNO: declare { x86_fp80, x86_fp80 } @cprojl({ x86_fp80, x86_fp80 }* byval({ x86_fp80, x86_fp80 }) align 16) [[NOT_READNONE]]
+// HAS_ERRNO: declare { x86_fp80, x86_fp80 } @cprojl({ x86_fp80, x86_fp80 }* byval({ x86_fp80, x86_fp80 }) align 16) [[WILLRETURN_NOT_READNONE:#[0-9]+]]
 
   cpow(f,f);       cpowf(f,f);      cpowl(f,f);
 
@@ -202,3 +202,4 @@ void foo(float f) {
 
 // HAS_ERRNO: attributes [[NOT_READNONE]] = { nounwind {{.*}} }
 // HAS_ERRNO: attributes [[READNONE]] = { {{.*}}readnone{{.*}} }
+// HAS_ERRNO: attributes [[WILLRETURN_NOT_READNONE]] = { nounwind willreturn {{.*}} }
diff --git a/clang/test/CodeGen/function-attributes.c b/clang/test/CodeGen/function-attributes.c
index ffb86a6cd272..f14f24801006 100644
--- a/clang/test/CodeGen/function-attributes.c
+++ b/clang/test/CodeGen/function-attributes.c
@@ -115,5 +115,5 @@ void f20(void) {
 // CHECK: attributes [[SR]] = { nounwind optsize{{.*}} "stackrealign"{{.*}} }
 // CHECK: attributes [[RT]] = { nounwind optsize returns_twice{{.*}} }
 // CHECK: attributes [[NR]] = { noreturn optsize }
-// CHECK: attributes [[NUW_RN]] = { nounwind optsize readnone }
+// CHECK: attributes [[NUW_RN]] = { nounwind optsize readnone willreturn }
 // CHECK: attributes [[RT_CALL]] = { optsize returns_twice }
diff --git a/clang/test/CodeGenCXX/2009-05-04-PureConstNounwind.cpp b/clang/test/CodeGenCXX/2009-05-04-PureConstNounwind.cpp
index 25400a552e5d..e1d539608fcc 100644
--- a/clang/test/CodeGenCXX/2009-05-04-PureConstNounwind.cpp
+++ b/clang/test/CodeGenCXX/2009-05-04-PureConstNounwind.cpp
@@ -15,8 +15,8 @@ int f(void) {
 // CHECK: declare i32 @_Z1tv() [[TF2:#[0-9]+]]
 
 // CHECK: attributes [[TF]] = { {{.*}} }
-// CHECK: attributes [[NUW_RN]] = { nounwind readnone{{.*}} }
-// CHECK: attributes [[NUW_RO]] = { nounwind readonly{{.*}} }
+// CHECK: attributes [[NUW_RN]] = { nounwind readnone willreturn{{.*}} }
+// CHECK: attributes [[NUW_RO]] = { nounwind readonly willreturn{{.*}} }
 // CHECK: attributes [[TF2]] = { {{.*}} }
-// CHECK: attributes [[NUW_RN_CALL]] = { nounwind readnone }
-// CHECK: attributes [[NUW_RO_CALL]] = { nounwind readonly }
+// CHECK: attributes [[NUW_RN_CALL]] = { nounwind readnone willreturn }
+// CHECK: attributes [[NUW_RO_CALL]] = { nounwind readonly willreturn }
diff --git a/clang/test/Sema/libbuiltins-ctype-powerpc64.c b/clang/test/Sema/libbuiltins-ctype-powerpc64.c
index bfd79acb0ab0..ba0efb205944 100644
--- a/clang/test/Sema/libbuiltins-ctype-powerpc64.c
+++ b/clang/test/Sema/libbuiltins-ctype-powerpc64.c
@@ -62,4 +62,4 @@ void test(int x) {
 // CHECK: declare signext i32 @toupper(i32 signext) [[NUW_RO:#[0-9]+]]
 
 // CHECK: attributes [[NUW_RO]] = { nounwind readonly{{.*}} }
-// CHECK: attributes [[NUW_RO_CALL]] = { nounwind readonly }
+// CHECK: attributes [[NUW_RO_CALL]] = { nounwind readonly willreturn }
diff --git a/clang/test/Sema/libbuiltins-ctype-x86_64.c b/clang/test/Sema/libbuiltins-ctype-x86_64.c
index 4934e6f16752..b8a2c7e81584 100644
--- a/clang/test/Sema/libbuiltins-ctype-x86_64.c
+++ b/clang/test/Sema/libbuiltins-ctype-x86_64.c
@@ -62,4 +62,4 @@ void test(int x) {
 // CHECK: declare i32 @toupper(i32) [[NUW_RO:#[0-9]+]]
 
 // CHECK: attributes [[NUW_RO]] = { nounwind readonly{{.*}} }
-// CHECK: attributes [[NUW_RO_CALL]] = { nounwind readonly }
+// CHECK: attributes [[NUW_RO_CALL]] = { nounwind readonly willreturn }

From 2f74c22048277d255078d376b55dd40dddbaa376 Mon Sep 17 00:00:00 2001
From: Shilei Tian <tianshilei1992@gmail.com>
Date: Thu, 18 Feb 2021 21:04:32 -0500
Subject: [PATCH 131/318] [OpenMP][NVPTX] Add the support for CUDA 11.2 and
 CUDA 11.1

CUDA 11.2 and CUDA 11.1 are all available now.

Reviewed By: jdoerfert

Differential Revision: https://reviews.llvm.org/D97004

(cherry picked from commit 89827fd404f954605663776e746ec351bde61348)
---
 openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt b/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt
index b705e0bb6a9f..5478cd3f6aea 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt
+++ b/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt
@@ -152,8 +152,8 @@ add_custom_target(omptarget-nvptx-bc)
 
 # This map is from clang/lib/Driver/ToolChains/Cuda.cpp.
 # The last element is the default case.
-set(cuda_version_list 110 102 101 100 92 91 90 80)
-set(ptx_feature_list 70 65 64 63 61 61 60 42)
+set(cuda_version_list 112 111 110 102 101 100 92 91 90 80)
+set(ptx_feature_list 71 71 70 65 64 63 61 61 60 42)
 # The following two lines of ugly code is not needed when the minimal CMake
 # version requirement is 3.17+.
 list(LENGTH cuda_version_list num_version_supported)

From 34e8fd50391923ec4d81ec988376588885107071 Mon Sep 17 00:00:00 2001
From: Kadir Cetinkaya <kadircet@google.com>
Date: Fri, 22 Jan 2021 15:20:52 +0100
Subject: [PATCH 132/318] [clangd] Treat "null" optional fields as missing

Clangd currently throws away any protocol messages whenever an optional
field has an unexpected type. This patch changes the behaviour to treat
`null` fields as missing.

This enables clangd to be more tolerant against small violations to the
LSP spec.

Fixes https://github.com/clangd/vscode-clangd/issues/134

Differential Revision: https://reviews.llvm.org/D95229

(cherry picked from commit af20232b8e189335da571f48c2467b244b7fd772)
---
 clang-tools-extra/clangd/Protocol.cpp | 46 ++++++++++++++++++---------
 1 file changed, 31 insertions(+), 15 deletions(-)

diff --git a/clang-tools-extra/clangd/Protocol.cpp b/clang-tools-extra/clangd/Protocol.cpp
index 78110dc0de60..76cf813e6808 100644
--- a/clang-tools-extra/clangd/Protocol.cpp
+++ b/clang-tools-extra/clangd/Protocol.cpp
@@ -27,6 +27,21 @@
 
 namespace clang {
 namespace clangd {
+namespace {
+
+// Helper that doesn't treat `null` and absent fields as failures.
+template <typename T>
+bool mapOptOrNull(const llvm::json::Value &Params, llvm::StringLiteral Prop,
+                  T &Out, llvm::json::Path P) {
+  auto *O = Params.getAsObject();
+  assert(O);
+  auto *V = O->get(Prop);
+  // Field is missing or null.
+  if (!V || V->getAsNull().hasValue())
+    return true;
+  return fromJSON(*V, Out, P.field(Prop));
+}
+} // namespace
 
 char LSPError::ID;
 
@@ -490,7 +505,7 @@ bool fromJSON(const llvm::json::Value &Params, DidChangeTextDocumentParams &R,
   return O && O.map("textDocument", R.textDocument) &&
          O.map("contentChanges", R.contentChanges) &&
          O.map("wantDiagnostics", R.wantDiagnostics) &&
-         O.mapOptional("forceRebuild", R.forceRebuild);
+         mapOptOrNull(Params, "forceRebuild", R.forceRebuild, P);
 }
 
 bool fromJSON(const llvm::json::Value &E, FileChangeType &Out,
@@ -580,10 +595,10 @@ bool fromJSON(const llvm::json::Value &Params, Diagnostic &R,
               llvm::json::Path P) {
   llvm::json::ObjectMapper O(Params, P);
   return O && O.map("range", R.range) && O.map("message", R.message) &&
-         O.mapOptional("severity", R.severity) &&
-         O.mapOptional("category", R.category) &&
-         O.mapOptional("code", R.code) && O.mapOptional("source", R.source);
-  return true;
+         mapOptOrNull(Params, "severity", R.severity, P) &&
+         mapOptOrNull(Params, "category", R.category, P) &&
+         mapOptOrNull(Params, "code", R.code, P) &&
+         mapOptOrNull(Params, "source", R.source, P);
 }
 
 llvm::json::Value toJSON(const PublishDiagnosticsParams &PDP) {
@@ -818,7 +833,7 @@ bool fromJSON(const llvm::json::Value &Params, CompletionContext &R,
   llvm::json::ObjectMapper O(Params, P);
   int TriggerKind;
   if (!O || !O.map("triggerKind", TriggerKind) ||
-      !O.mapOptional("triggerCharacter", R.triggerCharacter))
+      !mapOptOrNull(Params, "triggerCharacter", R.triggerCharacter, P))
     return false;
   R.triggerKind = static_cast<CompletionTriggerKind>(TriggerKind);
   return true;
@@ -1121,8 +1136,8 @@ bool fromJSON(const llvm::json::Value &Params, ConfigurationSettings &S,
   llvm::json::ObjectMapper O(Params, P);
   if (!O)
     return true; // 'any' type in LSP.
-  return O.mapOptional("compilationDatabaseChanges",
-                       S.compilationDatabaseChanges);
+  return mapOptOrNull(Params, "compilationDatabaseChanges",
+                      S.compilationDatabaseChanges, P);
 }
 
 bool fromJSON(const llvm::json::Value &Params, InitializationOptions &Opts,
@@ -1133,8 +1148,8 @@ bool fromJSON(const llvm::json::Value &Params, InitializationOptions &Opts,
 
   return fromJSON(Params, Opts.ConfigSettings, P) &&
          O.map("compilationDatabasePath", Opts.compilationDatabasePath) &&
-         O.mapOptional("fallbackFlags", Opts.fallbackFlags) &&
-         O.mapOptional("clangdFileStatus", Opts.FileStatus);
+         mapOptOrNull(Params, "fallbackFlags", Opts.fallbackFlags, P) &&
+         mapOptOrNull(Params, "clangdFileStatus", Opts.FileStatus, P);
 }
 
 bool fromJSON(const llvm::json::Value &E, TypeHierarchyDirection &Out,
@@ -1190,10 +1205,11 @@ bool fromJSON(const llvm::json::Value &Params, TypeHierarchyItem &I,
   return O && O.map("name", I.name) && O.map("kind", I.kind) &&
          O.map("uri", I.uri) && O.map("range", I.range) &&
          O.map("selectionRange", I.selectionRange) &&
-         O.mapOptional("detail", I.detail) &&
-         O.mapOptional("deprecated", I.deprecated) &&
-         O.mapOptional("parents", I.parents) &&
-         O.mapOptional("children", I.children) && O.mapOptional("data", I.data);
+         mapOptOrNull(Params, "detail", I.detail, P) &&
+         mapOptOrNull(Params, "deprecated", I.deprecated, P) &&
+         mapOptOrNull(Params, "parents", I.parents, P) &&
+         mapOptOrNull(Params, "children", I.children, P) &&
+         mapOptOrNull(Params, "data", I.data, P);
 }
 
 bool fromJSON(const llvm::json::Value &Params,
@@ -1238,7 +1254,7 @@ bool fromJSON(const llvm::json::Value &Params, CallHierarchyItem &I,
   return O && O.map("name", I.name) && O.map("kind", I.kind) &&
          O.map("uri", I.uri) && O.map("range", I.range) &&
          O.map("selectionRange", I.selectionRange) &&
-         O.mapOptional("data", I.data);
+         mapOptOrNull(Params, "data", I.data, P);
 }
 
 bool fromJSON(const llvm::json::Value &Params,

From b1106a5b3bc94f6da11682007d101823f81bad30 Mon Sep 17 00:00:00 2001
From: Simonas Kazlauskas <git@kazlauskas.me>
Date: Tue, 16 Feb 2021 13:35:32 -0800
Subject: [PATCH 133/318] [llvm-dwp] Join dwo paths correctly when DWOPath is
 absolute

When the `DWOPath` is absolute, we want to use `DWOPath` as is, without prepending any other
components to the path. The `sys::path::append` does not join, but rather unconditionally appends
the paths, so something like `sys::path::append("/tmp", "/tmp/banana")` will result in
`/tmp/tmp/banana` rather than the desired `/tmp/banana`.

This then causes `llvm-dwp` to fail in a following situation:

```
$ clang -gsplit-dwarf /tmp/banana/test.c -c -o /tmp/outdir/foo.o
$ clang outdir/foo.o -o outdir/hm
$ llvm-dwarfdump outdir/hm | grep -C2 foo.dwo
                  DW_AT_comp_dir    ("/tmp")
                  DW_AT_GNU_pubnames  (true)
                  DW_AT_GNU_dwo_name    ("/tmp/outdir/foo.dwo")
                                DW_AT_GNU_dwo_id    (0xde4d396f3bf0e257)
                  DW_AT_low_pc  (0x0000000000401100)
$ strace -o trace llvm-dwp -e outdir/hm -o outdir/hm.dwp
error: No such file or directory
$ cat trace | grep foo.dwo
openat(AT_FDCWD, "/tmp/tmp/outdir/foo.dwo", O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
```

Reviewed By: dblaikie

Differential Revision: https://reviews.llvm.org/D96678

(cherry picked from commit 6ffcb2937c96bd0d7a55b984b5eb8f381b68e322)
---
 .../tools/llvm-dwp/X86/absolute_paths.test    | 37 +++++++++++++++++++
 llvm/tools/llvm-dwp/llvm-dwp.cpp              |  4 +-
 2 files changed, 39 insertions(+), 2 deletions(-)
 create mode 100644 llvm/test/tools/llvm-dwp/X86/absolute_paths.test

diff --git a/llvm/test/tools/llvm-dwp/X86/absolute_paths.test b/llvm/test/tools/llvm-dwp/X86/absolute_paths.test
new file mode 100644
index 000000000000..1e3d27e7323b
--- /dev/null
+++ b/llvm/test/tools/llvm-dwp/X86/absolute_paths.test
@@ -0,0 +1,37 @@
+; RUN: rm -rf %t
+; RUN: mkdir -p %t
+; RUN: llc %s -mtriple=x86_64-linux --split-dwarf-file=%t/test.dwo --split-dwarf-output=%t/test.dwo --filetype=obj -o %t/test.o
+; RUN: llvm-dwarfdump -v %t/test.dwo | FileCheck %s -DPATH=%t
+; RUN: llvm-dwp -e %t/test.o -o %t/test.dwp
+; RUN: llvm-dwarfdump -v %t/test.dwp | FileCheck %s -DPATH=%t
+
+; CHECK-LABEL: .debug_abbrev.dwo contents:
+; CHECK: DW_AT_name
+; CHECK: DW_AT_GNU_dwo_name
+; CHECK: DW_AT_name
+; CHECK-LABEL: .debug_str.dwo contents:
+; CHECK: "banana"
+; CHECK: "/tmp/test.c"
+; CHECK: "[[PATH]]/test.dwo"
+
+define void @banana() !dbg !8 {
+  ret void, !dbg !12
+}
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4, !5, !6}
+!llvm.ident = !{!7}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 11.0.1", isOptimized: true, runtimeVersion: 0, splitDebugFilename: "test.dwo", emissionKind: FullDebug, enums: !2, splitDebugInlining: false, nameTableKind: GNU)
+!1 = !DIFile(filename: "/tmp/test.c", directory: "/tmp")
+!2 = !{}
+!3 = !{i32 7, !"Dwarf Version", i32 4}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = !{i32 1, !"wchar_size", i32 4}
+!6 = !{i32 7, !"PIC Level", i32 2}
+!7 = !{!"clang version 11.0.1"}
+!8 = distinct !DISubprogram(name: "banana", scope: !9, file: !9, line: 1, type: !10, scopeLine: 1, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2)
+!9 = !DIFile(filename: "test.c", directory: "/tmp")
+!10 = !DISubroutineType(types: !11)
+!11 = !{null}
+!12 = !DILocation(line: 1, column: 20, scope: !8)
diff --git a/llvm/tools/llvm-dwp/llvm-dwp.cpp b/llvm/tools/llvm-dwp/llvm-dwp.cpp
index 9aed3526b0aa..d495bd3d4cab 100644
--- a/llvm/tools/llvm-dwp/llvm-dwp.cpp
+++ b/llvm/tools/llvm-dwp/llvm-dwp.cpp
@@ -526,8 +526,8 @@ getDWOFilenames(StringRef ExecFilename) {
     std::string DWOCompDir =
         dwarf::toString(Die.find(dwarf::DW_AT_comp_dir), "");
     if (!DWOCompDir.empty()) {
-      SmallString<16> DWOPath;
-      sys::path::append(DWOPath, DWOCompDir, DWOName);
+      SmallString<16> DWOPath(std::move(DWOName));
+      sys::fs::make_absolute(DWOCompDir, DWOPath);
       DWOPaths.emplace_back(DWOPath.data(), DWOPath.size());
     } else {
       DWOPaths.push_back(std::move(DWOName));

From 0d4f8a3f394f55b5fde7033bf009e5dacea1a775 Mon Sep 17 00:00:00 2001
From: Georgii Rymar <grimar@accesssoftek.com>
Date: Thu, 28 Jan 2021 16:35:18 +0300
Subject: [PATCH 134/318] [llvm-symbolizer] - Fix the crash in GNU output style
 with --no-inlines and missing input file.

Fixes https://bugs.llvm.org/show_bug.cgi?id=48882.

If the input file does not exist (or has a reading error), the
following code will crash if there are two or more input addresses.

```
auto ResOrErr = Symbolizer.symbolizeInlinedCode(
  ModuleName, {Offset, object::SectionedAddress::UndefSection});
Printer << (error(ResOrErr) ? DILineInfo() : ResOrErr.get().getFrame(0));
```

For the first address, `symbolizeInlinedCode` returns an error.
For the second address, `symbolizeInlinedCode` returns an empty result
(not an error) and `.getFrame(0)` will crash.

Differential revision: https://reviews.llvm.org/D95609

(cherry picked from commit d22140687500f90830fe416d9c1e317f7c4535d5)
---
 .../llvm-symbolizer/output-style-inlined.test | 21 +++++++++++++++++++
 .../tools/llvm-symbolizer/llvm-symbolizer.cpp |  7 ++++++-
 2 files changed, 27 insertions(+), 1 deletion(-)

diff --git a/llvm/test/tools/llvm-symbolizer/output-style-inlined.test b/llvm/test/tools/llvm-symbolizer/output-style-inlined.test
index 7e9f7e7ce180..1b8e3a2f22fb 100644
--- a/llvm/test/tools/llvm-symbolizer/output-style-inlined.test
+++ b/llvm/test/tools/llvm-symbolizer/output-style-inlined.test
@@ -28,3 +28,24 @@ RUN:   | FileCheck %s --check-prefix=LLVM --implicit-check-not=inctwo
 
 LLVM: main
 GNU: inctwo
+
+## Check that we are able to produce an output properly when the --no-inlines option
+## is specified, but a file doesn't exist. Check we report an error.
+
+RUN: llvm-symbolizer --output-style=GNU --obj=%p/Inputs/not.exist 0x1 0x2 --no-inlines 2>&1 \
+RUN:   | FileCheck %s --check-prefix=NOT-EXIST-GNU -DMSG=%errc_ENOENT
+RUN: llvm-symbolizer --output-style=LLVM --obj=%p/Inputs/not.exist 0x1 0x2 --no-inlines 2>&1 \
+RUN:   | FileCheck %s --check-prefix=NOT-EXIST-LLVM -DMSG=%errc_ENOENT
+
+# NOT-EXIST-GNU:      LLVMSymbolizer: error reading file: [[MSG]]
+# NOT-EXIST-GNU-NEXT: ??
+# NOT-EXIST-GNU-NEXT: ??:0
+# NOT-EXIST-GNU-NEXT: ??
+# NOT-EXIST-GNU-NEXT: ??:0
+
+# NOT-EXIST-LLVM:       LLVMSymbolizer: error reading file: [[MSG]]
+# NOT-EXIST-LLVM-NEXT:  ??
+# NOT-EXIST-LLVM-NEXT:  ??:0:0
+# NOT-EXIST-LLVM-EMPTY:
+# NOT-EXIST-LLVM-NEXT:  ??
+# NOT-EXIST-LLVM-NEXT:  ??:0:0
diff --git a/llvm/tools/llvm-symbolizer/llvm-symbolizer.cpp b/llvm/tools/llvm-symbolizer/llvm-symbolizer.cpp
index 9c68acee0ae2..8734c2d74045 100644
--- a/llvm/tools/llvm-symbolizer/llvm-symbolizer.cpp
+++ b/llvm/tools/llvm-symbolizer/llvm-symbolizer.cpp
@@ -181,7 +181,12 @@ static void symbolizeInput(const opt::InputArgList &Args, uint64_t AdjustVMA,
     // the topmost function, which suits our needs better.
     auto ResOrErr = Symbolizer.symbolizeInlinedCode(
         ModuleName, {Offset, object::SectionedAddress::UndefSection});
-    Printer << (error(ResOrErr) ? DILineInfo() : ResOrErr.get().getFrame(0));
+    if (!ResOrErr || ResOrErr->getNumberOfFrames() == 0) {
+      error(ResOrErr);
+      Printer << DILineInfo();
+    } else {
+      Printer << ResOrErr->getFrame(0);
+    }
   } else {
     auto ResOrErr = Symbolizer.symbolizeCode(
         ModuleName, {Offset, object::SectionedAddress::UndefSection});

From d3f9f512a47f10d27a9e32edaaa7513a64b0ec17 Mon Sep 17 00:00:00 2001
From: "William S. Moses" <gh@wsmoses.com>
Date: Mon, 1 Feb 2021 18:16:17 -0500
Subject: [PATCH 135/318] [SROA] Propagate correct TBAA/TBAA Struct offsets

SROA does not correctly account for offsets in TBAA/TBAA struct metadata.
This patch creates functionality for generating new MD with the corresponding
offset and updates SROA to use this functionality.

Differential Revision: https://reviews.llvm.org/D95826

(cherry picked from commit 40862b1a7486a969ff044cd240aad24f4183cc10)
---
 llvm/include/llvm/IR/Metadata.h              |  18 +++
 llvm/include/llvm/IR/Operator.h              |   5 +
 llvm/lib/Analysis/TypeBasedAliasAnalysis.cpp |  81 ++++++++++++
 llvm/lib/IR/Operator.cpp                     |  20 ++-
 llvm/lib/Transforms/Scalar/SROA.cpp          |  38 ++++--
 llvm/test/Transforms/SROA/basictest.ll       | 128 +++++++++++--------
 llvm/test/Transforms/SROA/tbaa-struct2.ll    |  51 ++++++++
 7 files changed, 269 insertions(+), 72 deletions(-)
 create mode 100644 llvm/test/Transforms/SROA/tbaa-struct2.ll

diff --git a/llvm/include/llvm/IR/Metadata.h b/llvm/include/llvm/IR/Metadata.h
index 0b87416befe9..9a4480b75a30 100644
--- a/llvm/include/llvm/IR/Metadata.h
+++ b/llvm/include/llvm/IR/Metadata.h
@@ -667,6 +667,12 @@ struct AAMDNodes {
   /// The tag specifying the noalias scope.
   MDNode *NoAlias = nullptr;
 
+  // Shift tbaa Metadata node to start off bytes later
+  static MDNode *ShiftTBAA(MDNode *M, size_t off);
+
+  // Shift tbaa.struct Metadata node to start off bytes later
+  static MDNode *ShiftTBAAStruct(MDNode *M, size_t off);
+
   /// Given two sets of AAMDNodes that apply to the same pointer,
   /// give the best AAMDNodes that are compatible with both (i.e. a set of
   /// nodes whose allowable aliasing conclusions are a subset of those
@@ -680,6 +686,18 @@ struct AAMDNodes {
     Result.NoAlias = Other.NoAlias == NoAlias ? NoAlias : nullptr;
     return Result;
   }
+
+  /// Create a new AAMDNode that describes this AAMDNode after applying a
+  /// constant offset to the start of the pointer
+  AAMDNodes shift(size_t Offset) {
+    AAMDNodes Result;
+    Result.TBAA = TBAA ? ShiftTBAA(TBAA, Offset) : nullptr;
+    Result.TBAAStruct =
+        TBAAStruct ? ShiftTBAAStruct(TBAAStruct, Offset) : nullptr;
+    Result.Scope = Scope;
+    Result.NoAlias = NoAlias;
+    return Result;
+  }
 };
 
 // Specialize DenseMapInfo for AAMDNodes.
diff --git a/llvm/include/llvm/IR/Operator.h b/llvm/include/llvm/IR/Operator.h
index acfacbd6c74e..945f7e46e142 100644
--- a/llvm/include/llvm/IR/Operator.h
+++ b/llvm/include/llvm/IR/Operator.h
@@ -568,6 +568,11 @@ class GEPOperator
   bool accumulateConstantOffset(
       const DataLayout &DL, APInt &Offset,
       function_ref<bool(Value &, APInt &)> ExternalAnalysis = nullptr) const;
+
+  static bool accumulateConstantOffset(
+      Type *SourceType, ArrayRef<const Value *> Index, const DataLayout &DL,
+      APInt &Offset,
+      function_ref<bool(Value &, APInt &)> ExternalAnalysis = nullptr);
 };
 
 class PtrToIntOperator
diff --git a/llvm/lib/Analysis/TypeBasedAliasAnalysis.cpp b/llvm/lib/Analysis/TypeBasedAliasAnalysis.cpp
index 7d97fc5da9b0..268acb682cf1 100644
--- a/llvm/lib/Analysis/TypeBasedAliasAnalysis.cpp
+++ b/llvm/lib/Analysis/TypeBasedAliasAnalysis.cpp
@@ -737,3 +737,84 @@ bool TypeBasedAAWrapperPass::doFinalization(Module &M) {
 void TypeBasedAAWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.setPreservesAll();
 }
+
+MDNode *AAMDNodes::ShiftTBAA(MDNode *MD, size_t Offset) {
+  // Fast path if there's no offset
+  if (Offset == 0)
+    return MD;
+  // Fast path if there's no path tbaa node (and thus scalar)
+  if (!isStructPathTBAA(MD))
+    return MD;
+
+  TBAAStructTagNode Tag(MD);
+  SmallVector<Metadata *, 5> Sub;
+  Sub.push_back(MD->getOperand(0));
+  Sub.push_back(MD->getOperand(1));
+  ConstantInt *InnerOffset = mdconst::extract<ConstantInt>(MD->getOperand(2));
+
+  if (Tag.isNewFormat()) {
+    ConstantInt *InnerSize = mdconst::extract<ConstantInt>(MD->getOperand(3));
+
+    if (InnerOffset->getZExtValue() + InnerSize->getZExtValue() <= Offset) {
+      return nullptr;
+    }
+
+    uint64_t NewSize = InnerSize->getZExtValue();
+    uint64_t NewOffset = InnerOffset->getZExtValue() - Offset;
+    if (InnerOffset->getZExtValue() < Offset) {
+      NewOffset = 0;
+      NewSize -= Offset - InnerOffset->getZExtValue();
+    }
+
+    Sub.push_back(ConstantAsMetadata::get(
+        ConstantInt::get(InnerOffset->getType(), NewOffset)));
+
+    Sub.push_back(ConstantAsMetadata::get(
+        ConstantInt::get(InnerSize->getType(), NewSize)));
+
+    // immutable type
+    if (MD->getNumOperands() >= 5)
+      Sub.push_back(MD->getOperand(4));
+  } else {
+    if (InnerOffset->getZExtValue() < Offset)
+      return nullptr;
+
+    Sub.push_back(ConstantAsMetadata::get(ConstantInt::get(
+        InnerOffset->getType(), InnerOffset->getZExtValue() - Offset)));
+
+    // immutable type
+    if (MD->getNumOperands() >= 4)
+      Sub.push_back(MD->getOperand(3));
+  }
+  return MDNode::get(MD->getContext(), Sub);
+}
+
+MDNode *AAMDNodes::ShiftTBAAStruct(MDNode *MD, size_t Offset) {
+  // Fast path if there's no offset
+  if (Offset == 0)
+    return MD;
+  SmallVector<Metadata *, 3> Sub;
+  for (size_t i = 0, size = MD->getNumOperands(); i < size; i += 3) {
+    ConstantInt *InnerOffset = mdconst::extract<ConstantInt>(MD->getOperand(i));
+    ConstantInt *InnerSize =
+        mdconst::extract<ConstantInt>(MD->getOperand(i + 1));
+    // Don't include any triples that aren't in bounds
+    if (InnerOffset->getZExtValue() + InnerSize->getZExtValue() <= Offset)
+      continue;
+
+    uint64_t NewSize = InnerSize->getZExtValue();
+    uint64_t NewOffset = InnerOffset->getZExtValue() - Offset;
+    if (InnerOffset->getZExtValue() < Offset) {
+      NewOffset = 0;
+      NewSize -= Offset - InnerOffset->getZExtValue();
+    }
+
+    // Shift the offset of the triple
+    Sub.push_back(ConstantAsMetadata::get(
+        ConstantInt::get(InnerOffset->getType(), NewOffset)));
+    Sub.push_back(ConstantAsMetadata::get(
+        ConstantInt::get(InnerSize->getType(), NewSize)));
+    Sub.push_back(MD->getOperand(i + 2));
+  }
+  return MDNode::get(MD->getContext(), Sub);
+}
\ No newline at end of file
diff --git a/llvm/lib/IR/Operator.cpp b/llvm/lib/IR/Operator.cpp
index 0f70fc37dee2..69181f35827b 100644
--- a/llvm/lib/IR/Operator.cpp
+++ b/llvm/lib/IR/Operator.cpp
@@ -61,10 +61,17 @@ Align GEPOperator::getMaxPreservedAlignment(const DataLayout &DL) const {
 bool GEPOperator::accumulateConstantOffset(
     const DataLayout &DL, APInt &Offset,
     function_ref<bool(Value &, APInt &)> ExternalAnalysis) const {
-   assert(Offset.getBitWidth() ==
-              DL.getIndexSizeInBits(getPointerAddressSpace()) &&
-          "The offset bit width does not match DL specification.");
+  assert(Offset.getBitWidth() ==
+             DL.getIndexSizeInBits(getPointerAddressSpace()) &&
+         "The offset bit width does not match DL specification.");
+  SmallVector<const Value *> Index(value_op_begin() + 1, value_op_end());
+  return GEPOperator::accumulateConstantOffset(getSourceElementType(), Index,
+                                               DL, Offset, ExternalAnalysis);
+}
 
+bool GEPOperator::accumulateConstantOffset(
+    Type *SourceType, ArrayRef<const Value *> Index, const DataLayout &DL,
+    APInt &Offset, function_ref<bool(Value &, APInt &)> ExternalAnalysis) {
   bool UsedExternalAnalysis = false;
   auto AccumulateOffset = [&](APInt Index, uint64_t Size) -> bool {
     Index = Index.sextOrTrunc(Offset.getBitWidth());
@@ -85,9 +92,10 @@ bool GEPOperator::accumulateConstantOffset(
     }
     return true;
   };
-
-  for (gep_type_iterator GTI = gep_type_begin(this), GTE = gep_type_end(this);
-       GTI != GTE; ++GTI) {
+  auto begin = generic_gep_type_iterator<decltype(Index.begin())>::begin(
+      SourceType, Index.begin());
+  auto end = generic_gep_type_iterator<decltype(Index.end())>::end(Index.end());
+  for (auto GTI = begin, GTE = end; GTI != GTE; ++GTI) {
     // Scalable vectors are multiplied by a runtime constant.
     bool ScalableType = false;
     if (isa<ScalableVectorType>(GTI.getIndexedType()))
diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp
index d111a6ba4241..af510f1a84bf 100644
--- a/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -2524,7 +2524,7 @@ class llvm::sroa::AllocaSliceRewriter
                                               NewAI.getAlign(), LI.isVolatile(),
                                               LI.getName());
       if (AATags)
-        NewLI->setAAMetadata(AATags);
+        NewLI->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset));
       if (LI.isVolatile())
         NewLI->setAtomic(LI.getOrdering(), LI.getSyncScopeID());
       if (NewLI->isAtomic())
@@ -2563,7 +2563,7 @@ class llvm::sroa::AllocaSliceRewriter
           IRB.CreateAlignedLoad(TargetTy, getNewAllocaSlicePtr(IRB, LTy),
                                 getSliceAlign(), LI.isVolatile(), LI.getName());
       if (AATags)
-        NewLI->setAAMetadata(AATags);
+        NewLI->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset));
       if (LI.isVolatile())
         NewLI->setAtomic(LI.getOrdering(), LI.getSyncScopeID());
 
@@ -2626,7 +2626,7 @@ class llvm::sroa::AllocaSliceRewriter
     }
     StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlign());
     if (AATags)
-      Store->setAAMetadata(AATags);
+      Store->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset));
     Pass.DeadInsts.push_back(&SI);
 
     LLVM_DEBUG(dbgs() << "          to: " << *Store << "\n");
@@ -2650,7 +2650,7 @@ class llvm::sroa::AllocaSliceRewriter
     Store->copyMetadata(SI, {LLVMContext::MD_mem_parallel_loop_access,
                              LLVMContext::MD_access_group});
     if (AATags)
-      Store->setAAMetadata(AATags);
+      Store->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset));
     Pass.DeadInsts.push_back(&SI);
     LLVM_DEBUG(dbgs() << "          to: " << *Store << "\n");
     return true;
@@ -2720,7 +2720,7 @@ class llvm::sroa::AllocaSliceRewriter
     NewSI->copyMetadata(SI, {LLVMContext::MD_mem_parallel_loop_access,
                              LLVMContext::MD_access_group});
     if (AATags)
-      NewSI->setAAMetadata(AATags);
+      NewSI->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset));
     if (SI.isVolatile())
       NewSI->setAtomic(SI.getOrdering(), SI.getSyncScopeID());
     if (NewSI->isAtomic())
@@ -2816,7 +2816,7 @@ class llvm::sroa::AllocaSliceRewriter
           getNewAllocaSlicePtr(IRB, OldPtr->getType()), II.getValue(), Size,
           MaybeAlign(getSliceAlign()), II.isVolatile());
       if (AATags)
-        New->setAAMetadata(AATags);
+        New->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset));
       LLVM_DEBUG(dbgs() << "          to: " << *New << "\n");
       return false;
     }
@@ -2885,7 +2885,7 @@ class llvm::sroa::AllocaSliceRewriter
     StoreInst *New =
         IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlign(), II.isVolatile());
     if (AATags)
-      New->setAAMetadata(AATags);
+      New->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset));
     LLVM_DEBUG(dbgs() << "          to: " << *New << "\n");
     return !II.isVolatile();
   }
@@ -3006,7 +3006,7 @@ class llvm::sroa::AllocaSliceRewriter
       CallInst *New = IRB.CreateMemCpy(DestPtr, DestAlign, SrcPtr, SrcAlign,
                                        Size, II.isVolatile());
       if (AATags)
-        New->setAAMetadata(AATags);
+        New->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset));
       LLVM_DEBUG(dbgs() << "          to: " << *New << "\n");
       return false;
     }
@@ -3060,7 +3060,7 @@ class llvm::sroa::AllocaSliceRewriter
       LoadInst *Load = IRB.CreateAlignedLoad(OtherTy, SrcPtr, SrcAlign,
                                              II.isVolatile(), "copyload");
       if (AATags)
-        Load->setAAMetadata(AATags);
+        Load->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset));
       Src = Load;
     }
 
@@ -3080,7 +3080,7 @@ class llvm::sroa::AllocaSliceRewriter
     StoreInst *Store = cast<StoreInst>(
         IRB.CreateAlignedStore(Src, DstPtr, DstAlign, II.isVolatile()));
     if (AATags)
-      Store->setAAMetadata(AATags);
+      Store->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset));
     LLVM_DEBUG(dbgs() << "          to: " << *Store << "\n");
     return !II.isVolatile();
   }
@@ -3381,8 +3381,13 @@ class AggLoadStoreRewriter : public InstVisitor<AggLoadStoreRewriter, bool> {
           IRB.CreateInBoundsGEP(BaseTy, Ptr, GEPIndices, Name + ".gep");
       LoadInst *Load =
           IRB.CreateAlignedLoad(Ty, GEP, Alignment, Name + ".load");
-      if (AATags)
-        Load->setAAMetadata(AATags);
+
+      APInt Offset(
+          DL.getIndexSizeInBits(Ptr->getType()->getPointerAddressSpace()), 0);
+      if (AATags &&
+          GEPOperator::accumulateConstantOffset(BaseTy, GEPIndices, DL, Offset))
+        Load->setAAMetadata(AATags.shift(Offset.getZExtValue()));
+
       Agg = IRB.CreateInsertValue(Agg, Load, Indices, Name + ".insert");
       LLVM_DEBUG(dbgs() << "          to: " << *Load << "\n");
     }
@@ -3428,8 +3433,13 @@ class AggLoadStoreRewriter : public InstVisitor<AggLoadStoreRewriter, bool> {
           IRB.CreateInBoundsGEP(BaseTy, Ptr, GEPIndices, Name + ".gep");
       StoreInst *Store =
           IRB.CreateAlignedStore(ExtractValue, InBoundsGEP, Alignment);
-      if (AATags)
-        Store->setAAMetadata(AATags);
+
+      APInt Offset(
+          DL.getIndexSizeInBits(Ptr->getType()->getPointerAddressSpace()), 0);
+      if (AATags &&
+          GEPOperator::accumulateConstantOffset(BaseTy, GEPIndices, DL, Offset))
+        Store->setAAMetadata(AATags.shift(Offset.getZExtValue()));
+
       LLVM_DEBUG(dbgs() << "          to: " << *Store << "\n");
     }
   };
diff --git a/llvm/test/Transforms/SROA/basictest.ll b/llvm/test/Transforms/SROA/basictest.ll
index fea4f995a57e..d15d01e096f6 100644
--- a/llvm/test/Transforms/SROA/basictest.ll
+++ b/llvm/test/Transforms/SROA/basictest.ll
@@ -145,27 +145,28 @@ entry:
 ; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds [42 x i8], [42 x i8]* %[[test3_a1]], i64 0, i64 0
 ; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %[[gep_dst]], i8* align 8 %src, i32 42, {{.*}}), !tbaa [[TAG_0:!.*]]
 ; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds i8, i8* %src, i64 42
-; CHECK-NEXT: %[[test3_r1:.*]] = load i8, i8* %[[gep]], {{.*}}, !tbaa [[TAG_0]]
+; CHECK-NEXT: %[[test3_r1:.*]] = load i8, i8* %[[gep]], {{.*}}, !tbaa [[TAG_0_M42:!.*]]
 ; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds i8, i8* %src, i64 43
 ; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds [99 x i8], [99 x i8]* %[[test3_a2]], i64 0, i64 0
-; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %[[gep_dst]], i8* align 1 %[[gep_src]], i32 99, {{.*}}), !tbaa [[TAG_0:!.*]]
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %[[gep_dst]], i8* align 1 %[[gep_src]], i32 99, {{.*}}), !tbaa [[TAG_0_M43:!.*]]
 ; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds i8, i8* %src, i64 142
 ; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds [16 x i8], [16 x i8]* %[[test3_a3]], i64 0, i64 0
-; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %[[gep_dst]], i8* align 2 %[[gep_src]], i32 16, {{.*}}), !tbaa [[TAG_0:!.*]]
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %[[gep_dst]], i8* align 2 %[[gep_src]], i32 16, {{.*}}), !tbaa [[TAG_0_M142:!.*]]
 ; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds i8, i8* %src, i64 158
 ; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds [42 x i8], [42 x i8]* %[[test3_a4]], i64 0, i64 0
-; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %[[gep_dst]], i8* align 2 %[[gep_src]], i32 42, {{.*}}), !tbaa [[TAG_0:!.*]]
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %[[gep_dst]], i8* align 2 %[[gep_src]], i32 42, {{.*}}), !tbaa [[TAG_0_M158:!.*]]
 ; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds i8, i8* %src, i64 200
 ; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds [7 x i8], [7 x i8]* %[[test3_a5]], i64 0, i64 0
-; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %[[gep_dst]], i8* align 8 %[[gep_src]], i32 7, {{.*}}), !tbaa [[TAG_0:!.*]]
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %[[gep_dst]], i8* align 8 %[[gep_src]], i32 7, {{.*}})
 ; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds i8, i8* %src, i64 207
-; CHECK-NEXT: %[[test3_r2:.*]] = load i8, i8* %[[gep]], {{.*}}, !tbaa [[TAG_0]]
+; CHECK-NOT:  %[[bad_test3_r2:.*]] = load i8, i8* %[[gep]], {{.*}}, !tbaa
+; CHECK-NEXT: %[[test3_r2:.*]] = load i8, i8* %[[gep]], {{.*}}
 ; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds i8, i8* %src, i64 208
 ; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds [7 x i8], [7 x i8]* %[[test3_a6]], i64 0, i64 0
-; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %[[gep_dst]], i8* align 8 %[[gep_src]], i32 7, {{.*}}), !tbaa [[TAG_0:!.*]]
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %[[gep_dst]], i8* align 8 %[[gep_src]], i32 7, {{.*}})
 ; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds i8, i8* %src, i64 215
 ; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds [85 x i8], [85 x i8]* %[[test3_a7]], i64 0, i64 0
-; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %[[gep_dst]], i8* align 1 %[[gep_src]], i32 85, {{.*}}), !tbaa [[TAG_0:!.*]]
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %[[gep_dst]], i8* align 1 %[[gep_src]], i32 85, {{.*}})
 
   ; Clobber a single element of the array, this should be promotable, and be deleted.
   %c = getelementptr [300 x i8], [300 x i8]* %a, i64 0, i64 42
@@ -310,7 +311,7 @@ entry:
 ; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %[[gep_dst]], i8* align 1 %src, i32 3, {{.*}}), !tbaa [[TAG_51:!.*]]
 ; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds i8, i8* %src, i64 3
 ; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds [7 x i8], [7 x i8]* %[[test3_a5]], i64 0, i64 0
-; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %[[gep_dst]], i8* align 1 %[[gep_src]], i32 5, {{.*}}), !tbaa [[TAG_51]]
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %[[gep_dst]], i8* align 1 %[[gep_src]], i32 5, {{.*}})
 
   ; Bridge between the overlapping areas
   call void @llvm.memset.p0i8.i32(i8* %overlap2.1.2.i8, i8 42, i32 8, i1 false), !tbaa !53
@@ -318,7 +319,7 @@ entry:
 ; CHECK-NEXT: call void @llvm.memset.p0i8.i32(i8* align 1 %[[gep]], i8 42, i32 5, {{.*}}), !tbaa [[TAG_53:!.*]]
 ; ...promoted i8 store...
 ; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds [7 x i8], [7 x i8]* %[[test3_a6]], i64 0, i64 0
-; CHECK-NEXT: call void @llvm.memset.p0i8.i32(i8* align 1 %[[gep]], i8 42, i32 2, {{.*}}), !tbaa [[TAG_53]]
+; CHECK-NEXT: call void @llvm.memset.p0i8.i32(i8* align 1 %[[gep]], i8 42, i32 2, {{.*}})
 
   ; Entirely within the second overlap.
   call void @llvm.memcpy.p0i8.p0i8.i32(i8* %overlap2.2.1.i8, i8* %src, i32 5, i1 false), !tbaa !55
@@ -331,33 +332,33 @@ entry:
 ; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %[[gep]], i8* align 1 %src, i32 5, {{.*}}), !tbaa [[TAG_57:!.*]]
 ; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds i8, i8* %src, i64 5
 ; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds [85 x i8], [85 x i8]* %[[test3_a7]], i64 0, i64 0
-; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %[[gep_dst]], i8* align 1 %[[gep_src]], i32 3, {{.*}}), !tbaa [[TAG_57]]
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %[[gep_dst]], i8* align 1 %[[gep_src]], i32 3, {{.*}})
 
   call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dst, i8* %b, i32 300, i1 false), !tbaa !59
 ; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds [42 x i8], [42 x i8]* %[[test3_a1]], i64 0, i64 0
 ; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %dst, i8* align 1 %[[gep]], i32 42, {{.*}}), !tbaa [[TAG_59:!.*]]
 ; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds i8, i8* %dst, i64 42
-; CHECK-NEXT: store i8 0, i8* %[[gep]], {{.*}}, !tbaa [[TAG_59]]
+; CHECK-NEXT: store i8 0, i8* %[[gep]], {{.*}}
 ; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds i8, i8* %dst, i64 43
 ; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds [99 x i8], [99 x i8]* %[[test3_a2]], i64 0, i64 0
-; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %[[gep_dst]], i8* align 1 %[[gep_src]], i32 99, {{.*}}), !tbaa [[TAG_59]]
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %[[gep_dst]], i8* align 1 %[[gep_src]], i32 99, {{.*}})
 ; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds i8, i8* %dst, i64 142
 ; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds [16 x i8], [16 x i8]* %[[test3_a3]], i64 0, i64 0
-; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %[[gep_dst]], i8* align 1 %[[gep_src]], i32 16, {{.*}}), !tbaa [[TAG_59]]
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %[[gep_dst]], i8* align 1 %[[gep_src]], i32 16, {{.*}})
 ; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds i8, i8* %dst, i64 158
 ; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds [42 x i8], [42 x i8]* %[[test3_a4]], i64 0, i64 0
-; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %[[gep_dst]], i8* align 1 %[[gep_src]], i32 42, {{.*}}), !tbaa [[TAG_59]]
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %[[gep_dst]], i8* align 1 %[[gep_src]], i32 42, {{.*}})
 ; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds i8, i8* %dst, i64 200
 ; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds [7 x i8], [7 x i8]* %[[test3_a5]], i64 0, i64 0
-; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %[[gep_dst]], i8* align 1 %[[gep_src]], i32 7, {{.*}}), !tbaa [[TAG_59]]
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %[[gep_dst]], i8* align 1 %[[gep_src]], i32 7, {{.*}})
 ; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds i8, i8* %dst, i64 207
-; CHECK-NEXT: store i8 42, i8* %[[gep]], {{.*}}, !tbaa [[TAG_59]]
+; CHECK-NEXT: store i8 42, i8* %[[gep]], {{.*}}
 ; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds i8, i8* %dst, i64 208
 ; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds [7 x i8], [7 x i8]* %[[test3_a6]], i64 0, i64 0
-; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %[[gep_dst]], i8* align 1 %[[gep_src]], i32 7, {{.*}}), !tbaa [[TAG_59]]
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %[[gep_dst]], i8* align 1 %[[gep_src]], i32 7, {{.*}})
 ; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds i8, i8* %dst, i64 215
 ; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds [85 x i8], [85 x i8]* %[[test3_a7]], i64 0, i64 0
-; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %[[gep_dst]], i8* align 1 %[[gep_src]], i32 85, {{.*}}), !tbaa [[TAG_59]]
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %[[gep_dst]], i8* align 1 %[[gep_src]], i32 85, {{.*}})
 
   ret void
 }
@@ -381,41 +382,41 @@ entry:
 ; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %[[gep]], i8* align 1 %src, i32 20, {{.*}}), !tbaa [[TAG_0]]
 ; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds i8, i8* %src, i64 20
 ; CHECK-NEXT: %[[bitcast:.*]] = bitcast i8* %[[gep]] to i16*
-; CHECK-NEXT: %[[test4_r1:.*]] = load i16, i16* %[[bitcast]], {{.*}}, !tbaa [[TAG_0]]
+; CHECK-NEXT: %[[test4_r1:.*]] = load i16, i16* %[[bitcast]], {{.*}}, !tbaa [[TAG_0_M20:!.*]]
 ; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds i8, i8* %src, i64 22
-; CHECK-NEXT: %[[test4_r2:.*]] = load i8, i8* %[[gep]], {{.*}}, !tbaa [[TAG_0]]
+; CHECK-NEXT: %[[test4_r2:.*]] = load i8, i8* %[[gep]], {{.*}}, !tbaa [[TAG_0_M22:!.*]]
 ; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds i8, i8* %src, i64 23
 ; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds [7 x i8], [7 x i8]* %[[test4_a2]], i64 0, i64 0
-; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %[[gep_dst]], i8* align 1 %[[gep_src]], i32 7, {{.*}}), !tbaa [[TAG_0]]
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %[[gep_dst]], i8* align 1 %[[gep_src]], i32 7, {{.*}}), !tbaa [[TAG_0_M23:!.*]]
 ; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds i8, i8* %src, i64 30
 ; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds [10 x i8], [10 x i8]* %[[test4_a3]], i64 0, i64 0
-; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %[[gep_dst]], i8* align 1 %[[gep_src]], i32 10, {{.*}}), !tbaa [[TAG_0]]
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %[[gep_dst]], i8* align 1 %[[gep_src]], i32 10, {{.*}}), !tbaa [[TAG_0_M30:!.*]]
 ; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds i8, i8* %src, i64 40
 ; CHECK-NEXT: %[[bitcast:.*]] = bitcast i8* %[[gep]] to i16*
-; CHECK-NEXT: %[[test4_r3:.*]] = load i16, i16* %[[bitcast]], {{.*}}, !tbaa [[TAG_0]]
+; CHECK-NEXT: %[[test4_r3:.*]] = load i16, i16* %[[bitcast]], {{.*}}, !tbaa [[TAG_0_M40:!.*]]
 ; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds i8, i8* %src, i64 42
-; CHECK-NEXT: %[[test4_r4:.*]] = load i8, i8* %[[gep]], {{.*}}, !tbaa [[TAG_0]]
+; CHECK-NEXT: %[[test4_r4:.*]] = load i8, i8* %[[gep]], {{.*}}, !tbaa [[TAG_0_M42]]
 ; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds i8, i8* %src, i64 43
 ; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds [7 x i8], [7 x i8]* %[[test4_a4]], i64 0, i64 0
-; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %[[gep_dst]], i8* align 1 %[[gep_src]], i32 7, {{.*}}), !tbaa [[TAG_0]]
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %[[gep_dst]], i8* align 1 %[[gep_src]], i32 7, {{.*}}), !tbaa [[TAG_0_M43]]
 ; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds i8, i8* %src, i64 50
 ; CHECK-NEXT: %[[bitcast:.*]] = bitcast i8* %[[gep]] to i16*
-; CHECK-NEXT: %[[test4_r5:.*]] = load i16, i16* %[[bitcast]], {{.*}}, !tbaa [[TAG_0]]
+; CHECK-NEXT: %[[test4_r5:.*]] = load i16, i16* %[[bitcast]], {{.*}}, !tbaa [[TAG_0_M50:!.*]]
 ; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds i8, i8* %src, i64 52
-; CHECK-NEXT: %[[test4_r6:.*]] = load i8, i8* %[[gep]], {{.*}}, !tbaa [[TAG_0]]
+; CHECK-NEXT: %[[test4_r6:.*]] = load i8, i8* %[[gep]], {{.*}}, !tbaa [[TAG_0_M52:!.*]]
 ; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds i8, i8* %src, i64 53
 ; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds [7 x i8], [7 x i8]* %[[test4_a5]], i64 0, i64 0
-; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %[[gep_dst]], i8* align 1 %[[gep_src]], i32 7, {{.*}}), !tbaa [[TAG_0]]
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %[[gep_dst]], i8* align 1 %[[gep_src]], i32 7, {{.*}}), !tbaa [[TAG_0_M53:!.+]]
 ; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds i8, i8* %src, i64 60
 ; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds [40 x i8], [40 x i8]* %[[test4_a6]], i64 0, i64 0
-; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %[[gep_dst]], i8* align 1 %[[gep_src]], i32 40, {{.*}}), !tbaa [[TAG_0]]
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %[[gep_dst]], i8* align 1 %[[gep_src]], i32 40, {{.*}}), !tbaa [[TAG_0_M60:!.+]]
 
   %a.src.1 = getelementptr [100 x i8], [100 x i8]* %a, i64 0, i64 20
   %a.dst.1 = getelementptr [100 x i8], [100 x i8]* %a, i64 0, i64 40
   call void @llvm.memcpy.p0i8.p0i8.i32(i8* %a.dst.1, i8* %a.src.1, i32 10, i1 false), !tbaa !3
 ; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds [7 x i8], [7 x i8]* %[[test4_a4]], i64 0, i64 0
 ; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds [7 x i8], [7 x i8]* %[[test4_a2]], i64 0, i64 0
-; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %[[gep_dst]], i8* align 1 %[[gep_src]], i32 7, {{.*}}), !tbaa [[TAG_3]]
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %[[gep_dst]], i8* align 1 %[[gep_src]], i32 7, {{.*}})
 
   ; Clobber a single element of the array, this should be promotable, and be deleted.
   %c = getelementptr [100 x i8], [100 x i8]* %a, i64 0, i64 42
@@ -425,41 +426,41 @@ entry:
   call void @llvm.memmove.p0i8.p0i8.i32(i8* %a.dst.1, i8* %a.src.2, i32 10, i1 false), !tbaa !5
 ; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds [7 x i8], [7 x i8]* %[[test4_a4]], i64 0, i64 0
 ; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds [7 x i8], [7 x i8]* %[[test4_a5]], i64 0, i64 0
-; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %[[gep_dst]], i8* align 1 %[[gep_src]], i32 7, {{.*}}), !tbaa [[TAG_5]]
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %[[gep_dst]], i8* align 1 %[[gep_src]], i32 7, {{.*}})
 
   call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dst, i8* %b, i32 100, i1 false), !tbaa !7
 ; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds [20 x i8], [20 x i8]* %[[test4_a1]], i64 0, i64 0
 ; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %dst, i8* align 1 %[[gep]], i32 20, {{.*}}), !tbaa [[TAG_7]]
 ; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds i8, i8* %dst, i64 20
 ; CHECK-NEXT: %[[bitcast:.*]] = bitcast i8* %[[gep]] to i16*
-; CHECK-NEXT: store i16 %[[test4_r1]], i16* %[[bitcast]], {{.*}}, !tbaa [[TAG_7]]
+; CHECK-NEXT: store i16 %[[test4_r1]], i16* %[[bitcast]], {{.*}}
 ; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds i8, i8* %dst, i64 22
-; CHECK-NEXT: store i8 %[[test4_r2]], i8* %[[gep]], {{.*}}, !tbaa [[TAG_7]]
+; CHECK-NEXT: store i8 %[[test4_r2]], i8* %[[gep]], {{.*}}
 ; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds i8, i8* %dst, i64 23
 ; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds [7 x i8], [7 x i8]* %[[test4_a2]], i64 0, i64 0
-; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %[[gep_dst]], i8* align 1 %[[gep_src]], i32 7, {{.*}}), !tbaa [[TAG_7]]
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %[[gep_dst]], i8* align 1 %[[gep_src]], i32 7, {{.*}})
 ; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds i8, i8* %dst, i64 30
 ; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds [10 x i8], [10 x i8]* %[[test4_a3]], i64 0, i64 0
-; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %[[gep_dst]], i8* align 1 %[[gep_src]], i32 10, {{.*}}), !tbaa [[TAG_7]]
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %[[gep_dst]], i8* align 1 %[[gep_src]], i32 10, {{.*}})
 ; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds i8, i8* %dst, i64 40
 ; CHECK-NEXT: %[[bitcast:.*]] = bitcast i8* %[[gep]] to i16*
-; CHECK-NEXT: store i16 %[[test4_r5]], i16* %[[bitcast]], {{.*}}, !tbaa [[TAG_7]]
+; CHECK-NEXT: store i16 %[[test4_r5]], i16* %[[bitcast]], {{.*}}
 ; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds i8, i8* %dst, i64 42
-; CHECK-NEXT: store i8 %[[test4_r6]], i8* %[[gep]], {{.*}}, !tbaa [[TAG_7]]
+; CHECK-NEXT: store i8 %[[test4_r6]], i8* %[[gep]], {{.*}}
 ; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds i8, i8* %dst, i64 43
 ; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds [7 x i8], [7 x i8]* %[[test4_a4]], i64 0, i64 0
-; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %[[gep_dst]], i8* align 1 %[[gep_src]], i32 7, {{.*}}), !tbaa [[TAG_7]]
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %[[gep_dst]], i8* align 1 %[[gep_src]], i32 7, {{.*}})
 ; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds i8, i8* %dst, i64 50
 ; CHECK-NEXT: %[[bitcast:.*]] = bitcast i8* %[[gep]] to i16*
-; CHECK-NEXT: store i16 %[[test4_r5]], i16* %[[bitcast]], {{.*}}, !tbaa [[TAG_7]]
+; CHECK-NEXT: store i16 %[[test4_r5]], i16* %[[bitcast]], {{.*}}
 ; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds i8, i8* %dst, i64 52
-; CHECK-NEXT: store i8 %[[test4_r6]], i8* %[[gep]], {{.*}}, !tbaa [[TAG_7]]
+; CHECK-NEXT: store i8 %[[test4_r6]], i8* %[[gep]], {{.*}}
 ; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds i8, i8* %dst, i64 53
 ; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds [7 x i8], [7 x i8]* %[[test4_a5]], i64 0, i64 0
-; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %[[gep_dst]], i8* align 1 %[[gep_src]], i32 7, {{.*}}), !tbaa [[TAG_7]]
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %[[gep_dst]], i8* align 1 %[[gep_src]], i32 7, {{.*}})
 ; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds i8, i8* %dst, i64 60
 ; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds [40 x i8], [40 x i8]* %[[test4_a6]], i64 0, i64 0
-; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %[[gep_dst]], i8* align 1 %[[gep_src]], i32 40, {{.*}}), !tbaa [[TAG_7]]
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %[[gep_dst]], i8* align 1 %[[gep_src]], i32 40, {{.*}})
 
   ret void
 }
@@ -856,7 +857,7 @@ define void @test18(i8* %src, i8* %dst, i32 %size) {
 ; CHECK:      %[[a:.*]] = alloca [34 x i8]
 ; CHECK:      %[[srcgep1:.*]] = getelementptr inbounds i8, i8* %src, i64 4
 ; CHECK-NEXT: %[[srccast1:.*]] = bitcast i8* %[[srcgep1]] to i32*
-; CHECK-NEXT: %[[srcload:.*]] = load i32, i32* %[[srccast1]], {{.*}}, !tbaa [[TAG_0]]
+; CHECK-NEXT: %[[srcload:.*]] = load i32, i32* %[[srccast1]], {{.*}}, !tbaa [[TAG_0_M4:!.*]]
 ; CHECK-NEXT: %[[agep1:.*]] = getelementptr inbounds [34 x i8], [34 x i8]* %[[a]], i64 0, i64 0
 ; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %[[agep1]], i8* %src, i32 %size, {{.*}}), !tbaa [[TAG_3]]
 ; CHECK-NEXT: %[[agep2:.*]] = getelementptr inbounds [34 x i8], [34 x i8]* %[[a]], i64 0, i64 0
@@ -865,7 +866,7 @@ define void @test18(i8* %src, i8* %dst, i32 %size) {
 ; CHECK-NEXT: store i32 42, i32* %[[dstcast1]], {{.*}}, !tbaa [[TAG_9]]
 ; CHECK-NEXT: %[[dstgep1:.*]] = getelementptr inbounds i8, i8* %dst, i64 4
 ; CHECK-NEXT: %[[dstcast2:.*]] = bitcast i8* %[[dstgep1]] to i32*
-; CHECK-NEXT: store i32 %[[srcload]], i32* %[[dstcast2]], {{.*}}, !tbaa [[TAG_9]]
+; CHECK-NEXT: store i32 %[[srcload]], i32* %[[dstcast2]], {{.*}}
 ; CHECK-NEXT: %[[agep3:.*]] = getelementptr inbounds [34 x i8], [34 x i8]* %[[a]], i64 0, i64 0
 ; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dst, i8* align 1 %[[agep3]], i32 %size, {{.*}}), !tbaa [[TAG_11]]
 ; CHECK-NEXT: ret void
@@ -1896,7 +1897,7 @@ bb7:
   ret void
 }
 
-!0 = !{!1, !1, i64 0, i64 1}
+!0 = !{!1, !1, i64 0, i64 200}
 !1 = !{!2, i64 1, !"type_0"}
 !2 = !{!"root"}
 !3 = !{!4, !4, i64 0, i64 1}
@@ -1958,14 +1959,39 @@ bb7:
 !59 = !{!60, !60, i64 0, i64 1}
 !60 = !{!2, i64 1, !"type_59"}
 
-; CHECK-DAG: [[TYPE_0:!.*]] = !{{{.*}}, !"type_0"}
-; CHECK-DAG: [[TAG_0]] = !{[[TYPE_0]], [[TYPE_0]], i64 0, i64 1}
+; CHECK-DAG: [[TAG_0]] = !{[[TYPE_0:!.*]], [[TYPE_0]], i64 0, i64 200}
+; CHECK-DAG: [[TYPE_0]] = !{{{.*}}, !"type_0"}
+
+; CHECK-DAG: [[TAG_0_M42]] = !{[[TYPE_0]], [[TYPE_0]], i64 0, i64 158}
+; CHECK-DAG: [[TAG_0_M43]] = !{[[TYPE_0]], [[TYPE_0]], i64 0, i64 157}
+; CHECK-DAG: [[TAG_0_M142]] = !{[[TYPE_0]], [[TYPE_0]], i64 0, i64 58}
+; CHECK-DAG: [[TAG_0_M158]] = !{[[TYPE_0]], [[TYPE_0]], i64 0, i64 42}
+
+; CHECK-DAG: [[TAG_59]] = !{[[TYPE_59:!.*]], [[TYPE_59]], i64 0, i64 1}
+; CHECK-DAG: [[TYPE_59]] = !{{{.*}}, !"type_59"}
+
+; CHECK-DAG: [[TAG_0_M20]] = !{[[TYPE_0]], [[TYPE_0]], i64 0, i64 180}
+; CHECK-DAG: [[TAG_0_M22]] = !{[[TYPE_0]], [[TYPE_0]], i64 0, i64 178}
+; CHECK-DAG: [[TAG_0_M23]] = !{[[TYPE_0]], [[TYPE_0]], i64 0, i64 177}
+; CHECK-DAG: [[TAG_0_M30]] = !{[[TYPE_0]], [[TYPE_0]], i64 0, i64 170}
+; CHECK-DAG: [[TAG_0_M40]] = !{[[TYPE_0]], [[TYPE_0]], i64 0, i64 160}
+; CHECK-DAG: [[TAG_0_M50]] = !{[[TYPE_0]], [[TYPE_0]], i64 0, i64 150}
+; CHECK-DAG: [[TAG_0_M52]] = !{[[TYPE_0]], [[TYPE_0]], i64 0, i64 148}
+; CHECK-DAG: [[TAG_0_M53]] = !{[[TYPE_0]], [[TYPE_0]], i64 0, i64 147}
+; CHECK-DAG: [[TAG_0_M60]] = !{[[TYPE_0]], [[TYPE_0]], i64 0, i64 140}
+
+; CHECK-DAG: [[TYPE_7:!.*]] = !{{{.*}}, !"type_7"}
+; CHECK-DAG: [[TAG_7]] = !{[[TYPE_7]], [[TYPE_7]], i64 0, i64 1}
+
 ; CHECK-DAG: [[TYPE_3:!.*]] = !{{{.*}}, !"type_3"}
 ; CHECK-DAG: [[TAG_3]] = !{[[TYPE_3]], [[TYPE_3]], i64 0, i64 1}
+
 ; CHECK-DAG: [[TYPE_5:!.*]] = !{{{.*}}, !"type_5"}
 ; CHECK-DAG: [[TAG_5]] = !{[[TYPE_5]], [[TYPE_5]], i64 0, i64 1}
-; CHECK-DAG: [[TYPE_7:!.*]] = !{{{.*}}, !"type_7"}
-; CHECK-DAG: [[TAG_7]] = !{[[TYPE_7]], [[TYPE_7]], i64 0, i64 1}
+
+; CHECK-DAG: [[TAG_0_M4]] = !{[[TYPE_0]], [[TYPE_0]], i64 0, i64 196}
+
+
 ; CHECK-DAG: [[TYPE_9:!.*]] = !{{{.*}}, !"type_9"}
 ; CHECK-DAG: [[TAG_9]] = !{[[TYPE_9]], [[TYPE_9]], i64 0, i64 1}
 ; CHECK-DAG: [[TYPE_11:!.*]] = !{{{.*}}, !"type_11"}
@@ -2016,5 +2042,3 @@ bb7:
 ; CHECK-DAG: [[TAG_55]] = !{[[TYPE_55]], [[TYPE_55]], i64 0, i64 1}
 ; CHECK-DAG: [[TYPE_57:!.*]] = !{{{.*}}, !"type_57"}
 ; CHECK-DAG: [[TAG_57]] = !{[[TYPE_57]], [[TYPE_57]], i64 0, i64 1}
-; CHECK-DAG: [[TYPE_59:!.*]] = !{{{.*}}, !"type_59"}
-; CHECK-DAG: [[TAG_59]] = !{[[TYPE_59]], [[TYPE_59]], i64 0, i64 1}
diff --git a/llvm/test/Transforms/SROA/tbaa-struct2.ll b/llvm/test/Transforms/SROA/tbaa-struct2.ll
new file mode 100644
index 000000000000..75f72f4e9963
--- /dev/null
+++ b/llvm/test/Transforms/SROA/tbaa-struct2.ll
@@ -0,0 +1,51 @@
+; RUN: opt -S -sroa %s | FileCheck %s
+
+; SROA should correctly offset `!tbaa.struct` metadata
+
+%struct.Wishart = type { double, i32 }
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* writeonly, i8* readonly, i64, i1 immarg)
+declare double @subcall(double %g, i32 %m)
+
+define double @bar(%struct.Wishart* %wishart) {
+  %tmp = alloca %struct.Wishart, align 8
+  %tmpaddr = bitcast %struct.Wishart* %tmp to i8*
+  %waddr = bitcast %struct.Wishart* %wishart to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %tmpaddr, i8* align 8 %waddr, i64 16, i1 false), !tbaa.struct !2
+  %gamma = getelementptr inbounds %struct.Wishart, %struct.Wishart* %tmp, i32 0, i32 0
+  %lg = load double, double* %gamma, align 8, !tbaa !4
+  %m = getelementptr inbounds %struct.Wishart, %struct.Wishart* %tmp, i32 0, i32 1
+  %lm = load i32, i32* %m, align 8, !tbaa !8
+  %call = call double @subcall(double %lg, i32 %lm)
+  ret double %call
+}
+
+!2 = !{i64 0, i64 8, !3, i64 8, i64 4, !7}
+!3 = !{!4, !4, i64 0}
+!4 = !{!"double", !5, i64 0}
+!5 = !{!"omnipotent char", !6, i64 0}
+!6 = !{!"Simple C++ TBAA"}
+!7 = !{!8, !8, i64 0}
+!8 = !{!"int", !5, i64 0}
+
+; CHECK: define double @bar(%struct.Wishart* %wishart) {
+; CHECK-NEXT:   %tmp.sroa.3 = alloca [4 x i8], align 4
+; CHECK-NEXT:   %tmp.sroa.0.0.waddr.sroa_idx = getelementptr inbounds %struct.Wishart, %struct.Wishart* %wishart, i64 0, i32 0
+; CHECK-NEXT:   %tmp.sroa.0.0.copyload = load double, double* %tmp.sroa.0.0.waddr.sroa_idx, align 8, !tbaa.struct !0
+; CHECK-NEXT:   %tmp.sroa.2.0.waddr.sroa_idx1 = getelementptr inbounds %struct.Wishart, %struct.Wishart* %wishart, i64 0, i32 1
+; CHECK-NEXT:   %tmp.sroa.2.0.copyload = load i32, i32* %tmp.sroa.2.0.waddr.sroa_idx1, align 8, !tbaa.struct !7
+; CHECK-NEXT:   %tmp.sroa.3.0.waddr.sroa_raw_cast = bitcast %struct.Wishart* %wishart to i8*
+; CHECK-NEXT:   %tmp.sroa.3.0.waddr.sroa_raw_idx = getelementptr inbounds i8, i8* %tmp.sroa.3.0.waddr.sroa_raw_cast, i64 12
+; CHECK-NEXT:   %tmp.sroa.3.0.tmpaddr.sroa_idx = getelementptr inbounds [4 x i8], [4 x i8]* %tmp.sroa.3, i64 0, i64 0
+; CHECK-NEXT:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %tmp.sroa.3.0.tmpaddr.sroa_idx, i8* align 4 %tmp.sroa.3.0.waddr.sroa_raw_idx, i64 4, i1 false), !tbaa.struct !8
+; CHECK-NEXT:   %call = call double @subcall(double %tmp.sroa.0.0.copyload, i32 %tmp.sroa.2.0.copyload)
+; CHECK-NEXT:   ret double %call
+; CHECK-NEXT: }
+
+; CHECK: !0 = !{i64 0, i64 8, !1, i64 8, i64 4, !5}
+; CHECK: !1 = !{!2, !2, i64 0}
+; CHECK: !2 = !{!"double", !{{[0-9]+}}, i64 0}
+
+; CHECK: !5 = !{!6, !6, i64 0}
+; CHECK: !6 = !{!"int", !{{[0-9]+}}, i64 0}
+; CHECK: !7 = !{i64 0, i64 4, !5}
+; CHECK: !8 = !{}
\ No newline at end of file

From a7629a2244a325b908ddbd4336aef25a7049bda9 Mon Sep 17 00:00:00 2001
From: Yang Fan <nullptr.cpp@gmail.com>
Date: Wed, 3 Feb 2021 11:04:58 +0800
Subject: [PATCH 136/318] [CSSPGO] Fix MSVC initializing truncation warning
 (NFC)

MSVC warning:
```
\llvm-project\llvm\include\llvm\Transforms\IPO\SampleProfileProbe.h(65): warning C4305: 'initializing': truncation from 'double' to 'const float'
```
---
 llvm/include/llvm/Transforms/IPO/SampleProfileProbe.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/include/llvm/Transforms/IPO/SampleProfileProbe.h b/llvm/include/llvm/Transforms/IPO/SampleProfileProbe.h
index cab893b50d19..0fd79d8ff7f3 100644
--- a/llvm/include/llvm/Transforms/IPO/SampleProfileProbe.h
+++ b/llvm/include/llvm/Transforms/IPO/SampleProfileProbe.h
@@ -62,7 +62,7 @@ class PseudoProbeVerifier {
 
 private:
   // Allow a little bias due the rounding to integral factors.
-  constexpr static float DistributionFactorVariance = 0.02;
+  constexpr static float DistributionFactorVariance = 0.02f;
   // Distribution factors from last pass.
   FuncProbeFactorMap FunctionProbeFactors;
 

From 78b35e278a9f62c2a6cfe3c974155a7e9bb60361 Mon Sep 17 00:00:00 2001
From: wlei <wlei@fb.com>
Date: Mon, 11 Jan 2021 09:08:39 -0800
Subject: [PATCH 137/318] [CSSPGO][llvm-profgen] Pseudo probe based CS profile
 generation

This change implements profile generation infra for pseudo probe in llvm-profgen. During virtual unwinding, the raw profile is extracted into range counter and branch counter and aggregated to sample counter map indexed by the call stack context. This change introduces the last step and produces the eventual profile. Specifically, the body of function sample is recorded by going through each probe among the range and callsite target sample is recorded by extracting the callsite probe from branch's source.

Please refer https://groups.google.com/g/llvm-dev/c/1p1rdYbL93s and https://reviews.llvm.org/D89707 for more context about CSSPGO and llvm-profgen.

**Implementation**

- Extended `PseudoProbeProfileGenerator` for pseudo probe based profile generation.
- `populateBodySamplesWithProbes` reading range counter is responsible for recording function body samples and inferring caller's body samples.
- `populateBoundarySamplesWithProbes` reading branch counter is responsible for recording call site target samples.
- Each sample is recorded with its calling context(named `ContextId`). Remind that the probe based context key doesn't include the leaf frame probe info, so the `ContextId` string is created from two part: one from the probe stack strings' concatenation and other one from the leaf frame probe.
- Added regression test

Test Plan:

ninja & ninja check-llvm

Differential Revision: https://reviews.llvm.org/D92998
---
 .../llvm-profgen/inline-cs-pseudoprobe.test   |  17 ++
 .../llvm-profgen/noinline-cs-pseudoprobe.test |  16 ++
 llvm/tools/llvm-profgen/PerfReader.cpp        |   4 -
 llvm/tools/llvm-profgen/ProfileGenerator.cpp  | 198 +++++++++++++++++-
 llvm/tools/llvm-profgen/ProfileGenerator.h    |  41 +++-
 llvm/tools/llvm-profgen/ProfiledBinary.h      |  11 +-
 llvm/tools/llvm-profgen/PseudoProbe.cpp       |  36 ++--
 llvm/tools/llvm-profgen/PseudoProbe.h         |  13 +-
 8 files changed, 307 insertions(+), 29 deletions(-)

diff --git a/llvm/test/tools/llvm-profgen/inline-cs-pseudoprobe.test b/llvm/test/tools/llvm-profgen/inline-cs-pseudoprobe.test
index 109f2f63e86d..19928322a66d 100644
--- a/llvm/test/tools/llvm-profgen/inline-cs-pseudoprobe.test
+++ b/llvm/test/tools/llvm-profgen/inline-cs-pseudoprobe.test
@@ -1,4 +1,21 @@
 ; RUN: llvm-profgen --perfscript=%S/Inputs/inline-cs-pseudoprobe.perfscript --binary=%S/Inputs/inline-cs-pseudoprobe.perfbin --output=%t --show-unwinder-output | FileCheck %s --check-prefix=CHECK-UNWINDER
+; RUN: FileCheck %s --input-file %t
+
+; CHECK:     [main:2 @ foo]:74:0
+; CHECK-NEXT: 2: 15
+; CHECK-NEXT: 3: 15
+; CHECK-NEXT: 4: 14
+; CHECK-NEXT: 5: 1
+; CHECK-NEXT: 6: 15
+; CHECK-NEXT: 8: 14 bar:14
+; CHECK-NEXT: !CFGChecksum: 138950591924
+; CHECK-NEXT:[main:2 @ foo:8 @ bar]:56:14
+; CHECK-NEXT: 1: 14
+; CHECK-NEXT: 2: 14
+; CHECK-NEXT: 3: 14
+; CHECK-NEXT: 4: 14
+; CHECK-NEXT: !CFGChecksum: 72617220756
+
 
 ; CHECK-UNWINDER:      Binary(inline-cs-pseudoprobe.perfbin)'s Range Counter:
 ; CHECK-UNWINDER-EMPTY:
diff --git a/llvm/test/tools/llvm-profgen/noinline-cs-pseudoprobe.test b/llvm/test/tools/llvm-profgen/noinline-cs-pseudoprobe.test
index 2ac3f06587d9..0491a62ff69b 100644
--- a/llvm/test/tools/llvm-profgen/noinline-cs-pseudoprobe.test
+++ b/llvm/test/tools/llvm-profgen/noinline-cs-pseudoprobe.test
@@ -1,4 +1,20 @@
 ; RUN: llvm-profgen --perfscript=%S/Inputs/noinline-cs-pseudoprobe.perfscript --binary=%S/Inputs/noinline-cs-pseudoprobe.perfbin --output=%t --show-unwinder-output | FileCheck %s --check-prefix=CHECK-UNWINDER
+; RUN: FileCheck %s --input-file %t
+
+; CHECK:     [main:2 @ foo]:75:0
+; CHECK-NEXT: 2: 15
+; CHECK-NEXT: 3: 15
+; CHECK-NEXT: 4: 15
+; CHECK-NEXT: 6: 15
+; CHECK-NEXT: 8: 15 bar:15
+; CHECK-NEXT: !CFGChecksum: 138950591924
+; CHECK-NEXT:[main:2 @ foo:8 @ bar]:60:15
+; CHECK-NEXT: 1: 15
+; CHECK-NEXT: 2: 15
+; CHECK-NEXT: 3: 15
+; CHECK-NEXT: 4: 15
+; CHECK-NEXT: !CFGChecksum: 72617220756
+
 
 ; CHECK-UNWINDER:      Binary(noinline-cs-pseudoprobe.perfbin)'s Range Counter:
 ; CHECK-UNWINDER-NEXT: main:2
diff --git a/llvm/tools/llvm-profgen/PerfReader.cpp b/llvm/tools/llvm-profgen/PerfReader.cpp
index d08c15808cf4..64a502be59a9 100644
--- a/llvm/tools/llvm-profgen/PerfReader.cpp
+++ b/llvm/tools/llvm-profgen/PerfReader.cpp
@@ -567,11 +567,7 @@ void PerfReader::checkAndSetPerfType(
   }
 
   if (HasHybridPerf) {
-    // Set up ProfileIsCS to enable context-sensitive functionalities
-    // in SampleProf
-    FunctionSamples::ProfileIsCS = true;
     PerfType = PERF_LBR_STACK;
-
   } else {
     // TODO: Support other type of perf script
     PerfType = PERF_INVILID;
diff --git a/llvm/tools/llvm-profgen/ProfileGenerator.cpp b/llvm/tools/llvm-profgen/ProfileGenerator.cpp
index 7624fd3f2808..ce228a781538 100644
--- a/llvm/tools/llvm-profgen/ProfileGenerator.cpp
+++ b/llvm/tools/llvm-profgen/ProfileGenerator.cpp
@@ -67,7 +67,7 @@ void ProfileGenerator::findDisjointRanges(RangeSample &DisjointRanges,
   /*
   Regions may overlap with each other. Using the boundary info, find all
   disjoint ranges and their sample count. BoundaryPoint contains the count
-  mutiple samples begin/end at this points.
+  multiple samples begin/end at this points.
 
   |<--100-->|           Sample1
   |<------200------>|   Sample2
@@ -264,9 +264,12 @@ static FrameLocation getCallerContext(StringRef CalleeContext,
   StringRef CallerContext = CalleeContext.rsplit(" @ ").first;
   CallerNameWithContext = CallerContext.rsplit(':').first;
   auto ContextSplit = CallerContext.rsplit(" @ ");
+  StringRef CallerFrameStr = ContextSplit.second.size() == 0
+                                 ? ContextSplit.first
+                                 : ContextSplit.second;
   FrameLocation LeafFrameLoc = {"", {0, 0}};
   StringRef Funcname;
-  SampleContext::decodeContextString(ContextSplit.second, Funcname,
+  SampleContext::decodeContextString(CallerFrameStr, Funcname,
                                      LeafFrameLoc.second);
   LeafFrameLoc.first = Funcname.str();
   return LeafFrameLoc;
@@ -316,5 +319,196 @@ void CSProfileGenerator::populateInferredFunctionSamples() {
   }
 }
 
+// Helper function to extract context prefix
+// PrefixContextId is the context id string except for the leaf probe's
+// context, the final ContextId will be:
+// ContextId =  PrefixContextId + LeafContextId;
+// Remind that the string in ContextStrStack is in callee-caller order
+// So process the string vector reversely
+static std::string
+extractPrefixContextId(const SmallVector<const PseudoProbe *, 16> &Probes,
+                       ProfiledBinary *Binary) {
+  SmallVector<std::string, 16> ContextStrStack;
+  for (const auto *P : Probes) {
+    Binary->getInlineContextForProbe(P, ContextStrStack, true);
+  }
+  std::ostringstream OContextStr;
+  for (auto &CxtStr : ContextStrStack) {
+    if (OContextStr.str().size())
+      OContextStr << " @ ";
+    OContextStr << CxtStr;
+  }
+  return OContextStr.str();
+}
+
+void PseudoProbeCSProfileGenerator::generateProfile() {
+  // Enable CS and pseudo probe functionalities in SampleProf
+  FunctionSamples::ProfileIsCS = true;
+  FunctionSamples::ProfileIsProbeBased = true;
+  for (const auto &BI : BinarySampleCounters) {
+    ProfiledBinary *Binary = BI.first;
+    for (const auto &CI : BI.second) {
+      const ProbeBasedCtxKey *CtxKey =
+          dyn_cast<ProbeBasedCtxKey>(CI.first.getPtr());
+      std::string PrefixContextId =
+          extractPrefixContextId(CtxKey->Probes, Binary);
+      // Fill in function body samples from probes, also infer caller's samples
+      // from callee's probe
+      populateBodySamplesWithProbes(CI.second.RangeCounter, PrefixContextId,
+                                    Binary);
+      // Fill in boundary samples for a call probe
+      populateBoundarySamplesWithProbes(CI.second.BranchCounter,
+                                        PrefixContextId, Binary);
+    }
+  }
+}
+
+void PseudoProbeCSProfileGenerator::extractProbesFromRange(
+    const RangeSample &RangeCounter, ProbeCounterMap &ProbeCounter,
+    ProfiledBinary *Binary) {
+  RangeSample Ranges;
+  findDisjointRanges(Ranges, RangeCounter);
+  for (const auto &Range : Ranges) {
+    uint64_t RangeBegin = Binary->offsetToVirtualAddr(Range.first.first);
+    uint64_t RangeEnd = Binary->offsetToVirtualAddr(Range.first.second);
+    uint64_t Count = Range.second;
+    // Disjoint ranges have introduce zero-filled gap that
+    // doesn't belong to current context, filter them out.
+    if (Count == 0)
+      continue;
+
+    InstructionPointer IP(Binary, RangeBegin, true);
+
+    // Disjoint ranges may have range in the middle of two instr,
+    // e.g. If Instr1 at Addr1, and Instr2 at Addr2, disjoint range
+    // can be Addr1+1 to Addr2-1. We should ignore such range.
+    if (IP.Address > RangeEnd)
+      continue;
+
+    while (IP.Address <= RangeEnd) {
+      const AddressProbesMap &Address2ProbesMap =
+          Binary->getAddress2ProbesMap();
+      auto It = Address2ProbesMap.find(IP.Address);
+      if (It != Address2ProbesMap.end()) {
+        for (const auto &Probe : It->second) {
+          if (!Probe.isBlock())
+            continue;
+          ProbeCounter[&Probe] += Count;
+        }
+      }
+
+      IP.advance();
+    }
+  }
+}
+
+void PseudoProbeCSProfileGenerator::populateBodySamplesWithProbes(
+    const RangeSample &RangeCounter, StringRef PrefixContextId,
+    ProfiledBinary *Binary) {
+  ProbeCounterMap ProbeCounter;
+  // Extract the top frame probes by looking up each address among the range in
+  // the Address2ProbeMap
+  extractProbesFromRange(RangeCounter, ProbeCounter, Binary);
+  for (auto PI : ProbeCounter) {
+    const PseudoProbe *Probe = PI.first;
+    uint64_t Count = PI.second;
+    FunctionSamples &FunctionProfile =
+        getFunctionProfileForLeafProbe(PrefixContextId, Probe, Binary);
+
+    FunctionProfile.addBodySamples(Probe->Index, 0, Count);
+    FunctionProfile.addTotalSamples(Count);
+    if (Probe->isEntry()) {
+      FunctionProfile.addHeadSamples(Count);
+      // Look up for the caller's function profile
+      const auto *InlinerDesc = Binary->getInlinerDescForProbe(Probe);
+      if (InlinerDesc != nullptr) {
+        // Since the context id will be compressed, we have to use callee's
+        // context id to infer caller's context id to ensure they share the
+        // same context prefix.
+        StringRef CalleeContextId =
+            FunctionProfile.getContext().getNameWithContext(true);
+        StringRef CallerContextId;
+        FrameLocation &&CallerLeafFrameLoc =
+            getCallerContext(CalleeContextId, CallerContextId);
+        uint64_t CallerIndex = CallerLeafFrameLoc.second.LineOffset;
+        assert(CallerIndex &&
+               "Inferred caller's location index shouldn't be zero!");
+        FunctionSamples &CallerProfile =
+            getFunctionProfileForContext(CallerContextId);
+        CallerProfile.setFunctionHash(InlinerDesc->FuncHash);
+        CallerProfile.addBodySamples(CallerIndex, 0, Count);
+        CallerProfile.addTotalSamples(Count);
+        CallerProfile.addCalledTargetSamples(CallerIndex, 0,
+                                             FunctionProfile.getName(), Count);
+      }
+    }
+  }
+}
+
+void PseudoProbeCSProfileGenerator::populateBoundarySamplesWithProbes(
+    const BranchSample &BranchCounter, StringRef PrefixContextId,
+    ProfiledBinary *Binary) {
+  for (auto BI : BranchCounter) {
+    uint64_t SourceOffset = BI.first.first;
+    uint64_t TargetOffset = BI.first.second;
+    uint64_t Count = BI.second;
+    uint64_t SourceAddress = Binary->offsetToVirtualAddr(SourceOffset);
+    const PseudoProbe *CallProbe = Binary->getCallProbeForAddr(SourceAddress);
+    if (CallProbe == nullptr)
+      continue;
+    FunctionSamples &FunctionProfile =
+        getFunctionProfileForLeafProbe(PrefixContextId, CallProbe, Binary);
+    FunctionProfile.addBodySamples(CallProbe->Index, 0, Count);
+    FunctionProfile.addTotalSamples(Count);
+    StringRef CalleeName = FunctionSamples::getCanonicalFnName(
+        Binary->getFuncFromStartOffset(TargetOffset));
+    if (CalleeName.size() == 0)
+      continue;
+    FunctionProfile.addCalledTargetSamples(CallProbe->Index, 0, CalleeName,
+                                           Count);
+  }
+}
+
+FunctionSamples &PseudoProbeCSProfileGenerator::getFunctionProfileForLeafProbe(
+    StringRef PrefixContextId, SmallVector<std::string, 16> &LeafInlinedContext,
+    const PseudoProbeFuncDesc *LeafFuncDesc) {
+  assert(LeafInlinedContext.size() &&
+         "Profile context must have the leaf frame");
+  std::ostringstream OContextStr;
+  OContextStr << PrefixContextId.str();
+
+  for (uint32_t I = 0; I < LeafInlinedContext.size() - 1; I++) {
+    if (OContextStr.str().size())
+      OContextStr << " @ ";
+    OContextStr << LeafInlinedContext[I];
+  }
+  // For leaf inlined context with the top frame, we should strip off the top
+  // frame's probe id, like:
+  // Inlined stack: [foo:1, bar:2], the ContextId will be "foo:1 @ bar"
+  if (OContextStr.str().size())
+    OContextStr << " @ ";
+  StringRef LeafLoc = LeafInlinedContext.back();
+  OContextStr << LeafLoc.split(":").first.str();
+
+  FunctionSamples &FunctionProile =
+      getFunctionProfileForContext(OContextStr.str());
+  FunctionProile.setFunctionHash(LeafFuncDesc->FuncHash);
+  return FunctionProile;
+}
+
+FunctionSamples &PseudoProbeCSProfileGenerator::getFunctionProfileForLeafProbe(
+    StringRef PrefixContextId, const PseudoProbe *LeafProbe,
+    ProfiledBinary *Binary) {
+  SmallVector<std::string, 16> LeafInlinedContext;
+  Binary->getInlineContextForProbe(LeafProbe, LeafInlinedContext);
+  // Note that the context from probe doesn't include leaf frame,
+  // hence we need to retrieve and append the leaf frame.
+  const auto *FuncDesc = Binary->getFuncDescForGUID(LeafProbe->GUID);
+  LeafInlinedContext.emplace_back(FuncDesc->FuncName + ":" +
+                                  Twine(LeafProbe->Index).str());
+  return getFunctionProfileForLeafProbe(PrefixContextId, LeafInlinedContext,
+                                        FuncDesc);
+}
+
 } // end namespace sampleprof
 } // end namespace llvm
diff --git a/llvm/tools/llvm-profgen/ProfileGenerator.h b/llvm/tools/llvm-profgen/ProfileGenerator.h
index 8040b90ea61a..29f528026a0c 100644
--- a/llvm/tools/llvm-profgen/ProfileGenerator.h
+++ b/llvm/tools/llvm-profgen/ProfileGenerator.h
@@ -25,7 +25,7 @@ class ProfileGenerator {
   ProfileGenerator(){};
   virtual ~ProfileGenerator() = default;
   static std::unique_ptr<ProfileGenerator>
-  create(const BinarySampleCounterMap &SampleCounters,
+  create(const BinarySampleCounterMap &BinarySampleCounters,
          enum PerfScriptType SampleType);
   virtual void generateProfile() = 0;
 
@@ -50,7 +50,6 @@ class ProfileGenerator {
   */
   void findDisjointRanges(RangeSample &DisjointRanges,
                           const RangeSample &Ranges);
-
   // Used by SampleProfileWriter
   StringMap<FunctionSamples> ProfileMap;
 };
@@ -65,6 +64,8 @@ class CSProfileGenerator : public ProfileGenerator {
 
 public:
   void generateProfile() override {
+    // Enable context-sensitive functionalities in SampleProf
+    FunctionSamples::ProfileIsCS = true;
     for (const auto &BI : BinarySampleCounters) {
       ProfiledBinary *Binary = BI.first;
       for (const auto &CI : BI.second) {
@@ -90,14 +91,16 @@ class CSProfileGenerator : public ProfileGenerator {
     populateInferredFunctionSamples();
   }
 
+protected:
+  // Lookup or create FunctionSamples for the context
+  FunctionSamples &getFunctionProfileForContext(StringRef ContextId);
+
 private:
   // Helper function for updating body sample for a leaf location in
   // FunctionProfile
   void updateBodySamplesforFunctionProfile(FunctionSamples &FunctionProfile,
                                            const FrameLocation &LeafLoc,
                                            uint64_t Count);
-  // Lookup or create FunctionSamples for the context
-  FunctionSamples &getFunctionProfileForContext(StringRef ContextId);
   void populateFunctionBodySamples(FunctionSamples &FunctionProfile,
                                    const RangeSample &RangeCounters,
                                    ProfiledBinary *Binary);
@@ -108,14 +111,38 @@ class CSProfileGenerator : public ProfileGenerator {
   void populateInferredFunctionSamples();
 };
 
+using ProbeCounterMap = std::unordered_map<const PseudoProbe *, uint64_t>;
+
 class PseudoProbeCSProfileGenerator : public CSProfileGenerator {
 
 public:
   PseudoProbeCSProfileGenerator(const BinarySampleCounterMap &Counters)
       : CSProfileGenerator(Counters) {}
-  void generateProfile() override {
-    // TODO
-  }
+  void generateProfile() override;
+
+private:
+  // Go through each address from range to extract the top frame probe by
+  // looking up in the Address2ProbeMap
+  void extractProbesFromRange(const RangeSample &RangeCounter,
+                              ProbeCounterMap &ProbeCounter,
+                              ProfiledBinary *Binary);
+  // Fill in function body samples from probes
+  void populateBodySamplesWithProbes(const RangeSample &RangeCounter,
+                                     StringRef PrefixContextId,
+                                     ProfiledBinary *Binary);
+  // Fill in boundary samples for a call probe
+  void populateBoundarySamplesWithProbes(const BranchSample &BranchCounter,
+                                         StringRef PrefixContextId,
+                                         ProfiledBinary *Binary);
+  // Helper function to get FunctionSamples for the leaf inlined context
+  FunctionSamples &getFunctionProfileForLeafProbe(
+      StringRef PrefixContextId,
+      SmallVector<std::string, 16> &LeafInlinedContext,
+      const PseudoProbeFuncDesc *LeafFuncDesc);
+  // Helper function to get FunctionSamples for the leaf probe
+  FunctionSamples &getFunctionProfileForLeafProbe(StringRef PrefixContextId,
+                                                  const PseudoProbe *LeafProbe,
+                                                  ProfiledBinary *Binary);
 };
 
 } // end namespace sampleprof
diff --git a/llvm/tools/llvm-profgen/ProfiledBinary.h b/llvm/tools/llvm-profgen/ProfiledBinary.h
index bb028da2b484..40aee39677e5 100644
--- a/llvm/tools/llvm-profgen/ProfiledBinary.h
+++ b/llvm/tools/llvm-profgen/ProfiledBinary.h
@@ -244,10 +244,19 @@ class ProfiledBinary {
   void
   getInlineContextForProbe(const PseudoProbe *Probe,
                            SmallVector<std::string, 16> &InlineContextStack,
-                           bool IncludeLeaf) const {
+                           bool IncludeLeaf = false) const {
     return ProbeDecoder.getInlineContextForProbe(Probe, InlineContextStack,
                                                  IncludeLeaf);
   }
+  const AddressProbesMap &getAddress2ProbesMap() const {
+    return ProbeDecoder.getAddress2ProbesMap();
+  }
+  const PseudoProbeFuncDesc *getFuncDescForGUID(uint64_t GUID) {
+    return ProbeDecoder.getFuncDescForGUID(GUID);
+  }
+  const PseudoProbeFuncDesc *getInlinerDescForProbe(const PseudoProbe *Probe) {
+    return ProbeDecoder.getInlinerDescForProbe(Probe);
+  }
 };
 
 } // end namespace sampleprof
diff --git a/llvm/tools/llvm-profgen/PseudoProbe.cpp b/llvm/tools/llvm-profgen/PseudoProbe.cpp
index 0b53f1aa02e7..700984e2184a 100644
--- a/llvm/tools/llvm-profgen/PseudoProbe.cpp
+++ b/llvm/tools/llvm-profgen/PseudoProbe.cpp
@@ -41,7 +41,7 @@ void PseudoProbe::getInlineContext(SmallVector<std::string, 16> &ContextStack,
   PseudoProbeInlineTree *Cur = InlineTree;
   // It will add the string of each node's inline site during iteration.
   // Note that it won't include the probe's belonging function(leaf location)
-  while (!Cur->hasInlineSite()) {
+  while (Cur->hasInlineSite()) {
     std::string ContextStr;
     if (ShowName) {
       StringRef FuncName =
@@ -312,22 +312,32 @@ PseudoProbeDecoder::getCallProbeForAddr(uint64_t Address) const {
   return CallProbe;
 }
 
+const PseudoProbeFuncDesc *
+PseudoProbeDecoder::getFuncDescForGUID(uint64_t GUID) const {
+  auto It = GUID2FuncDescMap.find(GUID);
+  assert(It != GUID2FuncDescMap.end() && "Function descriptor doesn't exist");
+  return &It->second;
+}
+
 void PseudoProbeDecoder::getInlineContextForProbe(
     const PseudoProbe *Probe, SmallVector<std::string, 16> &InlineContextStack,
     bool IncludeLeaf) const {
-  if (IncludeLeaf) {
-    // Note that the context from probe doesn't include leaf frame,
-    // hence we need to retrieve and prepend leaf if requested.
-    auto It = GUID2FuncDescMap.find(Probe->GUID);
-    assert(It != GUID2FuncDescMap.end() &&
-           "Should have function descriptor for a valid GUID");
-    StringRef FuncName = It->second.FuncName;
-    // InlineContextStack is in callee-caller order, so push leaf in the front
-    InlineContextStack.emplace_back(FuncName.str() + ":" +
-                                    Twine(Probe->Index).str());
-  }
-
   Probe->getInlineContext(InlineContextStack, GUID2FuncDescMap, true);
+  if (!IncludeLeaf)
+    return;
+  // Note that the context from probe doesn't include leaf frame,
+  // hence we need to retrieve and prepend leaf if requested.
+  const auto *FuncDesc = getFuncDescForGUID(Probe->GUID);
+  InlineContextStack.emplace_back(FuncDesc->FuncName + ":" +
+                                  Twine(Probe->Index).str());
+}
+
+const PseudoProbeFuncDesc *
+PseudoProbeDecoder::getInlinerDescForProbe(const PseudoProbe *Probe) const {
+  PseudoProbeInlineTree *InlinerNode = Probe->InlineTree;
+  if (!InlinerNode->hasInlineSite())
+    return nullptr;
+  return getFuncDescForGUID(std::get<0>(InlinerNode->ISite));
 }
 
 } // end namespace sampleprof
diff --git a/llvm/tools/llvm-profgen/PseudoProbe.h b/llvm/tools/llvm-profgen/PseudoProbe.h
index 25769cad8805..a6647eb39c7a 100644
--- a/llvm/tools/llvm-profgen/PseudoProbe.h
+++ b/llvm/tools/llvm-profgen/PseudoProbe.h
@@ -73,7 +73,7 @@ class PseudoProbeInlineTree {
 
   void addProbes(PseudoProbe *Probe) { ProbeVector.push_back(Probe); }
   // Return false if it's a dummy inline site
-  bool hasInlineSite() const { return !std::get<0>(ISite); }
+  bool hasInlineSite() const { return std::get<0>(ISite) != 0; }
 };
 
 // Function descriptor decoded from .pseudo_probe_desc section
@@ -203,17 +203,26 @@ class PseudoProbeDecoder {
   // Look up the probe of a call for the input address
   const PseudoProbe *getCallProbeForAddr(uint64_t Address) const;
 
+  const PseudoProbeFuncDesc *getFuncDescForGUID(uint64_t GUID) const;
+
   // Helper function to populate one probe's inline stack into
   // \p InlineContextStack.
   // Current leaf location info will be added if IncludeLeaf is true
   // Example:
   //  Current probe(bar:3) inlined at foo:2 then inlined at main:1
   //  IncludeLeaf = true,  Output: [main:1, foo:2, bar:3]
-  //  IncludeLeaf = false, OUtput: [main:1, foo:2]
+  //  IncludeLeaf = false, Output: [main:1, foo:2]
   void
   getInlineContextForProbe(const PseudoProbe *Probe,
                            SmallVector<std::string, 16> &InlineContextStack,
                            bool IncludeLeaf) const;
+
+  const AddressProbesMap &getAddress2ProbesMap() const {
+    return Address2ProbesMap;
+  }
+
+  const PseudoProbeFuncDesc *
+  getInlinerDescForProbe(const PseudoProbe *Probe) const;
 };
 
 } // end namespace sampleprof

From 6209b0756d5df805f6279d3dadc8d2ba8648c3eb Mon Sep 17 00:00:00 2001
From: wlei <wlei@fb.com>
Date: Fri, 29 Jan 2021 15:00:08 -0800
Subject: [PATCH 138/318] [CSSPGO][llvm-profgen] Compress recursive cycles in
 calling context
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This change compresses the context string by removing cycles due to recursive function for CS profile generation. Removing recursion cycles is a way to normalize the calling context which will be better for the sample aggregation and also make the context promoting deterministic.
Specifically for implementation, we recognize adjacent repeated frames as cycles and deduplicated them through multiple round of iteration.
For example:
Considering a input context string stack:
[“a”, “a”, “b”, “c”, “a”, “b”, “c”, “b”, “c”, “d”]
For first iteration,, it removed all adjacent repeated frames of size 1:
[“a”, “b”, “c”, “a”, “b”, “c”, “b”, “c”, “d”]
For second iteration, it removed all adjacent repeated frames of size 2:
[“a”, “b”, “c”, “a”, “b”, “c”, “d”]
So in the end, we get compressed output:
[“a”, “b”, “c”, “d”]

Compression will be called in two place: one for sample's context key right after unwinding, one is for the eventual context string id in the ProfileGenerator.
Added a switch `compress-recursion` to control the size of duplicated frames, default -1 means no size limit.
Added unit tests and regression test for this.

Differential Revision: https://reviews.llvm.org/D93556
---
 .../recursion-compression-noprobe.perfbin     | Bin 0 -> 15352 bytes
 .../recursion-compression-noprobe.perfscript  |   4 +
 .../recursion-compression-pseudoprobe.perfbin | Bin 0 -> 13584 bytes
 ...cursion-compression-pseudoprobe.perfscript |  23 +++
 .../recursion-compression-noprobe.test        |  65 +++++++
 .../recursion-compression-pseudoprobe.test    | 169 ++++++++++++++++++
 llvm/tools/llvm-profgen/PerfReader.cpp        |   3 +
 llvm/tools/llvm-profgen/ProfileGenerator.cpp  |  88 ++++-----
 llvm/tools/llvm-profgen/ProfileGenerator.h    | 138 ++++++++++++--
 llvm/tools/llvm-profgen/ProfiledBinary.cpp    |  20 ++-
 llvm/tools/llvm-profgen/ProfiledBinary.h      |   2 +-
 llvm/tools/llvm-profgen/PseudoProbe.cpp       |   4 +-
 llvm/tools/llvm-profgen/PseudoProbe.h         |   4 +-
 llvm/unittests/tools/CMakeLists.txt           |   2 +-
 .../tools/llvm-profgen/CMakeLists.txt         |  11 ++
 .../llvm-profgen/ContextCompressionTest.cpp   |  36 ++++
 16 files changed, 498 insertions(+), 71 deletions(-)
 create mode 100755 llvm/test/tools/llvm-profgen/Inputs/recursion-compression-noprobe.perfbin
 create mode 100644 llvm/test/tools/llvm-profgen/Inputs/recursion-compression-noprobe.perfscript
 create mode 100755 llvm/test/tools/llvm-profgen/Inputs/recursion-compression-pseudoprobe.perfbin
 create mode 100644 llvm/test/tools/llvm-profgen/Inputs/recursion-compression-pseudoprobe.perfscript
 create mode 100644 llvm/test/tools/llvm-profgen/recursion-compression-noprobe.test
 create mode 100644 llvm/test/tools/llvm-profgen/recursion-compression-pseudoprobe.test
 create mode 100644 llvm/unittests/tools/llvm-profgen/CMakeLists.txt
 create mode 100644 llvm/unittests/tools/llvm-profgen/ContextCompressionTest.cpp

diff --git a/llvm/test/tools/llvm-profgen/Inputs/recursion-compression-noprobe.perfbin b/llvm/test/tools/llvm-profgen/Inputs/recursion-compression-noprobe.perfbin
new file mode 100755
index 0000000000000000000000000000000000000000..e4e698e910997a28befe523be7293cbe5193fcb9
GIT binary patch
literal 15352
zcmeHOeQ+Dcb>9OB5F{lK)JH^GwFOzyL?0nYP?SX3QUp*U2|CnAB+8;{OD7OKkg!02
zgAZBqxRq+k?!d8Zwu#eBXEJuD<Br_%A8w~fJX3ojRf;4xp4Lv?X*`ZwS+P^9QpR%J
zD6}05eQ)nQ;O=m6Jnl4cXS~D1+x@+defxHA?{;r_v}d5#p(uirLws3aX@@H$A^lf~
znGRVbtV=YAkXR~~2sa=#aD^lXsh(px=Fk~TJtXG@ih3uBf6@_x5|%?q#P7-9GGz~u
zx`?Nf5S4U#pk6OB6~Zq8)I(pd&6Y)=2V{n6C;8Dul=ZNPdcrHkfgaE^L@$zKOu2p7
zHoPv7y$g;Il&~B^LSB`rRd1T~rX3*^7fk)KRMx{fFztFfH}Yxf+d+C~$$_(;khI5i
zP?m&X%JqE}^kiI!|C@-91a?w=)%w9l?Pf~h6>Z7H@Y>F{<Vb5Wk;)%#J>0dnb!}%L
zn-1s$6aB(EaBTE#+buxv6k^SAVPZboLp~-PgH)Z&^H@e3jH4VT_5*b=VVN%pe8Qi0
zkn`KfHUKuq21m>e9{}8MBY#&FTtNI{7_PNqB9lnv;zCssQnR^eCZ~=^6DiR*(BHjT
z?Fe)P)(EwKaH~3^Wwg;mHm79<w{A|RQ`%s3IH`f==y*CsnyM~pD=7VQm}03w)G%&5
z9`ZLO>M+_%sVx^956B$$1<G-nHsIU`$h&61v5z^;7;xi!(d;`QMQY*K*vIcR-ts|Y
z@~S8DHP>tPei3=_RL)U+IWl?HbB0YyT3quZU>BFY3crnuLXgAq81gP(EEbDsC;0RJ
z!YlhDg-emix329N>_2~M3ht`N`O|($&hMxNz4`Zm-hWS_{F!*5anW&UekyWe<DUQ{
za>Dg6lJzsW`QXx8%!5mH#fy!Lj-likq97lY@@sw{$%-41(>I)v!nMfh=}i&kT;!D-
zxp`n<HW}~~FUA`eZ7J(x{l_-809)ke?v51Rp1u!San4l>h_dg@6x0LKWd~9I%ERSy
zVWynNcofdRGCf(WKl-c5^To@J;HZ%emL`ji`xlq7snGTFN`5W~pXgI2XB<c0`9Ah~
zVdf0#KLM_kWu+&(!*}lb8|4KQ`g`oZD+Xg&e^@y;`2%J0j1qkDk3MSpq7b<qh3V_3
zmHf{NKSutEND)iTg$tAKJC45l92&avkL5}xPu(aN8W)+`GrbA?x$^DrKL_2orEooX
z>aOtqeV0G@$mnE+f6FFM8QL&a;a}tNqu5~S<E6rDC&I;(O1`CV8iVl*-Mt@{J(`+4
zZSn8IE0-r{Fy1rH`@a#NqW0Bc`<#F9?v)oz4T?`0`)4|RqgcFr4>YRWr0G@A9KB`G
z^&x@%9p2Ml__y$#{e|n{-QmJV;lY6u%l^*g7rP@TT9NJ=Sn)RAJJa9$b+H&Z{gE@b
zDEK;!&w;`R1BL5b3h#uA^L`S6eu=DqBmZA`U){AYe0O+Xcz;+uGgWS&ydNMSNag)V
zcc5arTZPEx$Nh<{f7wVq%o7Wpx9<QQm`FqtT=)jiM}Q`QJ`eO5(EFiT&jWo1=*vK_
z0(}eUBM<@~41&8M3LAjF1oSx2dBDS@+_zIGhkeSzS+$-iz;WKfcavtY+lB&LYJG^c
z!_^5G&oxVt1=q_T7K?tM!t3kxHs8_MaIfc>*mTqFD_1YQ6=BpxKc55L#Rxz`>=<0H
z0u}^{X*pcjE&+aj6KUA%>-IK>z5dCXN1YEj91nY&yS=`!7oOuNdkU@`$i4)$LiTJ~
z7Ja-5S2M&xL)l)h?=eS@xA{?LkJo>qrpLSd5m&_9{*XK3?V7CZ^M+F1uCTW~>|NgN
z^@9gcX*X1gK6gTVLSTnufaX3=pGV;H2z+)T(C!ZD&JcxLRwkyisO+KjA1#-}x8RwF
zsYU+cI_iMTEu=C`J2KrwbcXo6j$?ie@jcFv%>D94F^!onD)ZbiO=VtV3{e@Tdztb?
z8;J6I{(LGkDZMMwdmq2^u{_W7_&$hfflkD=nvl#fK1F42#{rVVXAUN|2h*Qy&jiWC
z<3yGfT})h~@_ynI{JSH(p0#+;OZ|^<-{#G?`Ip10X@$QlusRU*ceDrB2ZJ3!|MH#M
zh(8j|>3Hj!6=vB^DcjNBzP_W&C<}ONaD|8^qp4B<AuW?lq*MN2N1#2>?q8nGj@{PQ
zHk!x<4kac6@n}pNPNxqBV(Ia=k!UW8oQyV+&bFnq;~JE;!>~R{MU!pG<e~A_iA?$)
zEtd1ITN7Oq53cU&SQG7B+c_Ky#)pH$@ip;vT5x@=qkUbpGZtIDHV7UIfny4vU%24t
z3olhxkFD=m4MpAqVOk*<`-F1YPbjB-Lb=bFe|S%r;uj5;Vt16rk(~_lm{!DhLXQXY
znJ&V<!ld_?E0a&95-^fRV=>DB6~ZbTO{LPqiIlA@&n>#$m3`ayh?PB1jayd6^{tds
zNTUf`1*_=TK@|pcc1(!5Cf>?eT!m4YgHZ{?TtFarPNjN!x9=S6@7*hksP+Yu?kc$?
z9~x4_yLR<})b?$=fN$}q7UwvgH+7qfG2B*L>G4EPjc20cnmUn&EdVV8GCbdtFK*mx
zXacke4PcTnUaYq-qeYX6(G<ef`sx_<_k%>O@&CN9e*1mJ;|;fY`@$=2$D7WxA8&T|
zF841}I~jh5xaDK!0DiA{(i6gAg||n=Trx%aYw4Y(+StSG6-<v4{aueeyf6O`<_S6f
ze42Rz6@T0FM3u2;YQx8kz5lI##aQ`&@a*LA%hc|kU>;LD8NTn9e9ZmF_uW%;&)_zT
z{*LZ}*#e&r4<Ui{7+fK`1AhWM2NxCtL)r6f_?0i)PeZ0=5PqU0EF2S*!_5q3&mW@X
z68L!-u9}tb^A{}X`XQ8*I^5t;+!srEQTtDjch$IlgiwuvE%U<hEE$lTJxjqm559GO
z2{V5RnRSq<!+L*8f;-UAoO<UQK+%mlbl+VYf?pw8;ZhprIq}(tt%0W<wkJqz(eSTO
zQW}+6uOanerL$fyWyNgV-^GM`FUlUQLiqv;)uCe(#C!_j1x~NBUwXL&SS2t2m}HKk
z40w4zFiT!OMa(EN!OOkCL@#CgUL@v5WV-7cOImLdV;N~ZL0V9x<|&?#EP!H@vdlZz
z<@EX%Ec7-hbKrf$5dd(GJJkGV-7P_Qzj3;RV*zGr3J)kEIQQ}HmL{<13MfwRZ4I|K
ze6gi}?xq$$tj^pnp+o&3UAq(tUv6js+|x-G>TXtobJ6%v!`_BHz^T_wqP+&Tr`wv<
zX|Ofh;1buc36`f`Fx-r;HM+ntXp#zsYJ?whilThzos9$_>|I49Qt&-V3&fxdsjf7r
z=et_!J&Ro)cfjS|;A(WYx$1!mw`Y^fQ$N?`hJwfaBG5aaefa<sob?-^7<V2DxD>Z@
z_K<7dZ7$)t2X@ju|LJPD-Q@%o?~uzG2Zc0P1%-gyd1we#8{$%x>>zUk*l~LnxPR_y
zS?%%|op9a<M%>Q10kHl8TEAFf9jzIyqroBevTS{s(YnXI8mk3wz5+CEkWRb*qpA}_
zmmReb&(Fq6uGE)pW9f0N?cSu8XxlT;IvC9!l#4@n3y+U#sWw<jPGqz!t~%RtS~k~e
z07N_rV<IQw!y=wei>ycp`E3l}i!t$jh9dNDh&n2N+bMr1Q4rZn*}KaSXQKqwi1(D;
zPf(#m9^f1fR7i^0uV>1?$0*C9PSK=1@pS_KlBgo?e2=pwQ|urygkb4sSX5MqTq4%h
zl1N4SS5pOj<%-Tx_VqGUs+2M?eaQ4^Nd)I+Q(}WVN+K2R{VlPcFNsvRXEJG9zzP4P
zBUHh2!b9<bE$%FRmp3XoVY=b{Ei>*a?Q@xNx8VIPGhQorAIyw<gx&s;6TX1(zLQzL
zUhw{v8E+`<bD8m3g5TZE_-xTm?-FL*TUzg%@y61=p&9pylclFknFAwOoVCU0h~~Ob
zS&(v0SZVV<q><@_uQ$B^XvXK4&u<tQPFN}PcV`0(uQRY4=sRG9WZnt8eSTZKMJ%<&
zc~{hI$|)9#<-RA)Ae5Y?S;owAikr;<m7L<1DtKuHWa2x8-&Do|-ZwSk-J+F&FA~Dx
z6}<Pt{H;~+DB-+k#`5_p_=zg`Usb`+RKfB63;nd#`>QJWENrli{4&CM9|P4fZ6uu6
zyE-ZazJ1x*$ydQ2uY!NK3NB|(8$Um+BEJaU-|hU_PzB#dxV62<3AeWQX~6Bq$#+Pe
z*WetV7YXNe3gd4QZtbu4tKjpU_U-DZg72z=r>o$PRl!eI!GA!wwY_f<{-9+ZQ)+bo
zt+=0XYkW2WZWreW;0`#9Hf5QI?;!k?1>Z-waECbbl4b!{;A;-gS2&z7Js|PQ`SWSA
zQ~G|Si;M3_`O10tRlx21uhy<Nq@6k9?)nf}E)zRDS3n+jTCMZ^uce*JdEVvH<*m<)
zW{FomcUA$e*v$W7z~^B6seh$9nEonc{8)u0JtOhT=fybL3Asa3V=?JX0Pch53-=3~
z`wGfCDxatKN;@o7reCXq|25gSKEJ*PxWBAk%AEte$>9^vTE_FgAP%1Lb6j=@Xk7vv
z<HPrbMwWg-xN)V9tm}juw+Ij)gaPS;@wwdcJbfSKVLs&f3W_p4OLllZX(PM|7O!Z>
z8qb>nM?Z~gbyT?(@FqudMZE#Q{bbWfu1bGw-j~|)*$f_9Y=f^7ZE#vqUEA53(MHCi
zIsF7<TUT2kQ2ql+INlh`<br{;2+&swHIahPU@>c1)lwr?EHIYNSuvEe!vlF(^d8ib
zM{E_UF$40XV1B|@sR}Qk9ZuN0WoCeSE)i2RS~49ow%3fC+im8V+iB)i#=*?DiHBK0
z8mNqmS*S8TX1+86P$p-Lp;^WfFEdjbFvm?DN$Nq1<}^Hyi9wu5Xu2!qj;bbx<48-!
zTjelI?<y+*STXj>7&>AnYwiQHd^Mg;>@A9waW4y0#=Ok4k9V1GiFKJ_7iayfY%H7C
z!y>z}90EE@TVk8yV_?Q1&=q}CHZ9qx3)4V0mygHcRwy6ARdeI;fe{YjX5j$tNLn3D
zriY_RbtIS0WYuW?u)tl6q?Xf00_!@~ciIc7Sce+TWTHo)nYqjnp??q@$&Zg80Tn|@
zw$vnA+Fw!C-kssCJ!;RkEh?P<HA{{Nb<5su;jR6f%^W>ustV@_)Sd`gh-}#@)V_i3
z-QfXsdvEWqo<Vgm+&$2PR^{<zj4As3u?Lc2vqshMZ~&cc4uwodnXLz%t!&iPPcuWU
z=A+H(NH(pGMN=bq?74qCNR1>?YCfw$E6Wbz{Uf)FVDqTTy$0gL*(}u}ca&5W4pnmq
z@wB!55$Ztp$apR~EMVMZbUMa4`3)E5;}m>c4vePq0odT0&@wrg*CiD0?r+USM}_2#
z;iu`q$dMG-(`hcFbLaqush|Q6X5%O-P;Md#Gr4SOAO~kIL_l^+Ad{AX4`^dFc*aIR
zmrJ^aK8kb=&V%9Tcmkenqxj&0pMb1DHwK{+c;aIm#;ESsa6XY72}E<bOky|>n|9Cy
zZ8$#)LoW``OFbVQ9?ocoxP<$<oQH^U7M?K1=l7@0<GCkzMZwP{GV#6<|33@w6B1j>
zavtdtxN6~IecoT<{}=HC&zL;M0_4}iWz?T`gaE<+6Op9iU4`GzbgmE)$l$vZ*U$S^
z9$ddb`jSpcXAQW1wzCEJ*WoM^>vuUrQlpFXx&83kBopgXQEY<@?Dd!G=l!oC(w`zh
zY$L{*_4%9??f{}F*U$T8)1)u=8-ULVZ5R`7r{dWx*601S{nWr&B+UM^9Mglq$Fp8s
z=6$yb()W|T)qjG;eUL#>*601XDbhbcf?U6~{l`hao8+$5gk<g->2sV=AJ4Q|^&bTc
zV^Xs33PB0v=!Kfk^+^zZ$SfB{0dhwDcG7PreSb+(&-kd!6wfjn_4&Ph2kG}vKe8Ft
z<MN+d^!XgYx2U1~-I(RA{y$0j9DiQNoFsi~`#FAW@1#Ya_bvJVsPKcKF<I;XE-<nE
zTtDw`HrMG|5o8*Ywf<AU^TNep;&m<mUzJs!%S>MaT|Aq>`n-;wp}?^mh{(icrauN0
z#}4b4|1VCLzexV$_{YS0+>dVph9a!b`>z+#DM)_#TmE5vrauFgQNOfLOBMVk{dX+-
zydL(GzO}!wE@S#76wnv;pVya5KcW5ypo=!!;W8%H|92w@d9I(S6EYlI6y<&78alW&
zi^dU~v9_lL^w}26AobB!bqx<pVp?T!pVW$>=scSE|C#idSY`2JkrhMPpOSve!n72{
zCepXYua~yMFLA|0S)b`0q;C-u%W2E}A6U?$&+^E&gh;Zk`E`l0dY#@1WC$)Ew>&Rn
l_wd-!CH3E-RXUCE#7s<!JMw&<o&J+}#sboAi-M&n{s*)#`Lh53

literal 0
HcmV?d00001

diff --git a/llvm/test/tools/llvm-profgen/Inputs/recursion-compression-noprobe.perfscript b/llvm/test/tools/llvm-profgen/Inputs/recursion-compression-noprobe.perfscript
new file mode 100644
index 000000000000..3ec8f44cfef0
--- /dev/null
+++ b/llvm/test/tools/llvm-profgen/Inputs/recursion-compression-noprobe.perfscript
@@ -0,0 +1,4 @@
+PERF_RECORD_MMAP2 3019402/3019402: [0x400000(0x1000) @ 0 00:1d 265650677 1451231]: r-xp recursion-compression-noprobe.perfbin
+
+	          4007e1
+ 0x4007d6/0x4007e1/P/-/-/0  0x4007c7/0x4007c0/P/-/-/0  0x4007c7/0x4007c0/P/-/-/0  0x4007c7/0x4007c0/P/-/-/0  0x4007c7/0x4007c0/P/-/-/0  0x4007c7/0x4007c0/P/-/-/0  0x4007c7/0x4007c0/P/-/-/0  0x4007c7/0x4007c0/P/-/-/0  0x4007c7/0x4007c0/P/-/-/0  0x4007c7/0x4007c0/P/-/-/0  0x4007c7/0x4007c0/P/-/-/0  0x400795/0x4007b0/P/-/-/0  0x40079c/0x400790/P/-/-/0  0x400801/0x400770/P/-/-/0  0x400698/0x400801/P/-/-/0  0x400673/0x400696/P/-/-/0
diff --git a/llvm/test/tools/llvm-profgen/Inputs/recursion-compression-pseudoprobe.perfbin b/llvm/test/tools/llvm-profgen/Inputs/recursion-compression-pseudoprobe.perfbin
new file mode 100755
index 0000000000000000000000000000000000000000..a3dbda2f0b3ecf6905368f3fe3b81f9c34adba7c
GIT binary patch
literal 13584
zcmeHNdvF`adEYzWK#HV5>P<#<bxx(2lueMJNJ*4pTcAV{97B)jt;W(Bj{^w{1ZX@|
z%A<@~(N2e{m8w&>NjgpJKaz>YX<B#ErqkLP$)(25<TY{Xw65JYY9qz;qY0hFs_MjI
zzi;pRfWrZ2$<8F{A6*Ie?f$;U?tc5--tKZA7>sOJ6-97T#T^1mPb}9Zv_(FJIJ1^A
zK(>gDq7j~JMZ54p*())NzeVUW!*o=&aHjQEQFwr&-dW<GRdvZ>`Wvz&1XFGyQq(Ia
zow64FEH(%dQ_*6TQBQ;46U68KF+D@=U~1Pxl`RV_J6k$*nPGaK8phPA2adARW=l^H
zZ6~;cDC=P}u)f_?-)^dp=^3hzDf@HJ<w8P^!}Z!p{U<YGH|aH@b09TodIgQhk`PSU
z_a6d1GDQ+MMo=Gr7s%fiXxy3VWQHl%*OgAi*7tU$6P@W)rZ~|#v2lIp`rbf38|abt
zaG0@ALp%3N3m9Kig)F;0P-k%s<`Tg48u<HN@H<@aLx9(^kF8VWKZ_b5N1-Z;?@Pfb
zsv-p#Cu}GB8|nKPB#JnX1^Nqz68ny_wG#S9Cc^-)#*KU-nkyJ%(Nsnl#>iMULx>@a
z;e9)dgqbr(Qu%_J+qYwDI-4=~MPq5x;??Ag=TezMQlM=dp!H%XGCZ)==n3=$`nc+g
zYjv6BJ~2LP!|mfYXTwYF2c@evoMQ!T>GYn}L7qYzVepCFsu20&n4Ze(YZ48>Tj|*x
z1{r*(KD$VW1kmS!o&@>`4B{g|{~G91KraEk1avh7N)tFb0kjwB^FU7mT?#x*%KdwU
zGSQ-}Y^tlrcNTG+(>8+LE)>{SCx;#JUij?>+}_^RfZ@A$6>vEhhWsrbRktqm?Er(J
zCSS`iP~f9n0e*8JcMDL#atBE6uCm;p0UPc0+vT=Sc|Y#C&k_P0?OlLh3-ohym0Z3g
zw+(o&!>=9WOqAR1Z}|gt(BJxT&!AsF?H%;DKdgoQ-5>RZ{Trw1hWuMH{*57jcgWv9
z;MWKItpom+0e`*p3;J(2*s=E?P1@5>nt$`H;C~HIKX)`d{h#5fOLM#TMNVtWVNk=T
z7d{FG!l!RP1Y~}h3NHQW8I+&-MmYg{5%ev5T5CYkf3>hI{QKGuVAR47oi8kyzu3HL
z66>2G3iAK)TPWZ6B9e}4;is>8!qao%r)TdBE6;?VyH;2V2EIrJ>gO*en^%ETSRd=3
zy!|VnD~e0@ho>*k-akJ-|BUtpK$IghXFv|5lY!<{4`Q_f<zM?)xxJ_|HQ#u0!E-;F
zodeAyGgHsb&*EEi1XQ;O<WFCnnpaQy&nm^Hsq^#G&(6MV3CjGFG7pWi?4x|>z}_dp
zMFRDwzc>4LAQ4iYnVM04_h*YBc8af^6PhlzO<xI~pT*ezaOfyjHVG=SK8OB^chdg`
z=mgJ83$jz#ztF+q>3<F#7@oco+8>&JI|L)R<|%j!_J>b*BHbJ5xE!8(y?%BGl*3QI
z?J2AZzC`s$rr(N8U)eVOqtN`)e+@&M!~OqOd^J3Eq5i{1LU)IbgpP)cnKPCCoB=af
zFWQ8$dILfq2V0&0HtNlkd9nn;wlv@O>dg>MYhH&Z`fGj-mPb+Im5b2vdsgUD`V3KU
zmrOXHV_IgB(i7(+OcP#RQhawY-6Nku(C1sUQh5c@7f7CGFXnrQUr+NL&Lfz(9{w)O
zs=CA*<x`aG^SkzQkHv>+P^Kc$H?PfSF?*KyUcxU>xsA%<4qfI}Qknb3-zr`&$vF&U
znBPS5v#q+!Eu^xy%_>98%lap%zH3x|kjgLwlFxo8^><BpedSbKPwg1mx^<J@4y%g}
zePf_E5Y&6Rd%C;(dp7Frd(4C$jutGuGuR>G>1bv|KW^soscc3M_5`{E-FkaIKf0-_
zYa~?&98Zl0lF_&s%Vv)S;@PpTM6?h^PR<<9=DV`_F%!z>1blHb(R5cjeSEBQJeR%K
zj2H9`ebK&TaNWk9zG(0I-dH@Cj0I!KzT^fo*dOod-Vp7L$Jeb7_H~FzDs#-tZPFu=
zZNIiQRCsNj^jfe#80-n!y#{zEv{wj>J^3+&!5j*$R@RO7_pAf#GVMi}{=#w;YnO>u
z*^A#`z3*1;diCaSu-<nocfESe8?5);$~o%AcfMkLqruWX^LsSLwPcvbv}(M@t@bmY
z=_;sCCM%w_TrrbL!J9b}kHgDq4@V)Kve8T?8%t$s%3}RxyK9Gb9TaN^0pPh77v4jm
zwJORqOF?)@>{eK6$B!8>LF7k;fHi}aEctlSfVaE=Z#hgGI6?AUNcOhx+OuzX`(ZJU
zO3O)=?8+7Pp+iP!@7_U>+O=~p@Ez@`!*2!8_m<wx=x(R2>{zN`By-U*(-_afO5V(Y
z4A0Zl79T9r#3Df9TWTfQhnubb<;-Y0HIhMw+gM$6ywgtq|NmJ1>c@)X4UdB$erv{?
z#k;j;yw!SldCq29ONPJw*J!?|CBy51Ci))Nl9{8J`V(4<xy9bNV_y1o#~!|yzl3>0
z&Oh&Eo<PN4^*rGcdlh|nr`Y>N^=riwuMe2MR~e3L&pX5}$MqtT>HF4AJD<3c{N^Js
z?wQ4JrkadR|KK}cni^U)bJMN6nsFJeCZm7;mDZOc|8Vv0@I(FIL@7@)n|<rMr(a&5
z_~VygWh}oGUU{$sOiaA|SHynK;!KXoD}JKvf0RqSZvjEh^3vTBw3NzEow6Sw3W}WV
zDM7pCQ34e4xD1mbc3YY9^KQ!i9#KUcEi<j$K>}FR1D~Ns?<MddqKd$8U`2YIO3xCl
zA)+i&*7%ob9)HV<mHtJ_V)%xt0RR{KwzPg?pv~jo)VR6vmbT#~cea7MHKDGspker!
zd#6t+BDiD`OrM%fCkQNGFGymh4rCNdrv76fuD1T+Kr3wf0*Ynl&NdwerlD7q%{H!v
z!X1r`fH&F9vH&(@;Z4%!V&9?0!;J?UzdQiV_CuAeSfxlngRFg9y@GRHQ)#w23#G?s
zw){HeRW9IID+fM@PwxUym)lHeCl$SGA^QX=ddHyf_f!z-dCJ{_(g)xnlm*g646@RM
zEK03^VH0TYGMmumDA$k52VRAz?+ftwF=oC6kGJ7hhqczxOkS_-SMyL_fbCS8l?L2K
zD=k=U1MY;atUwnE&D+#XY8Z{m0o<Z0%TabAR<(tgs}Ww|St0aBzj9R8?E@LC+p2uX
zQMpJ)1-yT7Y?~LbY=R#gRhxmh$O!og{@>UZCS#&&G&^Q?{Z86Ubsda!?u+J+$vq}4
zK9eJ6rVDnY<2f^ro7b*_nJ;wOfX?x}SxjU>G-e9)mRIQTR21cB{)I@uL%2j#%Do2w
z2Bs8Q%e4B2yELEggIZfaTLz=?my?r+1DfLdZLPUeYXIPH0RXn*JFJOOkRFlJ^}a5k
z0j(J*N<GU`MXl{NZ7Cq<7&)$qzmPI$y5UW&&R5VHeVa8uSoI{ihHg!{tcj<}f<JT<
zd_}E;uJK$8dGX&Apz5UXOfJw)sp@3Cce2rUs(~lz2BA!T{sGUVrcHY5Pig97nr~8g
zC)GO6^8^p(CeJ;fg5W9DJE?glRo^KOWOy0}mrlMdR9o$nqPkk?<XR6L510!C?}=E(
zTDMo@Ye=5gYb@XIf`7;bzt;t~W_5pQPtW!rb&-G41^>1S{tDr|zmP?E$iM_YIQw;z
zbWk<k?}8t8!S8p$@kko|@2vNn3;ul<-1@UcwVj*g3dt{={S6cDtoI+`L!hc<d0yv@
zfxK6|Ay*9SGo%xMD`JJ<^F`)81h@+Q;x#wpe@eK{lQ}WINccStd`8-D6YX?<DZ39k
z-;wxY!RLl-|9{91pSLou!OX9~j+)OQ8UGS=P*EGjRj<yFq^m)`R)0J33j+P(JpwC*
z2<NpR<A+`F`&{tTF8Jdv_}5+VmtF8W`Jsn6p^p={Q+L5{C*0ZJy9xiU!_Em8{F5&B
z-QwzRKwg0pjzgNxO_cOLIbPNAwip9dZLNyK=OkV|?w1I6p6k9T@uk9vn}Dlov*5ib
zH|m(gTLkYv8J~gnFNQd|Nd7^ym|i2C?-g`eu=p|I>}Qj36%^5qeMy3>I>2?Y9RB#G
zlcYu30bc~m_&KkIgISq2NIO)pkzv4FYS=jdxNei<>_=SiPr2ad0mt^Ar}r9HeVAql
ze~fV6JUm0VeTj$iUz0fI94R1ApvFZY14m<l(7<qKAvz)gi4&Rpi7^ov$rJ;0(q2(8
za#nE^Puc?36LkuJ=gf2z4Nz`8U4UE`58*|?9HpbL(F90xNdyXT950|PDR?G<h=5eX
ztP#!Sq9-gS=fQe(EER_~WI@3)T^NIdKlt;xQILN>N0wzImd^|PJHVI;t47(QSTU7O
z1fqpPE)^@n8Z0o1X0WeSoFc3;kC|zxzb0G$ZNCWatI0Jdus^j}&_B7-!z0O<33mYU
z(3yl8D~=fHY$+R!#d79xW~4I7Ea#=8Ngh_gmF7eM#tysybQA=woJB=|PF@Z4H7uST
zWy8!QoS5aEN<EaTg$K(1s-fV<43ykhL&=2~u$)^%p^^dWg;d;tpv=bY{jJ0+`(4Sa
z>}Msf+7Ffd8vdvhkOr##QYlpJpGv+o0#L4ip~x<&lyUg0k|_;T`prnBEvH2bCSF)U
zCyu8~t10D5+{7@B;<-Y;P)sJl?S=uDTa3u?-hGB4$~RsNerfZ9ue#VE!$@R}k#sf|
zO&f_qHkUV|#R-AirL<Wv6M+pq{k>qhhLBO7IG}5V+zDYFxF?EZV<$kxR+2A*HLs9p
zX=7RPq?t}uy)R`9omX~98Ly0+GA^AhXB-}>GOtFwl$lboI#$X8)p1hhN#&{-Df1oi
zQD#U*M{HOZo8tMR<rZnS>;k&?RAN{7$Hv4#s*0zwX~{-i!`Qwjv}4d1+_?>+$6-LY
zM4+&4S=q&L%CpW2@QcCxp~$X*P{i1^ef!?Qea61fKx7baX6*ugB9$?Uc@tYueUr;>
z!*Q)kRwQE(G%@h0<?rdb7X)}dYm7!S3Ah!8XCN_o3_`cS414@q<4^%f|7|<GGqhuP
ztK-xEUW&MXz#nWAWa9gze9x5EA;gxloJWcWM|fhy`g|{u?<?xSvnO7EF}4CSc*?~3
zeD7kGfHNeBXUW)3q?qtv3=^Ku;hhH7=X(zma=l;vI~)~D^|1VHppSPXxXkw?9wY@F
z_?WnUXZ!I;?21jF?|qylefA&M&+V5+;V}q5Sjv^!#rH%mkite1WPPrm>0Q7_9=D(G
zrTm-}_+AQ`sT#k%z(8B9&-ZL5sDgV)k?ZI7v;U8f{s77Gy_*+EpXE>=^G^K{z;Imo
z%$wKgbELl#c=p8ljNxS#d;FH~<606C79>h$OXx=+XV>Tb)gc@dND*Y(lCyq1pFt5g
zK$eO3F??^jj~z>V{HudKorZ#4pZ8O<RFSj)&i4P2L!b9uUVJej@!xr@k~P+xNBSxF
z+3VNabcBUY`c_Fo^5Ja%X8^^&#c_P_`2*kocKVCUOdkhbzfGUdBf?Za%YleYTxR-Z
zK+*SHKkrldKDhJVH|udf&VerWpY{2k?ZpNFA?fm2ey~0x7a(KTFYP~ZQiJ3iFO;>X
zuR+1C&*w(<q;HqSytDonucNP{k}Wy)zYjdbxc_{v)K2v~+v_aPf-Yj+@Uth@k2@5&
z$n*_R!taMOJxmuUCL9c>KO4bNEqyNq{`K_Ny6B(0LH)3c{!edEAN^CS{b_tDAl>F*
zI{SYD^x=P1`Xv1oa+ibSlzrGm|7Ftea40y6;u9|VE%;%F^t?q%PpAIpT=XxI{-{I2
zQ51hg`p)=^v{_Zom1?#!tk3i*(szi758_|BApOuHr6=n#@KqQ6wKQ>mngxkE_5TR~
z$0tHcewFlJpbtw2pqt>w?=OB!G5tH}^Sf&y*7LFVTXfD*p`$2fUG&eaw)C&jcaUv6
G_5T+oAnot~

literal 0
HcmV?d00001

diff --git a/llvm/test/tools/llvm-profgen/Inputs/recursion-compression-pseudoprobe.perfscript b/llvm/test/tools/llvm-profgen/Inputs/recursion-compression-pseudoprobe.perfscript
new file mode 100644
index 000000000000..91a69e2c9dd0
--- /dev/null
+++ b/llvm/test/tools/llvm-profgen/Inputs/recursion-compression-pseudoprobe.perfscript
@@ -0,0 +1,23 @@
+PERF_RECORD_MMAP2 3367317/3367317: [0x201000(0x1000) @ 0 00:1d 238458915 1121070]: r-xp recursion-compression-pseudoprobe.perfbin
+
+	          2017db
+	          2017ba
+	          2017e5
+	          2017ba
+	          2017e5
+	          2017d9
+	          2017ba
+	          2017b0
+	          2017b0
+	          2017b0
+	          2017b0
+	          2017b0
+	          2017b0
+	          2017b0
+	          2017b0
+	          2017e5
+	          2017d9
+	          201847
+	    7fcb072a67c3
+	5541f689495641d7
+ 0x2017cd/0x2017db/P/-/-/0  0x2017b5/0x2017c0/P/-/-/0  0x2017a7/0x2017b2/P/-/-/0  0x2017e0/0x2017a0/P/-/-/0  0x2017cd/0x2017db/P/-/-/0  0x2017b5/0x2017c0/P/-/-/0  0x2017a7/0x2017b2/P/-/-/0  0x2017e0/0x2017a0/P/-/-/0  0x2017cd/0x2017db/P/-/-/0  0x2017d4/0x2017c0/P/-/-/0  0x2017b5/0x2017c0/P/-/-/0  0x2017a7/0x2017b2/P/-/-/0  0x2017ab/0x2017a0/P/-/-/0  0x2017ab/0x2017a0/P/-/-/0  0x2017ab/0x2017a0/P/-/-/0  0x2017ab/0x2017a0/P/-/-/0
diff --git a/llvm/test/tools/llvm-profgen/recursion-compression-noprobe.test b/llvm/test/tools/llvm-profgen/recursion-compression-noprobe.test
new file mode 100644
index 000000000000..47e0a51a4261
--- /dev/null
+++ b/llvm/test/tools/llvm-profgen/recursion-compression-noprobe.test
@@ -0,0 +1,65 @@
+; Firstly test uncompression(--compress-recursion=0)
+; RUN: llvm-profgen --perfscript=%S/Inputs/recursion-compression-noprobe.perfscript --binary=%S/Inputs/recursion-compression-noprobe.perfbin --output=%t --compress-recursion=0
+; RUN: FileCheck %s --input-file %t -check-prefix=CHECK-UNCOMPRESS
+; RUN: llvm-profgen --perfscript=%S/Inputs/recursion-compression-noprobe.perfscript --binary=%S/Inputs/recursion-compression-noprobe.perfbin --output=%t
+; RUN: FileCheck %s --input-file %t
+
+; CHECK-UNCOMPRESS:[main:1 @ foo:3 @ fa]:14:0
+; CHECK-UNCOMPRESS: 1: 1
+; CHECK-UNCOMPRESS: 2: 13 fb:11
+; CHECK-UNCOMPRESS:[main:1 @ foo:3 @ fa:2 @ fb]:12:0
+; CHECK-UNCOMPRESS: 1: 11
+; CHECK-UNCOMPRESS: 2: 1 fa:1
+; CHECK-UNCOMPRESS:[main:1 @ foo:3 @ fa:2 @ fb:2 @ fa]:3:0
+; CHECK-UNCOMPRESS: 1: 1
+; CHECK-UNCOMPRESS: 2: 2 fb:1
+; CHECK-UNCOMPRESS:[main:1 @ foo]:3:0
+; CHECK-UNCOMPRESS: 2: 1
+; CHECK-UNCOMPRESS: 3: 2 fa:1
+; CHECK-UNCOMPRESS:[main:1 @ foo:3 @ fa:2 @ fb:2 @ fa:2 @ fb:2 @ fa]:1:0
+; CHECK-UNCOMPRESS: 4: 1
+; CHECK-UNCOMPRESS:[main:1 @ foo:3 @ fa:2 @ fb:2 @ fa:2 @ fb]:1:0
+; CHECK-UNCOMPRESS: 2: 1 fa:1
+
+; CHECK: [main:1 @ foo:3 @ fa]:14:0
+; CHECK:  1: 1
+; CHECK:  2: 13 fb:11
+; CHECK: [main:1 @ foo:3 @ fa:2 @ fb]:12:0
+; CHECK:  1: 11
+; CHECK:  2: 1 fa:1
+; CHECK: [main:1 @ foo:3 @ fa:2 @ fb:2 @ fa]:4:0
+; CHECK:  1: 1
+; CHECK:  2: 2 fb:1
+; CHECK:  4: 1
+; CHECK: [main:1 @ foo]:3:0
+; CHECK:  2: 1
+; CHECK:  3: 2 fa:1
+; CHECK: [main:1 @ foo:3 @ fa:2 @ fb:2 @ fa:2 @ fb]:0:0
+
+
+; original code:
+; clang -O3 -g test.c -o a.out
+#include <stdio.h>
+
+int fb(int n) {
+  if(n > 10) return fb(n / 2);
+  return fa(n - 1);
+}
+
+int fa(int n) {
+  if(n < 2) return n;
+  if(n % 2) return fb(n - 1);
+  return fa(n - 1);
+}
+
+void foo() {
+  int s, i = 0;
+  while (i++ < 10000)
+    s += fa(i);
+  printf("sum is %d\n", s);
+}
+
+int main() {
+  foo();
+  return 0;
+}
diff --git a/llvm/test/tools/llvm-profgen/recursion-compression-pseudoprobe.test b/llvm/test/tools/llvm-profgen/recursion-compression-pseudoprobe.test
new file mode 100644
index 000000000000..86afe6c632bd
--- /dev/null
+++ b/llvm/test/tools/llvm-profgen/recursion-compression-pseudoprobe.test
@@ -0,0 +1,169 @@
+; Firstly test uncompression(--compress-recursion=0)
+; RUN: llvm-profgen --perfscript=%S/Inputs/recursion-compression-pseudoprobe.perfscript --binary=%S/Inputs/recursion-compression-pseudoprobe.perfbin --output=%t --compress-recursion=0
+; RUN: FileCheck %s --input-file %t -check-prefix=CHECK-UNCOMPRESS
+; RUN: llvm-profgen --perfscript=%S/Inputs/recursion-compression-pseudoprobe.perfscript --binary=%S/Inputs/recursion-compression-pseudoprobe.perfbin --output=%t --show-unwinder-output | FileCheck %s --check-prefix=CHECK-UNWINDER
+; RUN: FileCheck %s --input-file %t
+
+; CHECK-UNCOMPRESS: [main:2 @ foo:5 @ fa:8 @ fa:7 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:6 @ fa:8 @ fa:7 @ fb:6 @ fa]:4:1
+; CHECK-UNCOMPRESS:  1: 1
+; CHECK-UNCOMPRESS:  3: 1
+; CHECK-UNCOMPRESS:  4: 1
+; CHECK-UNCOMPRESS:  7: 1 fb:1
+; CHECK-UNCOMPRESS:  !CFGChecksum: 120515930909
+; CHECK-UNCOMPRESS: [main:2 @ foo:5 @ fa:8 @ fa:7 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:6 @ fa:8 @ fa]:4:1
+; CHECK-UNCOMPRESS:  1: 1
+; CHECK-UNCOMPRESS:  3: 1
+; CHECK-UNCOMPRESS:  4: 1
+; CHECK-UNCOMPRESS:  7: 1 fb:1
+; CHECK-UNCOMPRESS:  !CFGChecksum: 120515930909
+; CHECK-UNCOMPRESS: [main:2 @ foo:5 @ fa:8 @ fa:7 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:6 @ fa]:4:1
+; CHECK-UNCOMPRESS:  1: 1
+; CHECK-UNCOMPRESS:  3: 1
+; CHECK-UNCOMPRESS:  5: 1
+; CHECK-UNCOMPRESS:  8: 1 fa:1
+; CHECK-UNCOMPRESS:  !CFGChecksum: 120515930909
+; CHECK-UNCOMPRESS: [main:2 @ foo:5 @ fa:8 @ fa:7 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:6 @ fa:8 @ fa:7 @ fb:6 @ fa:7 @ fb]:3:1
+; CHECK-UNCOMPRESS:  1: 1
+; CHECK-UNCOMPRESS:  3: 1
+; CHECK-UNCOMPRESS:  6: 1 fa:1
+; CHECK-UNCOMPRESS:  !CFGChecksum: 72617220756
+; CHECK-UNCOMPRESS: [main:2 @ foo:5 @ fa:8 @ fa:7 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:6 @ fa:8 @ fa:7 @ fb]:3:1
+; CHECK-UNCOMPRESS:  1: 1
+; CHECK-UNCOMPRESS:  3: 1
+; CHECK-UNCOMPRESS:  6: 1 fa:1
+; CHECK-UNCOMPRESS:  !CFGChecksum: 72617220756
+; CHECK-UNCOMPRESS: [main:2 @ foo:5 @ fa:8 @ fa:7 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb]:3:1
+; CHECK-UNCOMPRESS:  1: 1
+; CHECK-UNCOMPRESS:  3: 1
+; CHECK-UNCOMPRESS:  6: 1 fa:1
+; CHECK-UNCOMPRESS:  !CFGChecksum: 72617220756
+; CHECK-UNCOMPRESS: [main:2 @ foo:5 @ fa:8 @ fa:7 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb]:3:1
+; CHECK-UNCOMPRESS:  1: 1
+; CHECK-UNCOMPRESS:  2: 1
+; CHECK-UNCOMPRESS:  5: 1 fb:1
+; CHECK-UNCOMPRESS:  !CFGChecksum: 72617220756
+; CHECK-UNCOMPRESS: [main:2 @ foo:5 @ fa:8 @ fa:7 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb]:3:1
+; CHECK-UNCOMPRESS:  1: 1
+; CHECK-UNCOMPRESS:  2: 1
+; CHECK-UNCOMPRESS:  5: 1 fb:1
+; CHECK-UNCOMPRESS:  !CFGChecksum: 72617220756
+; CHECK-UNCOMPRESS: [main:2 @ foo:5 @ fa:8 @ fa:7 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb]:3:1
+; CHECK-UNCOMPRESS:  1: 1
+; CHECK-UNCOMPRESS:  2: 1
+; CHECK-UNCOMPRESS:  5: 1 fb:1
+; CHECK-UNCOMPRESS:  !CFGChecksum: 72617220756
+; CHECK-UNCOMPRESS: [main:2 @ foo:5 @ fa:8 @ fa:7 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:6 @ fa:8 @ fa:7 @ fb:6 @ fa:7 @ fb:6 @ fa]:2:1
+; CHECK-UNCOMPRESS:  1: 1
+; CHECK-UNCOMPRESS:  3: 1
+; CHECK-UNCOMPRESS:  !CFGChecksum: 120515930909
+; CHECK-UNCOMPRESS: [main:2 @ foo:5 @ fa:8 @ fa:7 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb]:1:0
+; CHECK-UNCOMPRESS:  5: 1 fb:1
+; CHECK-UNCOMPRESS:  !CFGChecksum: 72617220756
+
+
+; CHECK: [main:2 @ foo:5 @ fa:8 @ fa:7 @ fb:5 @ fb]:13:4
+; CHECK:  1: 4
+; CHECK:  2: 3
+; CHECK:  3: 1
+; CEHCK:  5: 4 fb:4
+; CHECK:  6: 1 fa:1
+; CHECK !CFGChecksum: 72617220756
+; CHECK: [main:2 @ foo:5 @ fa:8 @ fa:7 @ fb:5 @ fb:6 @ fa:8 @ fa:7 @ fb:6 @ fa]:6:2
+; CHECK:  1: 2
+; CHECK:  3: 2
+; CHECK:  4: 1
+; CHECK:  7: 1 fb:1
+; CHECK:  !CFGChecksum: 120515930909
+; CHECK: [main:2 @ foo:5 @ fa:8 @ fa:7 @ fb:5 @ fb:6 @ fa:8 @ fa]:4:1
+; CHECK:  1: 1
+; CHECK:  3: 1
+; CHECK:  4: 1
+; CHECK:  7: 1 fb:1
+; CHECK:  !CFGChecksum: 120515930909
+; CHECK: [main:2 @ foo:5 @ fa:8 @ fa:7 @ fb:5 @ fb:6 @ fa]:4:1
+; CHECK:  1: 1
+; CHECK:  3: 1
+; CHECK:  5: 1
+; CHECK:  8: 1 fa:1
+; CHECK:  !CFGChecksum: 120515930909
+; CHECK: [main:2 @ foo:5 @ fa:8 @ fa:7 @ fb:5 @ fb:6 @ fa:8 @ fa:7 @ fb:6 @ fa:7 @ fb]:3:1
+; CHECK:  1: 1
+; CHECK:  3: 1
+; CHECK:  6: 1 fa:1
+; CHECK:  !CFGChecksum: 72617220756
+; CHECK: [main:2 @ foo:5 @ fa:8 @ fa:7 @ fb:5 @ fb:6 @ fa:8 @ fa:7 @ fb]:3:1
+; CHECK:  1: 1
+; CHECK:  3: 1
+; CHECK:  6: 1 fa:1
+; CHECK:  !CFGChecksum: 72617220756
+
+
+; CHECK-UNWINDER: Binary(recursion-compression-pseudoprobe.perfbin)'s Range Counter:
+; CHECK-UNWINDER: main:2 @ foo:5 @ fa:8 @ fa:7 @ fb:5
+; CHECK-UNWINDER:   (7a0, 7a7): 1
+; CHECK-UNWINDER:   (7a0, 7ab): 3
+; CHECK-UNWINDER:   (7b2, 7b5): 1
+; CHECK-UNWINDER: main:2 @ foo:5 @ fa:8 @ fa:7 @ fb:5 @ fb:6
+; CHECK-UNWINDER:   (7c0, 7d4): 1
+; CHECK-UNWINDER: main:2 @ foo:5 @ fa:8 @ fa:7 @ fb:5 @ fb:6 @ fa:8
+; CHECK-UNWINDER:   (7c0, 7cd): 1
+; CHECK-UNWINDER:   (7db, 7e0): 1
+; CHECK-UNWINDER: main:2 @ foo:5 @ fa:8 @ fa:7 @ fb:5 @ fb:6 @ fa:8 @ fa:7
+; CHECK-UNWINDER:   (7a0, 7a7): 1
+; CHECK-UNWINDER:   (7b2, 7b5): 1
+; CHECK-UNWINDER: main:2 @ foo:5 @ fa:8 @ fa:7 @ fb:5 @ fb:6 @ fa:8 @ fa:7 @ fb:6
+; CHECK-UNWINDER:   (7c0, 7cd): 2
+; CHECK-UNWINDER:   (7db, 7e0): 1
+; CHECK-UNWINDER: main:2 @ foo:5 @ fa:8 @ fa:7 @ fb:5 @ fb:6 @ fa:8 @ fa:7 @ fb:6 @ fa:7
+; CHECK-UNWINDER:   (7a0, 7a7): 1
+; CHECK-UNWINDER:   (7b2, 7b5): 1
+
+; CHECK-UNWINDER: Binary(recursion-compression-pseudoprobe.perfbin)'s Branch Counter:
+; CHECK-UNWINDER: main:2 @ foo:5 @ fa:8 @ fa:7 @ fb:5
+; CHECK-UNWINDER:   (7a7, 7b2): 1
+; CHECK-UNWINDER:   (7ab, 7a0): 4
+; CHECK-UNWINDER:   (7b5, 7c0): 1
+; CHECK-UNWINDER: main:2 @ foo:5 @ fa:8 @ fa:7 @ fb:5 @ fb:6
+; CHECK-UNWINDER:   (7d4, 7c0): 1
+; CHECK-UNWINDER: main:2 @ foo:5 @ fa:8 @ fa:7 @ fb:5 @ fb:6 @ fa:8
+; CHECK-UNWINDER:   (7cd, 7db): 1
+; CHECK-UNWINDER:   (7e0, 7a0): 1
+; CHECK-UNWINDER: main:2 @ foo:5 @ fa:8 @ fa:7 @ fb:5 @ fb:6 @ fa:8 @ fa:7
+; CHECK-UNWINDER:   (7a7, 7b2): 1
+; CHECK-UNWINDER:   (7b5, 7c0): 1
+; CHECK-UNWINDER: main:2 @ foo:5 @ fa:8 @ fa:7 @ fb:5 @ fb:6 @ fa:8 @ fa:7 @ fb:6
+; CHECK-UNWINDER:   (7cd, 7db): 2
+; CHECK-UNWINDER:   (7e0, 7a0): 1
+; CHECK-UNWINDER: main:2 @ foo:5 @ fa:8 @ fa:7 @ fb:5 @ fb:6 @ fa:8 @ fa:7 @ fb:6 @ fa:7
+; CHECK-UNWINDER:   (7a7, 7b2): 1
+; CHECK-UNWINDER:   (7b5, 7c0): 1
+
+
+; clang -O3 -fexperimental-new-pass-manager -fuse-ld=lld -fpseudo-probe-for-profiling
+; -fno-omit-frame-pointer -mno-omit-leaf-frame-pointer -Xclang -mdisable-tail-calls
+; -g test.c  -o a.out
+
+#include <stdio.h>
+
+int fb(int n) {
+  if(n > 10) return fb(n / 2);
+  return fa(n - 1);
+}
+
+int fa(int n) {
+  if(n < 2) return n;
+  if(n % 2) return fb(n - 1);
+  return fa(n - 1);
+}
+
+void foo() {
+  int s, i = 0;
+  while (i++ < 10000)
+    s += fa(i);
+  printf("sum is %d\n", s);
+}
+
+int main() {
+  foo();
+  return 0;
+}
diff --git a/llvm/tools/llvm-profgen/PerfReader.cpp b/llvm/tools/llvm-profgen/PerfReader.cpp
index 64a502be59a9..d05c665f8583 100644
--- a/llvm/tools/llvm-profgen/PerfReader.cpp
+++ b/llvm/tools/llvm-profgen/PerfReader.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 #include "PerfReader.h"
+#include "ProfileGenerator.h"
 
 static cl::opt<bool> ShowMmapEvents("show-mmap-events", cl::ReallyHidden,
                                     cl::init(false), cl::ZeroOrMore,
@@ -124,6 +125,8 @@ VirtualUnwinder::getOrCreateCounterForProbe(const ProfiledBinary *Binary,
       ProbeBasedKey->Probes.emplace_back(CallProbe);
     }
   }
+  CSProfileGenerator::compressRecursionContext<const PseudoProbe *>(
+      ProbeBasedKey->Probes);
   ProbeBasedKey->genHashCode();
   Hashable<ContextKey> ContextId(ProbeBasedKey);
   auto Ret = CtxCounterMap->emplace(ContextId, SampleCounter());
diff --git a/llvm/tools/llvm-profgen/ProfileGenerator.cpp b/llvm/tools/llvm-profgen/ProfileGenerator.cpp
index ce228a781538..f769bd592f87 100644
--- a/llvm/tools/llvm-profgen/ProfileGenerator.cpp
+++ b/llvm/tools/llvm-profgen/ProfileGenerator.cpp
@@ -22,12 +22,22 @@ static cl::opt<SampleProfileFormat> OutputFormat(
         clEnumValN(SPF_GCC, "gcc",
                    "GCC encoding (only meaningful for -sample)")));
 
+static cl::opt<int32_t, true> RecursionCompression(
+    "compress-recursion",
+    cl::desc("Compressing recursion by deduplicating adjacent frame "
+             "sequences up to the specified size. -1 means no size limit."),
+    cl::Hidden,
+    cl::location(llvm::sampleprof::CSProfileGenerator::MaxCompressionSize));
+
 using namespace llvm;
 using namespace sampleprof;
 
 namespace llvm {
 namespace sampleprof {
 
+// Initialize the MaxCompressionSize to -1 which means no size limit
+int32_t CSProfileGenerator::MaxCompressionSize = -1;
+
 static bool
 usePseudoProbes(const BinarySampleCounterMap &BinarySampleCounters) {
   return BinarySampleCounters.size() &&
@@ -319,26 +329,16 @@ void CSProfileGenerator::populateInferredFunctionSamples() {
   }
 }
 
-// Helper function to extract context prefix
-// PrefixContextId is the context id string except for the leaf probe's
-// context, the final ContextId will be:
-// ContextId =  PrefixContextId + LeafContextId;
-// Remind that the string in ContextStrStack is in callee-caller order
-// So process the string vector reversely
-static std::string
-extractPrefixContextId(const SmallVector<const PseudoProbe *, 16> &Probes,
-                       ProfiledBinary *Binary) {
-  SmallVector<std::string, 16> ContextStrStack;
+// Helper function to extract context prefix string stack
+// Extract context stack for reusing, leaf context stack will
+// be added compressed while looking up function profile
+static void
+extractPrefixContextStack(SmallVectorImpl<std::string> &ContextStrStack,
+                          const SmallVectorImpl<const PseudoProbe *> &Probes,
+                          ProfiledBinary *Binary) {
   for (const auto *P : Probes) {
     Binary->getInlineContextForProbe(P, ContextStrStack, true);
   }
-  std::ostringstream OContextStr;
-  for (auto &CxtStr : ContextStrStack) {
-    if (OContextStr.str().size())
-      OContextStr << " @ ";
-    OContextStr << CxtStr;
-  }
-  return OContextStr.str();
 }
 
 void PseudoProbeCSProfileGenerator::generateProfile() {
@@ -350,15 +350,15 @@ void PseudoProbeCSProfileGenerator::generateProfile() {
     for (const auto &CI : BI.second) {
       const ProbeBasedCtxKey *CtxKey =
           dyn_cast<ProbeBasedCtxKey>(CI.first.getPtr());
-      std::string PrefixContextId =
-          extractPrefixContextId(CtxKey->Probes, Binary);
+      SmallVector<std::string, 16> ContextStrStack;
+      extractPrefixContextStack(ContextStrStack, CtxKey->Probes, Binary);
       // Fill in function body samples from probes, also infer caller's samples
       // from callee's probe
-      populateBodySamplesWithProbes(CI.second.RangeCounter, PrefixContextId,
+      populateBodySamplesWithProbes(CI.second.RangeCounter, ContextStrStack,
                                     Binary);
       // Fill in boundary samples for a call probe
       populateBoundarySamplesWithProbes(CI.second.BranchCounter,
-                                        PrefixContextId, Binary);
+                                        ContextStrStack, Binary);
     }
   }
 }
@@ -403,8 +403,8 @@ void PseudoProbeCSProfileGenerator::extractProbesFromRange(
 }
 
 void PseudoProbeCSProfileGenerator::populateBodySamplesWithProbes(
-    const RangeSample &RangeCounter, StringRef PrefixContextId,
-    ProfiledBinary *Binary) {
+    const RangeSample &RangeCounter,
+    SmallVectorImpl<std::string> &ContextStrStack, ProfiledBinary *Binary) {
   ProbeCounterMap ProbeCounter;
   // Extract the top frame probes by looking up each address among the range in
   // the Address2ProbeMap
@@ -413,7 +413,7 @@ void PseudoProbeCSProfileGenerator::populateBodySamplesWithProbes(
     const PseudoProbe *Probe = PI.first;
     uint64_t Count = PI.second;
     FunctionSamples &FunctionProfile =
-        getFunctionProfileForLeafProbe(PrefixContextId, Probe, Binary);
+        getFunctionProfileForLeafProbe(ContextStrStack, Probe, Binary);
 
     FunctionProfile.addBodySamples(Probe->Index, 0, Count);
     FunctionProfile.addTotalSamples(Count);
@@ -446,8 +446,8 @@ void PseudoProbeCSProfileGenerator::populateBodySamplesWithProbes(
 }
 
 void PseudoProbeCSProfileGenerator::populateBoundarySamplesWithProbes(
-    const BranchSample &BranchCounter, StringRef PrefixContextId,
-    ProfiledBinary *Binary) {
+    const BranchSample &BranchCounter,
+    SmallVectorImpl<std::string> &ContextStrStack, ProfiledBinary *Binary) {
   for (auto BI : BranchCounter) {
     uint64_t SourceOffset = BI.first.first;
     uint64_t TargetOffset = BI.first.second;
@@ -457,7 +457,7 @@ void PseudoProbeCSProfileGenerator::populateBoundarySamplesWithProbes(
     if (CallProbe == nullptr)
       continue;
     FunctionSamples &FunctionProfile =
-        getFunctionProfileForLeafProbe(PrefixContextId, CallProbe, Binary);
+        getFunctionProfileForLeafProbe(ContextStrStack, CallProbe, Binary);
     FunctionProfile.addBodySamples(CallProbe->Index, 0, Count);
     FunctionProfile.addTotalSamples(Count);
     StringRef CalleeName = FunctionSamples::getCanonicalFnName(
@@ -470,25 +470,26 @@ void PseudoProbeCSProfileGenerator::populateBoundarySamplesWithProbes(
 }
 
 FunctionSamples &PseudoProbeCSProfileGenerator::getFunctionProfileForLeafProbe(
-    StringRef PrefixContextId, SmallVector<std::string, 16> &LeafInlinedContext,
+    SmallVectorImpl<std::string> &ContextStrStack,
     const PseudoProbeFuncDesc *LeafFuncDesc) {
-  assert(LeafInlinedContext.size() &&
-         "Profile context must have the leaf frame");
-  std::ostringstream OContextStr;
-  OContextStr << PrefixContextId.str();
+  assert(ContextStrStack.size() && "Profile context must have the leaf frame");
+  // Compress the context string except for the leaf frame
+  std::string LeafFrame = ContextStrStack.back();
+  ContextStrStack.pop_back();
+  CSProfileGenerator::compressRecursionContext(ContextStrStack);
 
-  for (uint32_t I = 0; I < LeafInlinedContext.size() - 1; I++) {
+  std::ostringstream OContextStr;
+  for (uint32_t I = 0; I < ContextStrStack.size(); I++) {
     if (OContextStr.str().size())
       OContextStr << " @ ";
-    OContextStr << LeafInlinedContext[I];
+    OContextStr << ContextStrStack[I];
   }
   // For leaf inlined context with the top frame, we should strip off the top
   // frame's probe id, like:
   // Inlined stack: [foo:1, bar:2], the ContextId will be "foo:1 @ bar"
   if (OContextStr.str().size())
     OContextStr << " @ ";
-  StringRef LeafLoc = LeafInlinedContext.back();
-  OContextStr << LeafLoc.split(":").first.str();
+  OContextStr << StringRef(LeafFrame).split(":").first.str();
 
   FunctionSamples &FunctionProile =
       getFunctionProfileForContext(OContextStr.str());
@@ -497,17 +498,18 @@ FunctionSamples &PseudoProbeCSProfileGenerator::getFunctionProfileForLeafProbe(
 }
 
 FunctionSamples &PseudoProbeCSProfileGenerator::getFunctionProfileForLeafProbe(
-    StringRef PrefixContextId, const PseudoProbe *LeafProbe,
+    SmallVectorImpl<std::string> &ContextStrStack, const PseudoProbe *LeafProbe,
     ProfiledBinary *Binary) {
-  SmallVector<std::string, 16> LeafInlinedContext;
-  Binary->getInlineContextForProbe(LeafProbe, LeafInlinedContext);
+  // Explicitly copy the context for appending the leaf context
+  SmallVector<std::string, 16> ContextStrStackCopy(ContextStrStack.begin(),
+                                                   ContextStrStack.end());
+  Binary->getInlineContextForProbe(LeafProbe, ContextStrStackCopy);
   // Note that the context from probe doesn't include leaf frame,
   // hence we need to retrieve and append the leaf frame.
   const auto *FuncDesc = Binary->getFuncDescForGUID(LeafProbe->GUID);
-  LeafInlinedContext.emplace_back(FuncDesc->FuncName + ":" +
-                                  Twine(LeafProbe->Index).str());
-  return getFunctionProfileForLeafProbe(PrefixContextId, LeafInlinedContext,
-                                        FuncDesc);
+  ContextStrStackCopy.emplace_back(FuncDesc->FuncName + ":" +
+                                   Twine(LeafProbe->Index).str());
+  return getFunctionProfileForLeafProbe(ContextStrStackCopy, FuncDesc);
 }
 
 } // end namespace sampleprof
diff --git a/llvm/tools/llvm-profgen/ProfileGenerator.h b/llvm/tools/llvm-profgen/ProfileGenerator.h
index 29f528026a0c..14e58fc9c895 100644
--- a/llvm/tools/llvm-profgen/ProfileGenerator.h
+++ b/llvm/tools/llvm-profgen/ProfileGenerator.h
@@ -50,6 +50,7 @@ class ProfileGenerator {
   */
   void findDisjointRanges(RangeSample &DisjointRanges,
                           const RangeSample &Ranges);
+
   // Used by SampleProfileWriter
   StringMap<FunctionSamples> ProfileMap;
 };
@@ -91,6 +92,111 @@ class CSProfileGenerator : public ProfileGenerator {
     populateInferredFunctionSamples();
   }
 
+  // Remove adjacent repeated context sequences up to a given sequence length,
+  // -1 means no size limit. Note that repeated sequences are identified based
+  // on the exact call site, this is finer granularity than function recursion.
+  template <typename T>
+  static void compressRecursionContext(SmallVectorImpl<T> &Context,
+                                       int32_t CSize = MaxCompressionSize) {
+    uint32_t I = 1;
+    uint32_t HS = static_cast<uint32_t>(Context.size() / 2);
+    uint32_t MaxDedupSize =
+        CSize == -1 ? HS : std::min(static_cast<uint32_t>(CSize), HS);
+    auto BeginIter = Context.begin();
+    // Use an in-place algorithm to save memory copy
+    // End indicates the end location of current iteration's data
+    uint32_t End = 0;
+    // Deduplicate from length 1 to the max possible size of a repeated
+    // sequence.
+    while (I <= MaxDedupSize) {
+      // This is a linear algorithm that deduplicates adjacent repeated
+      // sequences of size I. The deduplication detection runs on a sliding
+      // window whose size is 2*I and it keeps sliding the window to deduplicate
+      // the data inside. Once duplication is detected, deduplicate it by
+      // skipping the right half part of the window, otherwise just copy back
+      // the new one by appending them at the back of End pointer(for the next
+      // iteration).
+      //
+      // For example:
+      // Input: [a1, a2, b1, b2]
+      // (Added index to distinguish the same char, the origin is [a, a, b,
+      // b], the size of the dedup window is 2(I = 1) at the beginning)
+      //
+      // 1) The initial status is a dummy window[null, a1], then just copy the
+      // right half of the window(End = 0), then slide the window.
+      // Result: [a1], a2, b1, b2 (End points to the element right before ],
+      // after ] is the data of the previous iteration)
+      //
+      // 2) Next window is [a1, a2]. Since a1 == a2, then skip the right half of
+      // the window i.e the duplication happen. Only slide the window.
+      // Result: [a1], a2, b1, b2
+      //
+      // 3) Next window is [a2, b1], copy the right half of the window(b1 is
+      // new) to the End and slide the window.
+      // Result: [a1, b1], b1, b2
+      //
+      // 4) Next window is [b1, b2], same to 2), skip b2.
+      // Result: [a1, b1], b1, b2
+      // After resize, it will be [a, b]
+
+      // Use pointers like below to do comparison inside the window
+      //    [a         b         c        a       b        c]
+      //     |         |         |                |        |
+      // LeftBoundary Left     Right           Left+I    Right+I
+      // A duplication found if Left < LeftBoundry.
+
+      int32_t Right = I - 1;
+      End = I;
+      int32_t LeftBoundary = 0;
+      while (Right + I < Context.size()) {
+        // To avoids scanning a part of a sequence repeatedly, it finds out
+        // the common suffix of two hald in the window. The common suffix will
+        // serve as the common prefix of next possible pair of duplicate
+        // sequences. The non-common part will be ignored and never scanned
+        // again.
+
+        // For example.
+        // Input: [a, b1], c1, b2, c2
+        // I = 2
+        //
+        // 1) For the window [a, b1, c1, b2], non-common-suffix for the right
+        // part is 'c1', copy it and only slide the window 1 step.
+        // Result: [a, b1, c1], b2, c2
+        //
+        // 2) Next window is [b1, c1, b2, c2], so duplication happen.
+        // Result after resize: [a, b, c]
+
+        int32_t Left = Right;
+        while (Left >= LeftBoundary && Context[Left] == Context[Left + I]) {
+          // Find the longest suffix inside the window. When stops, Left points
+          // at the diverging point in the current sequence.
+          Left--;
+        }
+
+        bool DuplicationFound = (Left < LeftBoundary);
+        // Don't need to recheck the data before Right
+        LeftBoundary = Right + 1;
+        if (DuplicationFound) {
+          // Duplication found, skip right half of the window.
+          Right += I;
+        } else {
+          // Copy the non-common-suffix part of the adjacent sequence.
+          std::copy(BeginIter + Right + 1, BeginIter + Left + I + 1,
+                    BeginIter + End);
+          End += Left + I - Right;
+          // Only slide the window by the size of non-common-suffix
+          Right = Left + I;
+        }
+      }
+      // Don't forget the remaining part that's not scanned.
+      std::copy(BeginIter + Right + 1, Context.end(), BeginIter + End);
+      End += Context.size() - Right - 1;
+      I++;
+      Context.resize(End);
+      MaxDedupSize = std::min(static_cast<uint32_t>(End / 2), MaxDedupSize);
+    }
+  }
+
 protected:
   // Lookup or create FunctionSamples for the context
   FunctionSamples &getFunctionProfileForContext(StringRef ContextId);
@@ -109,6 +215,11 @@ class CSProfileGenerator : public ProfileGenerator {
                                        const BranchSample &BranchCounters,
                                        ProfiledBinary *Binary);
   void populateInferredFunctionSamples();
+
+public:
+  // Deduplicate adjacent repeated context sequences up to a given sequence
+  // length. -1 means no size limit.
+  static int32_t MaxCompressionSize;
 };
 
 using ProbeCounterMap = std::unordered_map<const PseudoProbe *, uint64_t>;
@@ -127,22 +238,23 @@ class PseudoProbeCSProfileGenerator : public CSProfileGenerator {
                               ProbeCounterMap &ProbeCounter,
                               ProfiledBinary *Binary);
   // Fill in function body samples from probes
-  void populateBodySamplesWithProbes(const RangeSample &RangeCounter,
-                                     StringRef PrefixContextId,
-                                     ProfiledBinary *Binary);
+  void
+  populateBodySamplesWithProbes(const RangeSample &RangeCounter,
+                                SmallVectorImpl<std::string> &ContextStrStack,
+                                ProfiledBinary *Binary);
   // Fill in boundary samples for a call probe
-  void populateBoundarySamplesWithProbes(const BranchSample &BranchCounter,
-                                         StringRef PrefixContextId,
-                                         ProfiledBinary *Binary);
+  void populateBoundarySamplesWithProbes(
+      const BranchSample &BranchCounter,
+      SmallVectorImpl<std::string> &ContextStrStack, ProfiledBinary *Binary);
   // Helper function to get FunctionSamples for the leaf inlined context
-  FunctionSamples &getFunctionProfileForLeafProbe(
-      StringRef PrefixContextId,
-      SmallVector<std::string, 16> &LeafInlinedContext,
-      const PseudoProbeFuncDesc *LeafFuncDesc);
+  FunctionSamples &
+  getFunctionProfileForLeafProbe(SmallVectorImpl<std::string> &ContextStrStack,
+                                 const PseudoProbeFuncDesc *LeafFuncDesc);
   // Helper function to get FunctionSamples for the leaf probe
-  FunctionSamples &getFunctionProfileForLeafProbe(StringRef PrefixContextId,
-                                                  const PseudoProbe *LeafProbe,
-                                                  ProfiledBinary *Binary);
+  FunctionSamples &
+  getFunctionProfileForLeafProbe(SmallVectorImpl<std::string> &ContextStrStack,
+                                 const PseudoProbe *LeafProbe,
+                                 ProfiledBinary *Binary);
 };
 
 } // end namespace sampleprof
diff --git a/llvm/tools/llvm-profgen/ProfiledBinary.cpp b/llvm/tools/llvm-profgen/ProfiledBinary.cpp
index 4b31dff8cd02..16ef04aba99e 100644
--- a/llvm/tools/llvm-profgen/ProfiledBinary.cpp
+++ b/llvm/tools/llvm-profgen/ProfiledBinary.cpp
@@ -8,6 +8,7 @@
 
 #include "ProfiledBinary.h"
 #include "ErrorHandling.h"
+#include "ProfileGenerator.h"
 #include "llvm/ADT/Triple.h"
 #include "llvm/Demangle/Demangle.h"
 #include "llvm/Support/CommandLine.h"
@@ -128,7 +129,7 @@ bool ProfiledBinary::inlineContextEqual(uint64_t Address1,
 std::string
 ProfiledBinary::getExpandedContextStr(const std::list<uint64_t> &Stack) const {
   std::string ContextStr;
-  SmallVector<std::string, 8> ContextVec;
+  SmallVector<std::string, 16> ContextVec;
   // Process from frame root to leaf
   for (auto Iter = Stack.rbegin(); Iter != Stack.rend(); Iter++) {
     uint64_t Offset = virtualAddrToOffset(*Iter);
@@ -139,21 +140,22 @@ ProfiledBinary::getExpandedContextStr(const std::list<uint64_t> &Stack) const {
   }
 
   assert(ContextVec.size() && "Context length should be at least 1");
+  // Compress the context string except for the leaf frame
+  std::string LeafFrame = ContextVec.back();
+  ContextVec.pop_back();
+  CSProfileGenerator::compressRecursionContext<std::string>(ContextVec);
 
   std::ostringstream OContextStr;
   for (uint32_t I = 0; I < (uint32_t)ContextVec.size(); I++) {
     if (OContextStr.str().size()) {
       OContextStr << " @ ";
     }
-
-    if (I == ContextVec.size() - 1) {
-      // Only keep the function name for the leaf frame
-      StringRef Ref(ContextVec[I]);
-      OContextStr << Ref.split(":").first.str();
-    } else {
-      OContextStr << ContextVec[I];
-    }
+    OContextStr << ContextVec[I];
   }
+  // Only keep the function name for the leaf frame
+  if (OContextStr.str().size())
+    OContextStr << " @ ";
+  OContextStr << StringRef(LeafFrame).split(":").first.str();
   return OContextStr.str();
 }
 
diff --git a/llvm/tools/llvm-profgen/ProfiledBinary.h b/llvm/tools/llvm-profgen/ProfiledBinary.h
index 40aee39677e5..bc28e58deb9d 100644
--- a/llvm/tools/llvm-profgen/ProfiledBinary.h
+++ b/llvm/tools/llvm-profgen/ProfiledBinary.h
@@ -243,7 +243,7 @@ class ProfiledBinary {
   }
   void
   getInlineContextForProbe(const PseudoProbe *Probe,
-                           SmallVector<std::string, 16> &InlineContextStack,
+                           SmallVectorImpl<std::string> &InlineContextStack,
                            bool IncludeLeaf = false) const {
     return ProbeDecoder.getInlineContextForProbe(Probe, InlineContextStack,
                                                  IncludeLeaf);
diff --git a/llvm/tools/llvm-profgen/PseudoProbe.cpp b/llvm/tools/llvm-profgen/PseudoProbe.cpp
index 700984e2184a..a537d9012e6d 100644
--- a/llvm/tools/llvm-profgen/PseudoProbe.cpp
+++ b/llvm/tools/llvm-profgen/PseudoProbe.cpp
@@ -34,7 +34,7 @@ void PseudoProbeFuncDesc::print(raw_ostream &OS) {
   OS << "Hash: " << FuncHash << "\n";
 }
 
-void PseudoProbe::getInlineContext(SmallVector<std::string, 16> &ContextStack,
+void PseudoProbe::getInlineContext(SmallVectorImpl<std::string> &ContextStack,
                                    const GUIDProbeFunctionMap &GUID2FuncMAP,
                                    bool ShowName) const {
   uint32_t Begin = ContextStack.size();
@@ -320,7 +320,7 @@ PseudoProbeDecoder::getFuncDescForGUID(uint64_t GUID) const {
 }
 
 void PseudoProbeDecoder::getInlineContextForProbe(
-    const PseudoProbe *Probe, SmallVector<std::string, 16> &InlineContextStack,
+    const PseudoProbe *Probe, SmallVectorImpl<std::string> &InlineContextStack,
     bool IncludeLeaf) const {
   Probe->getInlineContext(InlineContextStack, GUID2FuncDescMap, true);
   if (!IncludeLeaf)
diff --git a/llvm/tools/llvm-profgen/PseudoProbe.h b/llvm/tools/llvm-profgen/PseudoProbe.h
index a6647eb39c7a..207772453c97 100644
--- a/llvm/tools/llvm-profgen/PseudoProbe.h
+++ b/llvm/tools/llvm-profgen/PseudoProbe.h
@@ -138,7 +138,7 @@ struct PseudoProbe {
   // Get the inlined context by traversing current inline tree backwards,
   // each tree node has its InlineSite which is taken as the context.
   // \p ContextStack is populated in root to leaf order
-  void getInlineContext(SmallVector<std::string, 16> &ContextStack,
+  void getInlineContext(SmallVectorImpl<std::string> &ContextStack,
                         const GUIDProbeFunctionMap &GUID2FuncMAP,
                         bool ShowName) const;
   // Helper function to get the string from context stack
@@ -214,7 +214,7 @@ class PseudoProbeDecoder {
   //  IncludeLeaf = false, Output: [main:1, foo:2]
   void
   getInlineContextForProbe(const PseudoProbe *Probe,
-                           SmallVector<std::string, 16> &InlineContextStack,
+                           SmallVectorImpl<std::string> &InlineContextStack,
                            bool IncludeLeaf) const;
 
   const AddressProbesMap &getAddress2ProbesMap() const {
diff --git a/llvm/unittests/tools/CMakeLists.txt b/llvm/unittests/tools/CMakeLists.txt
index e7c7dca68d49..7861da8c0e38 100644
--- a/llvm/unittests/tools/CMakeLists.txt
+++ b/llvm/unittests/tools/CMakeLists.txt
@@ -7,4 +7,4 @@ endif()
 add_subdirectory(
   llvm-exegesis
 )
-
+add_subdirectory(llvm-profgen)
diff --git a/llvm/unittests/tools/llvm-profgen/CMakeLists.txt b/llvm/unittests/tools/llvm-profgen/CMakeLists.txt
new file mode 100644
index 000000000000..5a658cf70846
--- /dev/null
+++ b/llvm/unittests/tools/llvm-profgen/CMakeLists.txt
@@ -0,0 +1,11 @@
+set(LLVM_LINK_COMPONENTS
+  Support
+  )
+
+add_llvm_unittest(LLVMProfgenTests
+    ContextCompressionTest.cpp
+  )
+
+target_link_libraries(LLVMProfgenTests PRIVATE LLVMTestingSupport)
+
+add_dependencies(LLVMProfgenTests intrinsics_gen)
diff --git a/llvm/unittests/tools/llvm-profgen/ContextCompressionTest.cpp b/llvm/unittests/tools/llvm-profgen/ContextCompressionTest.cpp
new file mode 100644
index 000000000000..7f0cee8878af
--- /dev/null
+++ b/llvm/unittests/tools/llvm-profgen/ContextCompressionTest.cpp
@@ -0,0 +1,36 @@
+//===-- ContextCompressionTest.cpp -------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#include "../tools/llvm-profgen/ProfileGenerator.h"
+#include "gtest/gtest.h"
+
+using namespace llvm;
+using namespace sampleprof;
+
+TEST(TestCompression, TestNoSizeLimit1) {
+  SmallVector<std::string, 16> Context = {"a", "b", "c", "a", "b", "c"};
+  SmallVector<std::string, 16> Expect = {"a", "b", "c"};
+  CSProfileGenerator::compressRecursionContext(Context, -1);
+  EXPECT_TRUE(std::equal(Context.begin(), Context.end(), Expect.begin()));
+}
+
+TEST(TestCompression, TestNoSizeLimit2) {
+  SmallVector<std::string, 16> Context = {"m", "a", "a", "b", "c", "a",
+                                          "b", "c", "b", "c", "d"};
+  SmallVector<std::string, 16> Expect = {"m", "a", "b", "c", "d"};
+  CSProfileGenerator::compressRecursionContext(Context, -1);
+  EXPECT_TRUE(std::equal(Context.begin(), Context.end(), Expect.begin()));
+}
+
+TEST(TestCompression, TestMaxDedupSize) {
+  SmallVector<std::string, 16> Context = {"m", "a", "a", "b", "c", "a",
+                                          "b", "c", "b", "c", "d"};
+  SmallVector<std::string, 16> Expect = {"m", "a", "b", "c",
+                                         "a", "b", "c", "d"};
+  CSProfileGenerator::compressRecursionContext(Context, 2);
+  EXPECT_TRUE(std::equal(Context.begin(), Context.end(), Expect.begin()));
+}

From e562ff08f634d814c1cd1e65e3428ca5308d3022 Mon Sep 17 00:00:00 2001
From: wlei <wlei@fb.com>
Date: Mon, 11 Jan 2021 12:47:22 -0800
Subject: [PATCH 139/318] [CSSPGO][llvm-profgen] Aggregate samples on call
 frame trie to speed up profile generation

For CS profile generation, the process of call stack unwinding is time-consuming since for each LBR entry we need linear time to generate the context( hash, compression, string concatenation). This change speeds up this by grouping all the call frame within one LBR sample into a trie and aggregating the result(sample counter) on it, deferring the context compression and string generation to the end of unwinding.

Specifically, it uses `StackLeaf` as the top frame on the stack and manipulates(pop or push a trie node) it dynamically during virtual unwinding so that the raw sample can just be recoded on the leaf node, the path(root to leaf) will represent its calling context. In the end, it traverses the trie and generates the context on the fly.

Results:
Our internal branch shows about 5X speed-up on some large workloads in SPEC06 benchmark.

Differential Revision: https://reviews.llvm.org/D94110
---
 llvm/tools/llvm-profgen/PerfReader.cpp     | 162 +++++++++++---------
 llvm/tools/llvm-profgen/PerfReader.h       | 167 ++++++++++++++++-----
 llvm/tools/llvm-profgen/ProfiledBinary.cpp |   8 +-
 llvm/tools/llvm-profgen/ProfiledBinary.h   |   3 +-
 4 files changed, 232 insertions(+), 108 deletions(-)

diff --git a/llvm/tools/llvm-profgen/PerfReader.cpp b/llvm/tools/llvm-profgen/PerfReader.cpp
index d05c665f8583..787bde28400f 100644
--- a/llvm/tools/llvm-profgen/PerfReader.cpp
+++ b/llvm/tools/llvm-profgen/PerfReader.cpp
@@ -28,11 +28,12 @@ void VirtualUnwinder::unwindCall(UnwindState &State) {
   // 2nd frame is in prolog/epilog. In the future, we will switch to
   // pro/epi tracker(Dwarf CFI) for the precise check.
   uint64_t Source = State.getCurrentLBRSource();
-  auto Iter = State.CallStack.begin();
-  if (State.CallStack.size() == 1 || *(++Iter) != Source) {
-    State.CallStack.front() = Source;
+  auto *ParentFrame = State.getParentFrame();
+  if (ParentFrame == State.getDummyRootPtr() ||
+      ParentFrame->Address != Source) {
+    State.switchToFrame(Source);
   } else {
-    State.CallStack.pop_front();
+    State.popFrame();
   }
   State.InstPtr.update(Source);
 }
@@ -41,26 +42,29 @@ void VirtualUnwinder::unwindLinear(UnwindState &State, uint64_t Repeat) {
   InstructionPointer &IP = State.InstPtr;
   uint64_t Target = State.getCurrentLBRTarget();
   uint64_t End = IP.Address;
-  if (State.getBinary()->usePseudoProbes()) {
+  if (Binary->usePseudoProbes()) {
+    // We don't need to top frame probe since it should be extracted
+    // from the range.
     // The outcome of the virtual unwinding with pseudo probes is a
     // map from a context key to the address range being unwound.
     // This means basically linear unwinding is not needed for pseudo
     // probes. The range will be simply recorded here and will be
     // converted to a list of pseudo probes to report in ProfileGenerator.
-    recordRangeCount(Target, End, State, Repeat);
+    State.getParentFrame()->recordRangeCount(Target, End, Repeat);
   } else {
     // Unwind linear execution part
+    uint64_t LeafAddr = State.CurrentLeafFrame->Address;
     while (IP.Address >= Target) {
       uint64_t PrevIP = IP.Address;
       IP.backward();
       // Break into segments for implicit call/return due to inlining
-      bool SameInlinee =
-          State.getBinary()->inlineContextEqual(PrevIP, IP.Address);
+      bool SameInlinee = Binary->inlineContextEqual(PrevIP, IP.Address);
       if (!SameInlinee || PrevIP == Target) {
-        recordRangeCount(PrevIP, End, State, Repeat);
+        State.switchToFrame(LeafAddr);
+        State.CurrentLeafFrame->recordRangeCount(PrevIP, End, Repeat);
         End = IP.Address;
       }
-      State.CallStack.front() = IP.Address;
+      LeafAddr = IP.Address;
     }
   }
 }
@@ -68,9 +72,9 @@ void VirtualUnwinder::unwindLinear(UnwindState &State, uint64_t Repeat) {
 void VirtualUnwinder::unwindReturn(UnwindState &State) {
   // Add extra frame as we unwind through the return
   const LBREntry &LBR = State.getCurrentLBR();
-  uint64_t CallAddr = State.getBinary()->getCallAddrFromFrameAddr(LBR.Target);
-  State.CallStack.front() = CallAddr;
-  State.CallStack.push_front(LBR.Source);
+  uint64_t CallAddr = Binary->getCallAddrFromFrameAddr(LBR.Target);
+  State.switchToFrame(CallAddr);
+  State.pushFrame(LBR.Source);
   State.InstPtr.update(LBR.Source);
 }
 
@@ -78,79 +82,100 @@ void VirtualUnwinder::unwindBranchWithinFrame(UnwindState &State) {
   // TODO: Tolerate tail call for now, as we may see tail call from libraries.
   // This is only for intra function branches, excluding tail calls.
   uint64_t Source = State.getCurrentLBRSource();
-  State.CallStack.front() = Source;
+  State.switchToFrame(Source);
   State.InstPtr.update(Source);
 }
 
-SampleCounter &
-VirtualUnwinder::getOrCreateCounter(const ProfiledBinary *Binary,
-                                    std::list<uint64_t> &CallStack) {
-  if (Binary->usePseudoProbes()) {
-    return getOrCreateCounterForProbe(Binary, CallStack);
-  }
+std::shared_ptr<StringBasedCtxKey> FrameStack::getContextKey() {
   std::shared_ptr<StringBasedCtxKey> KeyStr =
       std::make_shared<StringBasedCtxKey>();
-  KeyStr->Context = Binary->getExpandedContextStr(CallStack);
+  KeyStr->Context = Binary->getExpandedContextStr(Stack);
   KeyStr->genHashCode();
-  auto Ret =
-      CtxCounterMap->emplace(Hashable<ContextKey>(KeyStr), SampleCounter());
-  return Ret.first->second;
+  return KeyStr;
 }
 
-SampleCounter &
-VirtualUnwinder::getOrCreateCounterForProbe(const ProfiledBinary *Binary,
-                                            std::list<uint64_t> &CallStack) {
+std::shared_ptr<ProbeBasedCtxKey> ProbeStack::getContextKey() {
   std::shared_ptr<ProbeBasedCtxKey> ProbeBasedKey =
       std::make_shared<ProbeBasedCtxKey>();
-  if (CallStack.size() > 1) {
-    // We don't need to top frame probe since it should be extracted
-    // from the range.
-    // The top of stack is an instruction from the function where
-    // the LBR address range physcially resides. Strip it since
-    // the function is not a part of the call context. We also
-    // don't need its inline context since the probes being unwound
-    // come with an inline context all the way back to the uninlined
-    // function in their prefix tree.
-    auto Iter = CallStack.rbegin();
-    auto EndT = std::prev(CallStack.rend());
-    for (; Iter != EndT; Iter++) {
-      uint64_t Address = *Iter;
-      const PseudoProbe *CallProbe = Binary->getCallProbeForAddr(Address);
-      // We may not find a probe for a merged or external callsite.
-      // Callsite merging may cause the loss of original probe IDs.
-      // Cutting off the context from here since the inline will
-      // not know how to consume a context with unknown callsites.
-      if (!CallProbe)
-        break;
-      ProbeBasedKey->Probes.emplace_back(CallProbe);
-    }
+  for (auto CallProbe : Stack) {
+    ProbeBasedKey->Probes.emplace_back(CallProbe);
   }
   CSProfileGenerator::compressRecursionContext<const PseudoProbe *>(
       ProbeBasedKey->Probes);
   ProbeBasedKey->genHashCode();
-  Hashable<ContextKey> ContextId(ProbeBasedKey);
-  auto Ret = CtxCounterMap->emplace(ContextId, SampleCounter());
-  return Ret.first->second;
+  return ProbeBasedKey;
+}
+
+template <typename T>
+void VirtualUnwinder::collectSamplesFromFrame(UnwindState::ProfiledFrame *Cur,
+                                              T &Stack) {
+  if (Cur->RangeSamples.empty() && Cur->BranchSamples.empty())
+    return;
+
+  std::shared_ptr<ContextKey> Key = Stack.getContextKey();
+  auto Ret = CtxCounterMap->emplace(Hashable<ContextKey>(Key), SampleCounter());
+  SampleCounter &SCounter = Ret.first->second;
+  for (auto &Item : Cur->RangeSamples) {
+    uint64_t StartOffset = Binary->virtualAddrToOffset(std::get<0>(Item));
+    uint64_t EndOffset = Binary->virtualAddrToOffset(std::get<1>(Item));
+    SCounter.recordRangeCount(StartOffset, EndOffset, std::get<2>(Item));
+  }
+
+  for (auto &Item : Cur->BranchSamples) {
+    uint64_t SourceOffset = Binary->virtualAddrToOffset(std::get<0>(Item));
+    uint64_t TargetOffset = Binary->virtualAddrToOffset(std::get<1>(Item));
+    SCounter.recordBranchCount(SourceOffset, TargetOffset, std::get<2>(Item));
+  }
+}
+
+template <typename T>
+void VirtualUnwinder::collectSamplesFromFrameTrie(
+    UnwindState::ProfiledFrame *Cur, T &Stack) {
+  if (!Cur->isDummyRoot()) {
+    if (!Stack.pushFrame(Cur)) {
+      // Process truncated context
+      for (const auto &Item : Cur->Children) {
+        // Start a new traversal ignoring its bottom context
+        collectSamplesFromFrameTrie(Item.second.get());
+      }
+      return;
+    }
+  }
+
+  collectSamplesFromFrame(Cur, Stack);
+  // Process children frame
+  for (const auto &Item : Cur->Children) {
+    collectSamplesFromFrameTrie(Item.second.get(), Stack);
+  }
+  // Recover the call stack
+  Stack.popFrame();
 }
 
-void VirtualUnwinder::recordRangeCount(uint64_t Start, uint64_t End,
-                                       UnwindState &State, uint64_t Repeat) {
-  uint64_t StartOffset = State.getBinary()->virtualAddrToOffset(Start);
-  uint64_t EndOffset = State.getBinary()->virtualAddrToOffset(End);
-  SampleCounter &SCounter =
-      getOrCreateCounter(State.getBinary(), State.CallStack);
-  SCounter.recordRangeCount(StartOffset, EndOffset, Repeat);
+void VirtualUnwinder::collectSamplesFromFrameTrie(
+    UnwindState::ProfiledFrame *Cur) {
+  if (Binary->usePseudoProbes()) {
+    ProbeStack Stack(Binary);
+    collectSamplesFromFrameTrie<ProbeStack>(Cur, Stack);
+  } else {
+    FrameStack Stack(Binary);
+    collectSamplesFromFrameTrie<FrameStack>(Cur, Stack);
+  }
 }
 
 void VirtualUnwinder::recordBranchCount(const LBREntry &Branch,
                                         UnwindState &State, uint64_t Repeat) {
   if (Branch.IsArtificial)
     return;
-  uint64_t SourceOffset = State.getBinary()->virtualAddrToOffset(Branch.Source);
-  uint64_t TargetOffset = State.getBinary()->virtualAddrToOffset(Branch.Target);
-  SampleCounter &SCounter =
-      getOrCreateCounter(State.getBinary(), State.CallStack);
-  SCounter.recordBranchCount(SourceOffset, TargetOffset, Repeat);
+
+  if (Binary->usePseudoProbes()) {
+    // Same as recordRangeCount, We don't need to top frame probe since we will
+    // extract it from branch's source address
+    State.getParentFrame()->recordBranchCount(Branch.Source, Branch.Target,
+                                              Repeat);
+  } else {
+    State.CurrentLeafFrame->recordBranchCount(Branch.Source, Branch.Target,
+                                              Repeat);
+  }
 }
 
 bool VirtualUnwinder::unwind(const HybridSample *Sample, uint64_t Repeat) {
@@ -199,6 +224,8 @@ bool VirtualUnwinder::unwind(const HybridSample *Sample, uint64_t Repeat) {
     // Record `branch` with calling context after unwinding.
     recordBranchCount(Branch, State, Repeat);
   }
+  // As samples are aggregated on trie, record them into counter map
+  collectSamplesFromFrameTrie(State.getDummyRootPtr());
 
   return true;
 }
@@ -325,7 +352,8 @@ void PerfReader::printUnwinderOutput() {
 void PerfReader::unwindSamples() {
   for (const auto &Item : AggregatedSamples) {
     const HybridSample *Sample = dyn_cast<HybridSample>(Item.first.getPtr());
-    VirtualUnwinder Unwinder(&BinarySampleCounters[Sample->Binary]);
+    VirtualUnwinder Unwinder(&BinarySampleCounters[Sample->Binary],
+                             Sample->Binary);
     Unwinder.unwind(Sample, Item.second);
   }
 
@@ -334,7 +362,7 @@ void PerfReader::unwindSamples() {
 }
 
 bool PerfReader::extractLBRStack(TraceStream &TraceIt,
-                                 SmallVector<LBREntry, 16> &LBRStack,
+                                 SmallVectorImpl<LBREntry> &LBRStack,
                                  ProfiledBinary *Binary) {
   // The raw format of LBR stack is like:
   // 0x4005c8/0x4005dc/P/-/-/0 0x40062f/0x4005b0/P/-/-/0 ...
@@ -398,7 +426,7 @@ bool PerfReader::extractLBRStack(TraceStream &TraceIt,
 }
 
 bool PerfReader::extractCallstack(TraceStream &TraceIt,
-                                  std::list<uint64_t> &CallStack) {
+                                  SmallVectorImpl<uint64_t> &CallStack) {
   // The raw format of call stack is like:
   //            4005dc      # leaf frame
   //	          400634
diff --git a/llvm/tools/llvm-profgen/PerfReader.h b/llvm/tools/llvm-profgen/PerfReader.h
index 66649a060bc3..7eaa4b846259 100644
--- a/llvm/tools/llvm-profgen/PerfReader.h
+++ b/llvm/tools/llvm-profgen/PerfReader.h
@@ -133,7 +133,7 @@ struct HybridSample : public PerfSample {
   // Profiled binary that current frame address belongs to
   ProfiledBinary *Binary;
   // Call stack recorded in FILO(leaf to root) order
-  std::list<uint64_t> CallStack;
+  SmallVector<uint64_t, 16> CallStack;
   // LBR stack recorded in FIFO order
   SmallVector<LBREntry, 16> LBRStack;
 
@@ -147,7 +147,7 @@ struct HybridSample : public PerfSample {
     const HybridSample *Other = dyn_cast<HybridSample>(K);
     if (Other->Binary != Binary)
       return false;
-    const std::list<uint64_t> &OtherCallStack = Other->CallStack;
+    const SmallVector<uint64_t, 16> &OtherCallStack = Other->CallStack;
     const SmallVector<LBREntry, 16> &OtherLBRStack = Other->LBRStack;
 
     if (CallStack.size() != OtherCallStack.size() ||
@@ -193,14 +193,40 @@ using AggregatedCounter =
     std::unordered_map<Hashable<PerfSample>, uint64_t,
                        Hashable<PerfSample>::Hash, Hashable<PerfSample>::Equal>;
 
+using SampleVector = SmallVector<std::tuple<uint64_t, uint64_t, uint64_t>, 16>;
 // The state for the unwinder, it doesn't hold the data but only keep the
 // pointer/index of the data, While unwinding, the CallStack is changed
 // dynamicially and will be recorded as the context of the sample
 struct UnwindState {
   // Profiled binary that current frame address belongs to
   const ProfiledBinary *Binary;
-  // TODO: switch to use trie for call stack
-  std::list<uint64_t> CallStack;
+  // Call stack trie node
+  struct ProfiledFrame {
+    const uint64_t Address = 0;
+    ProfiledFrame *Parent;
+    SampleVector RangeSamples;
+    SampleVector BranchSamples;
+    std::unordered_map<uint64_t, std::unique_ptr<ProfiledFrame>> Children;
+
+    ProfiledFrame(uint64_t Addr = 0, ProfiledFrame *P = nullptr)
+        : Address(Addr), Parent(P) {}
+    ProfiledFrame *getOrCreateChildFrame(uint64_t Address) {
+      assert(Address && "Address can't be zero!");
+      auto Ret = Children.emplace(
+          Address, std::make_unique<ProfiledFrame>(Address, this));
+      return Ret.first->second.get();
+    }
+    void recordRangeCount(uint64_t Start, uint64_t End, uint64_t Count) {
+      RangeSamples.emplace_back(std::make_tuple(Start, End, Count));
+    }
+    void recordBranchCount(uint64_t Source, uint64_t Target, uint64_t Count) {
+      BranchSamples.emplace_back(std::make_tuple(Source, Target, Count));
+    }
+    bool isDummyRoot() { return Address == 0; }
+  };
+
+  ProfiledFrame DummyTrieRoot;
+  ProfiledFrame *CurrentLeafFrame;
   // Used to fall through the LBR stack
   uint32_t LBRIndex = 0;
   // Reference to HybridSample.LBRStack
@@ -208,19 +234,20 @@ struct UnwindState {
   // Used to iterate the address range
   InstructionPointer InstPtr;
   UnwindState(const HybridSample *Sample)
-      : Binary(Sample->Binary), CallStack(Sample->CallStack),
-        LBRStack(Sample->LBRStack),
-        InstPtr(Sample->Binary, Sample->CallStack.front()) {}
+      : Binary(Sample->Binary), LBRStack(Sample->LBRStack),
+        InstPtr(Sample->Binary, Sample->CallStack.front()) {
+    initFrameTrie(Sample->CallStack);
+  }
 
   bool validateInitialState() {
     uint64_t LBRLeaf = LBRStack[LBRIndex].Target;
-    uint64_t StackLeaf = CallStack.front();
+    uint64_t LeafAddr = CurrentLeafFrame->Address;
     // When we take a stack sample, ideally the sampling distance between the
     // leaf IP of stack and the last LBR target shouldn't be very large.
     // Use a heuristic size (0x100) to filter out broken records.
-    if (StackLeaf < LBRLeaf || StackLeaf >= LBRLeaf + 0x100) {
+    if (LeafAddr < LBRLeaf || LeafAddr >= LBRLeaf + 0x100) {
       WithColor::warning() << "Bogus trace: stack tip = "
-                           << format("%#010x", StackLeaf)
+                           << format("%#010x", LeafAddr)
                            << ", LBR tip = " << format("%#010x\n", LBRLeaf);
       return false;
     }
@@ -228,19 +255,40 @@ struct UnwindState {
   }
 
   void checkStateConsistency() {
-    assert(InstPtr.Address == CallStack.front() &&
+    assert(InstPtr.Address == CurrentLeafFrame->Address &&
            "IP should align with context leaf");
   }
 
-  std::string getExpandedContextStr() const {
-    return Binary->getExpandedContextStr(CallStack);
-  }
   const ProfiledBinary *getBinary() const { return Binary; }
   bool hasNextLBR() const { return LBRIndex < LBRStack.size(); }
   uint64_t getCurrentLBRSource() const { return LBRStack[LBRIndex].Source; }
   uint64_t getCurrentLBRTarget() const { return LBRStack[LBRIndex].Target; }
   const LBREntry &getCurrentLBR() const { return LBRStack[LBRIndex]; }
   void advanceLBR() { LBRIndex++; }
+
+  ProfiledFrame *getParentFrame() { return CurrentLeafFrame->Parent; }
+
+  void pushFrame(uint64_t Address) {
+    CurrentLeafFrame = CurrentLeafFrame->getOrCreateChildFrame(Address);
+  }
+
+  void switchToFrame(uint64_t Address) {
+    if (CurrentLeafFrame->Address == Address)
+      return;
+    CurrentLeafFrame = CurrentLeafFrame->Parent->getOrCreateChildFrame(Address);
+  }
+
+  void popFrame() { CurrentLeafFrame = CurrentLeafFrame->Parent; }
+
+  void initFrameTrie(const SmallVectorImpl<uint64_t> &CallStack) {
+    ProfiledFrame *Cur = &DummyTrieRoot;
+    for (auto Address : reverse(CallStack)) {
+      Cur = Cur->getOrCreateChildFrame(Address);
+    }
+    CurrentLeafFrame = Cur;
+  }
+
+  ProfiledFrame *getDummyRootPtr() { return &DummyTrieRoot; }
 };
 
 // Base class for sample counter key with context
@@ -330,6 +378,56 @@ using ContextSampleCounterMap =
     std::unordered_map<Hashable<ContextKey>, SampleCounter,
                        Hashable<ContextKey>::Hash, Hashable<ContextKey>::Equal>;
 
+struct FrameStack {
+  SmallVector<uint64_t, 16> Stack;
+  const ProfiledBinary *Binary;
+  FrameStack(const ProfiledBinary *B) : Binary(B) {}
+  bool pushFrame(UnwindState::ProfiledFrame *Cur) {
+    Stack.push_back(Cur->Address);
+    return true;
+  }
+
+  void popFrame() {
+    if (!Stack.empty())
+      Stack.pop_back();
+  }
+  std::shared_ptr<StringBasedCtxKey> getContextKey();
+};
+
+struct ProbeStack {
+  SmallVector<const PseudoProbe *, 16> Stack;
+  const ProfiledBinary *Binary;
+  ProbeStack(const ProfiledBinary *B) : Binary(B) {}
+  bool pushFrame(UnwindState::ProfiledFrame *Cur) {
+    const PseudoProbe *CallProbe = Binary->getCallProbeForAddr(Cur->Address);
+    // We may not find a probe for a merged or external callsite.
+    // Callsite merging may cause the loss of original probe IDs.
+    // Cutting off the context from here since the inliner will
+    // not know how to consume a context with unknown callsites.
+    if (!CallProbe)
+      return false;
+    Stack.push_back(CallProbe);
+    return true;
+  }
+
+  void popFrame() {
+    if (!Stack.empty())
+      Stack.pop_back();
+  }
+  // Use pseudo probe based context key to get the sample counter
+  // A context stands for a call path from 'main' to an uninlined
+  // callee with all inline frames recovered on that path. The probes
+  // belonging to that call path is the probes either originated from
+  // the callee or from any functions inlined into the callee. Since
+  // pseudo probes are organized in a tri-tree style after decoded,
+  // the tree path from the tri-tree root (which is the uninlined
+  // callee) to the probe node forms an inline context.
+  // Here we use a list of probe(pointer) as the context key to speed up
+  // aggregation and the final context string will be generate in
+  // ProfileGenerator
+  std::shared_ptr<ProbeBasedCtxKey> getContextKey();
+};
+
 /*
 As in hybrid sample we have a group of LBRs and the most recent sampling call
 stack, we can walk through those LBRs to infer more call stacks which would be
@@ -351,47 +449,43 @@ range as sample counter for further CS profile generation.
 */
 class VirtualUnwinder {
 public:
-  VirtualUnwinder(ContextSampleCounterMap *Counter) : CtxCounterMap(Counter) {}
+  VirtualUnwinder(ContextSampleCounterMap *Counter, const ProfiledBinary *B)
+      : CtxCounterMap(Counter), Binary(B) {}
+  bool unwind(const HybridSample *Sample, uint64_t Repeat);
 
+private:
   bool isCallState(UnwindState &State) const {
     // The tail call frame is always missing here in stack sample, we will
     // use a specific tail call tracker to infer it.
-    return State.getBinary()->addressIsCall(State.getCurrentLBRSource());
+    return Binary->addressIsCall(State.getCurrentLBRSource());
   }
 
   bool isReturnState(UnwindState &State) const {
     // Simply check addressIsReturn, as ret is always reliable, both for
     // regular call and tail call.
-    return State.getBinary()->addressIsReturn(State.getCurrentLBRSource());
+    return Binary->addressIsReturn(State.getCurrentLBRSource());
   }
 
   void unwindCall(UnwindState &State);
   void unwindLinear(UnwindState &State, uint64_t Repeat);
   void unwindReturn(UnwindState &State);
   void unwindBranchWithinFrame(UnwindState &State);
-  bool unwind(const HybridSample *Sample, uint64_t Repeat);
+
+  template <typename T>
+  void collectSamplesFromFrame(UnwindState::ProfiledFrame *Cur, T &Stack);
+  // Collect each samples on trie node by DFS traversal
+  template <typename T>
+  void collectSamplesFromFrameTrie(UnwindState::ProfiledFrame *Cur, T &Stack);
+  void collectSamplesFromFrameTrie(UnwindState::ProfiledFrame *Cur);
+
   void recordRangeCount(uint64_t Start, uint64_t End, UnwindState &State,
                         uint64_t Repeat);
   void recordBranchCount(const LBREntry &Branch, UnwindState &State,
                          uint64_t Repeat);
-  SampleCounter &getOrCreateCounter(const ProfiledBinary *Binary,
-                                    std::list<uint64_t> &CallStack);
-  // Use pseudo probe based context key to get the sample counter
-  // A context stands for a call path from 'main' to an uninlined
-  // callee with all inline frames recovered on that path. The probes
-  // belonging to that call path is the probes either originated from
-  // the callee or from any functions inlined into the callee. Since
-  // pseudo probes are organized in a tri-tree style after decoded,
-  // the tree path from the tri-tree root (which is the uninlined
-  // callee) to the probe node forms an inline context.
-  // Here we use a list of probe(pointer) as the context key to speed up
-  // aggregation and the final context string will be generate in
-  // ProfileGenerator
-  SampleCounter &getOrCreateCounterForProbe(const ProfiledBinary *Binary,
-                                            std::list<uint64_t> &CallStack);
 
-private:
   ContextSampleCounterMap *CtxCounterMap;
+  // Profiled binary that current frame address belongs to
+  const ProfiledBinary *Binary;
 };
 
 // Filename to binary map
@@ -457,10 +551,11 @@ class PerfReader {
   // Parse the hybrid sample including the call and LBR line
   void parseHybridSample(TraceStream &TraceIt);
   // Extract call stack from the perf trace lines
-  bool extractCallstack(TraceStream &TraceIt, std::list<uint64_t> &CallStack);
+  bool extractCallstack(TraceStream &TraceIt,
+                        SmallVectorImpl<uint64_t> &CallStack);
   // Extract LBR stack from one perf trace line
   bool extractLBRStack(TraceStream &TraceIt,
-                       SmallVector<LBREntry, 16> &LBRStack,
+                       SmallVectorImpl<LBREntry> &LBRStack,
                        ProfiledBinary *Binary);
   void checkAndSetPerfType(cl::list<std::string> &PerfTraceFilenames);
   // Post process the profile after trace aggregation, we will do simple range
diff --git a/llvm/tools/llvm-profgen/ProfiledBinary.cpp b/llvm/tools/llvm-profgen/ProfiledBinary.cpp
index 16ef04aba99e..2c6cedf57649 100644
--- a/llvm/tools/llvm-profgen/ProfiledBinary.cpp
+++ b/llvm/tools/llvm-profgen/ProfiledBinary.cpp
@@ -126,13 +126,13 @@ bool ProfiledBinary::inlineContextEqual(uint64_t Address1,
                     Context2.begin(), Context2.begin() + Context2.size() - 1);
 }
 
-std::string
-ProfiledBinary::getExpandedContextStr(const std::list<uint64_t> &Stack) const {
+std::string ProfiledBinary::getExpandedContextStr(
+    const SmallVectorImpl<uint64_t> &Stack) const {
   std::string ContextStr;
   SmallVector<std::string, 16> ContextVec;
   // Process from frame root to leaf
-  for (auto Iter = Stack.rbegin(); Iter != Stack.rend(); Iter++) {
-    uint64_t Offset = virtualAddrToOffset(*Iter);
+  for (auto Address : Stack) {
+    uint64_t Offset = virtualAddrToOffset(Address);
     const FrameLocationStack &ExpandedContext = getFrameLocationStack(Offset);
     for (const auto &Loc : ExpandedContext) {
       ContextVec.push_back(getCallSite(Loc));
diff --git a/llvm/tools/llvm-profgen/ProfiledBinary.h b/llvm/tools/llvm-profgen/ProfiledBinary.h
index bc28e58deb9d..f6c7460e186d 100644
--- a/llvm/tools/llvm-profgen/ProfiledBinary.h
+++ b/llvm/tools/llvm-profgen/ProfiledBinary.h
@@ -236,7 +236,8 @@ class ProfiledBinary {
   // Get the context string of the current stack with inline context filled in.
   // It will search the disassembling info stored in Offset2LocStackMap. This is
   // used as the key of function sample map
-  std::string getExpandedContextStr(const std::list<uint64_t> &stack) const;
+  std::string
+  getExpandedContextStr(const SmallVectorImpl<uint64_t> &Stack) const;
 
   const PseudoProbe *getCallProbeForAddr(uint64_t Address) const {
     return ProbeDecoder.getCallProbeForAddr(Address);

From 87c27020cc6466ae33550f1f1f55d5989afaca2e Mon Sep 17 00:00:00 2001
From: wlei <wlei@fb.com>
Date: Thu, 21 Jan 2021 09:36:32 -0800
Subject: [PATCH 140/318] [CSSPGO][llvm-profgen] Merge and trim profile for
 cold context to reduce profile size

This change allows merging and trimming cold context profile in llvm-profgen to solve profile size bloat problem. Currently when the profile's total sample is below threshold(supported by a switch), it will be considered cold and merged into a base context-less profile, which will at least keep the profile quality as good as the baseline(non-cs).

For example, two input profiles:
 [main @ foo @ bar]:60
 [main @ bar]:50
Under threshold = 100, the two profiles will be merge into one with the base context, get result:
 [bar]:110

Added two switches:
`--csprof-cold-thres=<value>`: Specified the total samples threshold for a context profile to be considered cold, with 100 being the default. Any cold context profiles will be merged into context-less base profile by default.
`--csprof-keep-cold`: Force profile generation to keep cold context profiles instead of dropping them. By default, any cold context will not be written to output profile.

Results:
Though not yet evaluating it with the latest CSSPGO, our internal branch shows neutral on performance but significantly reduce the profile size. Detailed evaluation on llvm-profgen with CSSPGO will come later.

Differential Revision: https://reviews.llvm.org/D94111
---
 .../tools/llvm-profgen/inline-cs-noprobe.test |  2 +-
 .../llvm-profgen/inline-cs-pseudoprobe.test   |  2 +-
 .../llvm-profgen/merge-cold-profile.test      | 70 +++++++++++++++++++
 .../llvm-profgen/noinline-cs-noprobe.test     |  2 +-
 .../llvm-profgen/noinline-cs-pseudoprobe.test |  2 +-
 .../recursion-compression-noprobe.test        |  4 +-
 .../recursion-compression-pseudoprobe.test    |  4 +-
 llvm/tools/llvm-profgen/ProfileGenerator.cpp  | 57 +++++++++++++++
 llvm/tools/llvm-profgen/ProfileGenerator.h    |  8 ++-
 9 files changed, 142 insertions(+), 9 deletions(-)
 create mode 100644 llvm/test/tools/llvm-profgen/merge-cold-profile.test

diff --git a/llvm/test/tools/llvm-profgen/inline-cs-noprobe.test b/llvm/test/tools/llvm-profgen/inline-cs-noprobe.test
index 98767a9b29b7..943832ebef10 100644
--- a/llvm/test/tools/llvm-profgen/inline-cs-noprobe.test
+++ b/llvm/test/tools/llvm-profgen/inline-cs-noprobe.test
@@ -1,4 +1,4 @@
-; RUN: llvm-profgen --perfscript=%S/Inputs/inline-cs-noprobe.perfscript --binary=%S/Inputs/inline-cs-noprobe.perfbin --output=%t --show-unwinder-output | FileCheck %s --check-prefix=CHECK-UNWINDER
+; RUN: llvm-profgen --perfscript=%S/Inputs/inline-cs-noprobe.perfscript --binary=%S/Inputs/inline-cs-noprobe.perfbin --output=%t --show-unwinder-output --csprof-cold-thres=0 | FileCheck %s --check-prefix=CHECK-UNWINDER
 ; RUN: FileCheck %s --input-file %t
 
 ; CHECK:[main:1 @ foo]:44:0
diff --git a/llvm/test/tools/llvm-profgen/inline-cs-pseudoprobe.test b/llvm/test/tools/llvm-profgen/inline-cs-pseudoprobe.test
index 19928322a66d..c7aa1dea21bb 100644
--- a/llvm/test/tools/llvm-profgen/inline-cs-pseudoprobe.test
+++ b/llvm/test/tools/llvm-profgen/inline-cs-pseudoprobe.test
@@ -1,4 +1,4 @@
-; RUN: llvm-profgen --perfscript=%S/Inputs/inline-cs-pseudoprobe.perfscript --binary=%S/Inputs/inline-cs-pseudoprobe.perfbin --output=%t --show-unwinder-output | FileCheck %s --check-prefix=CHECK-UNWINDER
+; RUN: llvm-profgen --perfscript=%S/Inputs/inline-cs-pseudoprobe.perfscript --binary=%S/Inputs/inline-cs-pseudoprobe.perfbin --output=%t --show-unwinder-output --csprof-cold-thres=0 | FileCheck %s --check-prefix=CHECK-UNWINDER
 ; RUN: FileCheck %s --input-file %t
 
 ; CHECK:     [main:2 @ foo]:74:0
diff --git a/llvm/test/tools/llvm-profgen/merge-cold-profile.test b/llvm/test/tools/llvm-profgen/merge-cold-profile.test
new file mode 100644
index 000000000000..e0c65ac44e2b
--- /dev/null
+++ b/llvm/test/tools/llvm-profgen/merge-cold-profile.test
@@ -0,0 +1,70 @@
+; Used the data from recursion-compression.test, refer it for the unmerged output
+; RUN: llvm-profgen --perfscript=%S/Inputs/recursion-compression-pseudoprobe.perfscript --binary=%S/Inputs/recursion-compression-pseudoprobe.perfbin --output=%t --compress-recursion=-1 --csprof-cold-thres=8
+; RUN: FileCheck %s --input-file %t
+
+; Test --csprof-keep-cold
+; RUN: llvm-profgen --perfscript=%S/Inputs/recursion-compression-pseudoprobe.perfscript --binary=%S/Inputs/recursion-compression-pseudoprobe.perfbin --output=%t --compress-recursion=-1 --csprof-cold-thres=100 --csprof-keep-cold
+; RUN: FileCheck %s --input-file %t --check-prefix=CHECK-KEEP-COLD
+
+; CHECK:     [fa]:14:4
+; CHECK-NEXT: 1: 4
+; CHECK-NEXT: 3: 4
+; CHECK-NEXT: 4: 2
+; CHECK-NEXT: 5: 1
+; CHECK-NEXT: 7: 2 fb:2
+; CHECK-NEXT: 8: 1 fa:1
+; CHECK-NEXT: !CFGChecksum: 120515930909
+; CHECK-NEXT:[main:2 @ foo:5 @ fa:8 @ fa:7 @ fb:5 @ fb]:13:4
+; CHECK-NEXT: 1: 4
+; CHECK-NEXT: 2: 3
+; CHECK-NEXT: 3: 1
+; CHECK-NEXT: 5: 4 fb:4
+; CHECK-NEXT: 6: 1 fa:1
+; CHECK-NEXT: !CFGChecksum: 72617220756
+
+; CHECK-KEEP-COLD:     [fb]:19:6
+; CHECK-KEEP-COLD-NEXT: 1: 6
+; CHECK-KEEP-COLD-NEXT: 2: 3
+; CHECK-KEEP-COLD-NEXT: 3: 3
+; CHECK-KEEP-COLD-NEXT: 5: 4 fb:4
+; CHECK-KEEP-COLD-NEXT: 6: 3 fa:3
+; CHECK-KEEP-COLD-NEXT: !CFGChecksum: 72617220756
+; CHECK-KEEP-COLD-NEXT:[fa]:14:4
+; CHECK-KEEP-COLD-NEXT: 1: 4
+; CHECK-KEEP-COLD-NEXT: 3: 4
+; CHECK-KEEP-COLD-NEXT: 4: 2
+; CHECK-KEEP-COLD-NEXT: 5: 1
+; CHECK-KEEP-COLD-NEXT: 7: 2 fb:2
+; CHECK-KEEP-COLD-NEXT: 8: 1 fa:1
+; CHECK-KEEP-COLD-NEXT: !CFGChecksum: 120515930909
+
+
+; clang -O3 -fexperimental-new-pass-manager -fuse-ld=lld -fpseudo-probe-for-profiling
+; -fno-omit-frame-pointer -mno-omit-leaf-frame-pointer -Xclang -mdisable-tail-calls
+; -g test.c  -o a.out
+
+; Copied from recursion-compression.test
+#include <stdio.h>
+
+int fb(int n) {
+  if(n > 10) return fb(n / 2);
+  return fa(n - 1);
+}
+
+int fa(int n) {
+  if(n < 2) return n;
+  if(n % 2) return fb(n - 1);
+  return fa(n - 1);
+}
+
+void foo() {
+  int s, i = 0;
+  while (i++ < 10000)
+    s += fa(i);
+  printf("sum is %d\n", s);
+}
+
+int main() {
+  foo();
+  return 0;
+}
diff --git a/llvm/test/tools/llvm-profgen/noinline-cs-noprobe.test b/llvm/test/tools/llvm-profgen/noinline-cs-noprobe.test
index 9beecb271fc0..2e60883afa62 100644
--- a/llvm/test/tools/llvm-profgen/noinline-cs-noprobe.test
+++ b/llvm/test/tools/llvm-profgen/noinline-cs-noprobe.test
@@ -1,4 +1,4 @@
-; RUN: llvm-profgen --perfscript=%S/Inputs/noinline-cs-noprobe.perfscript --binary=%S/Inputs/noinline-cs-noprobe.perfbin --output=%t --show-unwinder-output | FileCheck %s --check-prefix=CHECK-UNWINDER
+; RUN: llvm-profgen --perfscript=%S/Inputs/noinline-cs-noprobe.perfscript --binary=%S/Inputs/noinline-cs-noprobe.perfbin --output=%t --show-unwinder-output --csprof-cold-thres=0 | FileCheck %s --check-prefix=CHECK-UNWINDER
 ; RUN: FileCheck %s --input-file %t
 
 ; CHECK:[main:1 @ foo:3 @ bar]:12:3
diff --git a/llvm/test/tools/llvm-profgen/noinline-cs-pseudoprobe.test b/llvm/test/tools/llvm-profgen/noinline-cs-pseudoprobe.test
index 0491a62ff69b..a0e5507c70dd 100644
--- a/llvm/test/tools/llvm-profgen/noinline-cs-pseudoprobe.test
+++ b/llvm/test/tools/llvm-profgen/noinline-cs-pseudoprobe.test
@@ -1,4 +1,4 @@
-; RUN: llvm-profgen --perfscript=%S/Inputs/noinline-cs-pseudoprobe.perfscript --binary=%S/Inputs/noinline-cs-pseudoprobe.perfbin --output=%t --show-unwinder-output | FileCheck %s --check-prefix=CHECK-UNWINDER
+; RUN: llvm-profgen --perfscript=%S/Inputs/noinline-cs-pseudoprobe.perfscript --binary=%S/Inputs/noinline-cs-pseudoprobe.perfbin --output=%t --show-unwinder-output --csprof-cold-thres=0 | FileCheck %s --check-prefix=CHECK-UNWINDER
 ; RUN: FileCheck %s --input-file %t
 
 ; CHECK:     [main:2 @ foo]:75:0
diff --git a/llvm/test/tools/llvm-profgen/recursion-compression-noprobe.test b/llvm/test/tools/llvm-profgen/recursion-compression-noprobe.test
index 47e0a51a4261..43f495398bb0 100644
--- a/llvm/test/tools/llvm-profgen/recursion-compression-noprobe.test
+++ b/llvm/test/tools/llvm-profgen/recursion-compression-noprobe.test
@@ -1,7 +1,7 @@
 ; Firstly test uncompression(--compress-recursion=0)
-; RUN: llvm-profgen --perfscript=%S/Inputs/recursion-compression-noprobe.perfscript --binary=%S/Inputs/recursion-compression-noprobe.perfbin --output=%t --compress-recursion=0
+; RUN: llvm-profgen --perfscript=%S/Inputs/recursion-compression-noprobe.perfscript --binary=%S/Inputs/recursion-compression-noprobe.perfbin --output=%t --compress-recursion=0 --csprof-cold-thres=0
 ; RUN: FileCheck %s --input-file %t -check-prefix=CHECK-UNCOMPRESS
-; RUN: llvm-profgen --perfscript=%S/Inputs/recursion-compression-noprobe.perfscript --binary=%S/Inputs/recursion-compression-noprobe.perfbin --output=%t
+; RUN: llvm-profgen --perfscript=%S/Inputs/recursion-compression-noprobe.perfscript --binary=%S/Inputs/recursion-compression-noprobe.perfbin --output=%t --csprof-cold-thres=0
 ; RUN: FileCheck %s --input-file %t
 
 ; CHECK-UNCOMPRESS:[main:1 @ foo:3 @ fa]:14:0
diff --git a/llvm/test/tools/llvm-profgen/recursion-compression-pseudoprobe.test b/llvm/test/tools/llvm-profgen/recursion-compression-pseudoprobe.test
index 86afe6c632bd..0d4e7dbb1dd4 100644
--- a/llvm/test/tools/llvm-profgen/recursion-compression-pseudoprobe.test
+++ b/llvm/test/tools/llvm-profgen/recursion-compression-pseudoprobe.test
@@ -1,7 +1,7 @@
 ; Firstly test uncompression(--compress-recursion=0)
-; RUN: llvm-profgen --perfscript=%S/Inputs/recursion-compression-pseudoprobe.perfscript --binary=%S/Inputs/recursion-compression-pseudoprobe.perfbin --output=%t --compress-recursion=0
+; RUN: llvm-profgen --perfscript=%S/Inputs/recursion-compression-pseudoprobe.perfscript --binary=%S/Inputs/recursion-compression-pseudoprobe.perfbin --output=%t --compress-recursion=0 --csprof-cold-thres=0
 ; RUN: FileCheck %s --input-file %t -check-prefix=CHECK-UNCOMPRESS
-; RUN: llvm-profgen --perfscript=%S/Inputs/recursion-compression-pseudoprobe.perfscript --binary=%S/Inputs/recursion-compression-pseudoprobe.perfbin --output=%t --show-unwinder-output | FileCheck %s --check-prefix=CHECK-UNWINDER
+; RUN: llvm-profgen --perfscript=%S/Inputs/recursion-compression-pseudoprobe.perfscript --binary=%S/Inputs/recursion-compression-pseudoprobe.perfbin --output=%t --show-unwinder-output --csprof-cold-thres=0 | FileCheck %s --check-prefix=CHECK-UNWINDER
 ; RUN: FileCheck %s --input-file %t
 
 ; CHECK-UNCOMPRESS: [main:2 @ foo:5 @ fa:8 @ fa:7 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:6 @ fa:8 @ fa:7 @ fb:6 @ fa]:4:1
diff --git a/llvm/tools/llvm-profgen/ProfileGenerator.cpp b/llvm/tools/llvm-profgen/ProfileGenerator.cpp
index f769bd592f87..b2a8d60d5caf 100644
--- a/llvm/tools/llvm-profgen/ProfileGenerator.cpp
+++ b/llvm/tools/llvm-profgen/ProfileGenerator.cpp
@@ -29,6 +29,19 @@ static cl::opt<int32_t, true> RecursionCompression(
     cl::Hidden,
     cl::location(llvm::sampleprof::CSProfileGenerator::MaxCompressionSize));
 
+static cl::opt<uint64_t> CSProfColdThres(
+    "csprof-cold-thres", cl::init(100), cl::ZeroOrMore,
+    cl::desc("Specify the total samples threshold for a context profile to "
+             "be considered cold, any cold profiles will be merged into "
+             "context-less base profiles"));
+
+static cl::opt<bool> CSProfKeepCold(
+    "csprof-keep-cold", cl::init(false), cl::ZeroOrMore,
+    cl::desc("This works together with --csprof-cold-thres. If the total count "
+             "of the profile after all merge is done is still smaller than the "
+             "csprof-cold-thres, it will be trimmed unless csprof-keep-cold "
+             "flag is specified."));
+
 using namespace llvm;
 using namespace sampleprof;
 
@@ -68,6 +81,7 @@ void ProfileGenerator::write() {
   if (std::error_code EC = WriterOrErr.getError())
     exitWithError(EC, OutputFilename);
   auto Writer = std::move(WriterOrErr.get());
+  mergeAndTrimColdProfile(ProfileMap);
   Writer->write(ProfileMap);
 }
 
@@ -329,6 +343,49 @@ void CSProfileGenerator::populateInferredFunctionSamples() {
   }
 }
 
+void CSProfileGenerator::mergeAndTrimColdProfile(
+    StringMap<FunctionSamples> &ProfileMap) {
+  // Nothing to merge if sample threshold is zero
+  if (!CSProfColdThres)
+    return;
+
+  // Filter the cold profiles from ProfileMap and move them into a tmp
+  // container
+  std::vector<std::pair<StringRef, const FunctionSamples *>> ToRemoveVec;
+  for (const auto &I : ProfileMap) {
+    const FunctionSamples &FunctionProfile = I.second;
+    if (FunctionProfile.getTotalSamples() >= CSProfColdThres)
+      continue;
+    ToRemoveVec.emplace_back(I.getKey(), &I.second);
+  }
+
+  // Remove the code profile from ProfileMap and merge them into BaseProileMap
+  StringMap<FunctionSamples> BaseProfileMap;
+  for (const auto &I : ToRemoveVec) {
+    auto Ret =
+        BaseProfileMap.try_emplace(I.second->getName(), FunctionSamples());
+    FunctionSamples &BaseProfile = Ret.first->second;
+    BaseProfile.merge(*I.second);
+    ProfileMap.erase(I.first);
+  }
+
+  // Merge the base profiles into ProfileMap;
+  for (const auto &I : BaseProfileMap) {
+    // Filter the cold base profile
+    if (!CSProfKeepCold && I.second.getTotalSamples() < CSProfColdThres &&
+        ProfileMap.find(I.getKey()) == ProfileMap.end())
+      continue;
+    // Merge the profile if the original profile exists, otherwise just insert
+    // as a new profile
+    FunctionSamples &OrigProfile = getFunctionProfileForContext(I.getKey());
+    StringRef TmpName = OrigProfile.getName();
+    OrigProfile.merge(I.second);
+    // Should use the name ref from ProfileMap's key to avoid name being freed
+    // from BaseProfileMap
+    OrigProfile.setName(TmpName);
+  }
+}
+
 // Helper function to extract context prefix string stack
 // Extract context stack for reusing, leaf context stack will
 // be added compressed while looking up function profile
diff --git a/llvm/tools/llvm-profgen/ProfileGenerator.h b/llvm/tools/llvm-profgen/ProfileGenerator.h
index 14e58fc9c895..9cb04c4de34d 100644
--- a/llvm/tools/llvm-profgen/ProfileGenerator.h
+++ b/llvm/tools/llvm-profgen/ProfileGenerator.h
@@ -28,7 +28,10 @@ class ProfileGenerator {
   create(const BinarySampleCounterMap &BinarySampleCounters,
          enum PerfScriptType SampleType);
   virtual void generateProfile() = 0;
-
+  // Merge and trim profile with cold context before serialization,
+  // only eligible for CS profile
+  virtual void
+  mergeAndTrimColdProfile(StringMap<FunctionSamples> &ProfileMap){};
   // Use SampleProfileWriter to serialize profile map
   void write();
 
@@ -200,6 +203,9 @@ class CSProfileGenerator : public ProfileGenerator {
 protected:
   // Lookup or create FunctionSamples for the context
   FunctionSamples &getFunctionProfileForContext(StringRef ContextId);
+  // Merge cold context profile whose total sample is below threshold
+  // into base profile.
+  void mergeAndTrimColdProfile(StringMap<FunctionSamples> &ProfileMap) override;
 
 private:
   // Helper function for updating body sample for a leaf location in

From db88d92217f185d9ab5b8f0a0eddc5dc9ad30659 Mon Sep 17 00:00:00 2001
From: wlei <wlei@fb.com>
Date: Wed, 20 Jan 2021 17:54:03 -0800
Subject: [PATCH 141/318] [CSSPGO][llvm-profgen] Fix bug with parsing hybrid
 sample trace line

when we skip the call stack starting with an external address, we should also skip the bottom LBR entry, otherwise it will cause a truncated context issue.

Reviewed By: hoy, wenlei

Differential Revision: https://reviews.llvm.org/D95480
---
 .../llvm-profgen/Inputs/inline-cs-noprobe.perfscript |  6 ++++++
 llvm/tools/llvm-profgen/PerfReader.cpp               | 12 +++++++-----
 2 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/llvm/test/tools/llvm-profgen/Inputs/inline-cs-noprobe.perfscript b/llvm/test/tools/llvm-profgen/Inputs/inline-cs-noprobe.perfscript
index 7ef76dcd3884..116bd0a2c4c1 100644
--- a/llvm/test/tools/llvm-profgen/Inputs/inline-cs-noprobe.perfscript
+++ b/llvm/test/tools/llvm-profgen/Inputs/inline-cs-noprobe.perfscript
@@ -1,5 +1,11 @@
 PERF_RECORD_MMAP2 2854748/2854748: [0x400000(0x1000) @ 0 00:1d 123291722 526021]: r-xp /home/inline-cs-noprobe.perfbin
 
+; test for an external or invalid top address, should skip the whole sample
+
+	        ffffffff
+	          40067e
+	5541f689495641d7
+ 0x4006c8/0x40067e/P/-/-/0  0x4006c8/0x40067e/P/-/-/0  0x4006c8/0x40067e/P/-/-/0  0x4006c8/0x40067e/P/-/-/0  0x4006c8/0x40067e/P/-/-/0  0x4006c8/0x40067e/P/-/-/0  0x4006c8/0x40067e/P/-/-/0  0x4006c8/0x40067e/P/-/-/0  0x4006c8/0x40067e/P/-/-/0  0x4006c8/0x40067e/P/-/-/0  0x4006c8/0x40067e/P/-/-/0  0x4006c8/0x40067e/P/-/-/0  0x40069b/0x400670/M/-/-/0  0x4006c8/0x40067e/P/-/-/0  0x4006c8/0x40067e/P/-/-/0  0x4006c8/0x40067e/P/-/-/0
 
 	          40067e
 	5541f689495641d7
diff --git a/llvm/tools/llvm-profgen/PerfReader.cpp b/llvm/tools/llvm-profgen/PerfReader.cpp
index 787bde28400f..e59d8d93381b 100644
--- a/llvm/tools/llvm-profgen/PerfReader.cpp
+++ b/llvm/tools/llvm-profgen/PerfReader.cpp
@@ -437,11 +437,12 @@ bool PerfReader::extractCallstack(TraceStream &TraceIt,
   ProfiledBinary *Binary = nullptr;
   while (!TraceIt.isAtEoF() && !TraceIt.getCurrentLine().startswith(" 0x")) {
     StringRef FrameStr = TraceIt.getCurrentLine().ltrim();
-    // We might get an empty line at the beginning or comments, skip it
     uint64_t FrameAddr = 0;
     if (FrameStr.getAsInteger(16, FrameAddr)) {
+      // We might parse a non-perf sample line like empty line and comments,
+      // skip it
       TraceIt.advance();
-      break;
+      return false;
     }
     TraceIt.advance();
     if (!Binary) {
@@ -468,9 +469,9 @@ bool PerfReader::extractCallstack(TraceStream &TraceIt,
     CallStack.emplace_back(FrameAddr);
   }
 
-  if (CallStack.empty())
-    return false;
   // Skip other unrelated line, find the next valid LBR line
+  // Note that even for empty call stack, we should skip the address at the
+  // bottom, otherwise the following pass may generate a truncated callstack
   while (!TraceIt.isAtEoF() && !TraceIt.getCurrentLine().startswith(" 0x")) {
     TraceIt.advance();
   }
@@ -482,7 +483,8 @@ bool PerfReader::extractCallstack(TraceStream &TraceIt,
   // of such case - when sample landed in prolog/epilog, somehow stack
   // walking will be broken in an unexpected way that higher frames will be
   // missing.
-  return !Binary->addressInPrologEpilog(CallStack.front());
+  return !CallStack.empty() &&
+         !Binary->addressInPrologEpilog(CallStack.front());
 }
 
 void PerfReader::parseHybridSample(TraceStream &TraceIt) {

From 10712791a9affbea8e6fa474d8a857ea6dfbb955 Mon Sep 17 00:00:00 2001
From: Wenlei He <aktoon@gmail.com>
Date: Wed, 3 Feb 2021 13:27:35 -0800
Subject: [PATCH 142/318] [CSSPGO] Use merged base profile for hot threshold
 calculation

Context-sensitive profile effectively split a function profile into many copies each representing the CFG profile of a particular calling context. That makes the count distribution looks more flat as we now have more function profiles each with lower counts, which in turn leads to lower hot thresholds. Now we tells threshold computation to merge context profile first before calculating percentile based cutoffs to compensate for seemingly flat context profile. This can be controlled by swtich `sample-profile-contextless-threshold`.

Earlier measurement showed ~0.4% perf boost with this tuning on spec2k6 for CSSPGO (with pseudo-probe and new inliner).

Differential Revision: https://reviews.llvm.org/D95980
---
 llvm/include/llvm/ProfileData/ProfileCommon.h |   3 +
 .../lib/ProfileData/ProfileSummaryBuilder.cpp |  34 ++++
 llvm/lib/ProfileData/SampleProfReader.cpp     |   6 +-
 llvm/lib/ProfileData/SampleProfWriter.cpp     |   6 +-
 .../Transforms/SampleProfile/csspgo-inline.ll |   1 -
 .../SampleProfile/csspgo-summary.ll           | 153 ++++++++++++++++++
 6 files changed, 192 insertions(+), 11 deletions(-)
 create mode 100644 llvm/test/Transforms/SampleProfile/csspgo-summary.ll

diff --git a/llvm/include/llvm/ProfileData/ProfileCommon.h b/llvm/include/llvm/ProfileData/ProfileCommon.h
index 6bb5825339ae..55b94b2e690d 100644
--- a/llvm/include/llvm/ProfileData/ProfileCommon.h
+++ b/llvm/include/llvm/ProfileData/ProfileCommon.h
@@ -17,6 +17,7 @@
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/IR/ProfileSummary.h"
 #include "llvm/ProfileData/InstrProf.h"
+#include "llvm/ProfileData/SampleProf.h"
 #include "llvm/Support/Error.h"
 #include <algorithm>
 #include <cstdint>
@@ -89,6 +90,8 @@ class SampleProfileSummaryBuilder final : public ProfileSummaryBuilder {
 
   void addRecord(const sampleprof::FunctionSamples &FS,
                  bool isCallsiteSample = false);
+  std::unique_ptr<ProfileSummary> computeSummaryForProfiles(
+      const StringMap<sampleprof::FunctionSamples> &Profiles);
   std::unique_ptr<ProfileSummary> getSummary();
 };
 
diff --git a/llvm/lib/ProfileData/ProfileSummaryBuilder.cpp b/llvm/lib/ProfileData/ProfileSummaryBuilder.cpp
index d2603097c550..0e03aa50173d 100644
--- a/llvm/lib/ProfileData/ProfileSummaryBuilder.cpp
+++ b/llvm/lib/ProfileData/ProfileSummaryBuilder.cpp
@@ -18,9 +18,14 @@
 #include "llvm/ProfileData/ProfileCommon.h"
 #include "llvm/ProfileData/SampleProf.h"
 #include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
 
 using namespace llvm;
 
+cl::opt<bool> UseContextLessSummary(
+    "profile-summary-contextless", cl::Hidden, cl::init(false), cl::ZeroOrMore,
+    cl::desc("Merge context profiles before calculating thresholds."));
+
 // A set of cutoff values. Each value, when divided by ProfileSummary::Scale
 // (which is 1000000) is a desired percentile of total counts.
 static const uint32_t DefaultCutoffsData[] = {
@@ -111,6 +116,35 @@ std::unique_ptr<ProfileSummary> SampleProfileSummaryBuilder::getSummary() {
       MaxFunctionCount, NumCounts, NumFunctions);
 }
 
+std::unique_ptr<ProfileSummary>
+SampleProfileSummaryBuilder::computeSummaryForProfiles(
+    const StringMap<sampleprof::FunctionSamples> &Profiles) {
+  assert(NumFunctions == 0 &&
+         "This can only be called on an empty summary builder");
+  StringMap<sampleprof::FunctionSamples> ContextLessProfiles;
+  const StringMap<sampleprof::FunctionSamples> *ProfilesToUse = &Profiles;
+  // For CSSPGO, context-sensitive profile effectively split a function profile
+  // into many copies each representing the CFG profile of a particular calling
+  // context. That makes the count distribution looks more flat as we now have
+  // more function profiles each with lower counts, which in turn leads to lower
+  // hot thresholds. To compensate for that, by defauly we merge context
+  // profiles before coumputing profile summary.
+  if (UseContextLessSummary || (sampleprof::FunctionSamples::ProfileIsCS &&
+                                !UseContextLessSummary.getNumOccurrences())) {
+    for (const auto &I : Profiles) {
+      ContextLessProfiles[I.second.getName()].merge(I.second);
+    }
+    ProfilesToUse = &ContextLessProfiles;
+  }
+
+  for (const auto &I : *ProfilesToUse) {
+    const sampleprof::FunctionSamples &Profile = I.second;
+    addRecord(Profile);
+  }
+
+  return getSummary();
+}
+
 std::unique_ptr<ProfileSummary> InstrProfSummaryBuilder::getSummary() {
   computeDetailedSummary();
   return std::make_unique<ProfileSummary>(
diff --git a/llvm/lib/ProfileData/SampleProfReader.cpp b/llvm/lib/ProfileData/SampleProfReader.cpp
index 370ffc8e2885..38cbca844c87 100644
--- a/llvm/lib/ProfileData/SampleProfReader.cpp
+++ b/llvm/lib/ProfileData/SampleProfReader.cpp
@@ -1610,9 +1610,5 @@ SampleProfileReader::create(std::unique_ptr<MemoryBuffer> &B, LLVMContext &C,
 // profile. Binary format has the profile summary in its header.
 void SampleProfileReader::computeSummary() {
   SampleProfileSummaryBuilder Builder(ProfileSummaryBuilder::DefaultCutoffs);
-  for (const auto &I : Profiles) {
-    const FunctionSamples &Profile = I.second;
-    Builder.addRecord(Profile);
-  }
-  Summary = Builder.getSummary();
+  Summary = Builder.computeSummaryForProfiles(Profiles);
 }
diff --git a/llvm/lib/ProfileData/SampleProfWriter.cpp b/llvm/lib/ProfileData/SampleProfWriter.cpp
index d3bc05e06fdf..b388b78dfaca 100644
--- a/llvm/lib/ProfileData/SampleProfWriter.cpp
+++ b/llvm/lib/ProfileData/SampleProfWriter.cpp
@@ -752,9 +752,5 @@ SampleProfileWriter::create(std::unique_ptr<raw_ostream> &OS,
 void SampleProfileWriter::computeSummary(
     const StringMap<FunctionSamples> &ProfileMap) {
   SampleProfileSummaryBuilder Builder(ProfileSummaryBuilder::DefaultCutoffs);
-  for (const auto &I : ProfileMap) {
-    const FunctionSamples &Profile = I.second;
-    Builder.addRecord(Profile);
-  }
-  Summary = Builder.getSummary();
+  Summary = Builder.computeSummaryForProfiles(ProfileMap);
 }
diff --git a/llvm/test/Transforms/SampleProfile/csspgo-inline.ll b/llvm/test/Transforms/SampleProfile/csspgo-inline.ll
index 14e916d8c2e8..8303ac299318 100644
--- a/llvm/test/Transforms/SampleProfile/csspgo-inline.ll
+++ b/llvm/test/Transforms/SampleProfile/csspgo-inline.ll
@@ -30,7 +30,6 @@
 
 ; INLINE-NEW-LIMIT1-NOT: remark
 
-; INLINE-NEW-LIMIT2: remark: merged.cpp:27:11: _Z8funcLeafi inlined into _Z5funcAi to match profiling context with (cost={{[0-9]+}}, threshold={{[0-9]+}}) at callsite _Z5funcAi:1:11
 ; INLINE-NEW-LIMIT2: remark: merged.cpp:33:11: _Z8funcLeafi inlined into _Z5funcBi to match profiling context with (cost={{[0-9]+}}, threshold={{[0-9]+}}) at callsite _Z5funcBi:1:11
 ; INLINE-NEW-LIMIT2-NOT: remark
 
diff --git a/llvm/test/Transforms/SampleProfile/csspgo-summary.ll b/llvm/test/Transforms/SampleProfile/csspgo-summary.ll
new file mode 100644
index 000000000000..42ecf399abdb
--- /dev/null
+++ b/llvm/test/Transforms/SampleProfile/csspgo-summary.ll
@@ -0,0 +1,153 @@
+; Test for CSSPGO's profile summary computation with and without pre-merging context profiles
+
+; RUN: opt < %s -passes=sample-profile,print-profile-summary -sample-profile-file=%S/Inputs/profile-context-tracker.prof -profile-summary-cutoff-hot=999900 -profile-sample-accurate -profile-summary-contextless=0 -S -o /dev/null 2>&1 | FileCheck %s --check-prefix=SUMMARY-UNMERGED
+; RUN: opt < %s -passes=sample-profile,print-profile-summary -sample-profile-file=%S/Inputs/profile-context-tracker.prof -profile-summary-cutoff-hot=999900 -profile-sample-accurate -profile-summary-contextless=1 -S -o /dev/null 2>&1 | FileCheck %s --check-prefix=SUMMARY-MERGED
+
+; SUMMARY-UNMERGED: main :hot entry
+; SUMMARY-MERGED-NOT: main :hot entry
+
+
+@factor = dso_local global i32 3, align 4, !dbg !0
+
+define dso_local i32 @main() local_unnamed_addr #0 !dbg !18 {
+entry:
+  br label %for.body, !dbg !25
+
+for.cond.cleanup:                                 ; preds = %for.body
+  ret i32 %add3, !dbg !27
+
+for.body:                                         ; preds = %for.body, %entry
+  %x.011 = phi i32 [ 300000, %entry ], [ %dec, %for.body ]
+  %r.010 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
+  %call = tail call i32 @_Z5funcBi(i32 %x.011), !dbg !32
+  %add = add nuw nsw i32 %x.011, 1, !dbg !31
+  %call1 = tail call i32 @_Z5funcAi(i32 %add), !dbg !28
+  %add2 = add i32 %call, %r.010, !dbg !34
+  %add3 = add i32 %add2, %call1, !dbg !35
+  %dec = add nsw i32 %x.011, -1, !dbg !36
+  %cmp = icmp eq i32 %x.011, 0, !dbg !38
+  br i1 %cmp, label %for.cond.cleanup, label %for.body, !dbg !25
+}
+
+define dso_local i32 @_Z5funcAi(i32 %x) local_unnamed_addr #1 !dbg !40 {
+entry:
+  %add = add nsw i32 %x, 100000, !dbg !44
+  %call = tail call i32 @_Z8funcLeafi(i32 %add), !dbg !45
+  ret i32 %call, !dbg !46
+}
+
+define dso_local i32 @_Z8funcLeafi(i32 %x) local_unnamed_addr #1 !dbg !54 {
+entry:
+  %cmp = icmp sgt i32 %x, 0, !dbg !57
+  br i1 %cmp, label %while.body, label %while.cond2.preheader, !dbg !59
+
+while.cond2.preheader:                            ; preds = %entry
+  %cmp313 = icmp slt i32 %x, 0, !dbg !60
+  br i1 %cmp313, label %while.body4, label %if.end, !dbg !63
+
+while.body:                                       ; preds = %while.body, %entry
+  %x.addr.016 = phi i32 [ %sub, %while.body ], [ %x, %entry ]
+  %tmp = load volatile i32, i32* @factor, align 4, !dbg !64
+  %call = tail call i32 @_Z3fibi(i32 %tmp), !dbg !67
+  %sub = sub nsw i32 %x.addr.016, %call, !dbg !68
+  %cmp1 = icmp sgt i32 %sub, 0, !dbg !69
+  br i1 %cmp1, label %while.body, label %if.end, !dbg !71
+
+while.body4:                                      ; preds = %while.body4, %while.cond2.preheader
+  %x.addr.114 = phi i32 [ %add, %while.body4 ], [ %x, %while.cond2.preheader ]
+  %tmp1 = load volatile i32, i32* @factor, align 4, !dbg !72
+  %call5 = tail call i32 @_Z3fibi(i32 %tmp1), !dbg !74
+  %add = add nsw i32 %call5, %x.addr.114, !dbg !75
+  %cmp3 = icmp slt i32 %add, 0, !dbg !60
+  br i1 %cmp3, label %while.body4, label %if.end, !dbg !63
+
+if.end:                                           ; preds = %while.body4, %while.body, %while.cond2.preheader
+  %x.addr.2 = phi i32 [ 0, %while.cond2.preheader ], [ %sub, %while.body ], [ %add, %while.body4 ]
+  ret i32 %x.addr.2, !dbg !76
+}
+
+define dso_local i32 @_Z5funcBi(i32 %x) local_unnamed_addr #0 !dbg !47 {
+entry:
+  %sub = add nsw i32 %x, -100000, !dbg !51
+  %call = tail call i32 @_Z8funcLeafi(i32 %sub), !dbg !52
+  ret i32 %call, !dbg !53
+}
+
+declare i32 @_Z3fibi(i32)
+
+attributes #0 = { nofree noinline norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" "use-sample-profile" }
+attributes #1 = { nofree norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" "use-sample-profile" }
+
+!llvm.dbg.cu = !{!2}
+!llvm.module.flags = !{!14, !15, !16}
+!llvm.ident = !{!17}
+
+!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression())
+!1 = distinct !DIGlobalVariable(name: "factor", scope: !2, file: !3, line: 21, type: !13, isLocal: false, isDefinition: true)
+!2 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !3, producer: "clang version 11.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !4, retainedTypes: !5, globals: !12, splitDebugInlining: false, debugInfoForProfiling: true, nameTableKind: None)
+!3 = !DIFile(filename: "merged.cpp", directory: "/local/autofdo")
+!4 = !{}
+!5 = !{!6, !10, !11}
+!6 = !DISubprogram(name: "funcA", linkageName: "_Z5funcAi", scope: !3, file: !3, line: 6, type: !7, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized, retainedNodes: !4)
+!7 = !DISubroutineType(types: !8)
+!8 = !{!9, !9}
+!9 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!10 = !DISubprogram(name: "funcB", linkageName: "_Z5funcBi", scope: !3, file: !3, line: 7, type: !7, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized, retainedNodes: !4)
+!11 = !DISubprogram(name: "funcLeaf", linkageName: "_Z8funcLeafi", scope: !3, file: !3, line: 22, type: !7, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized, retainedNodes: !4)
+!12 = !{!0}
+!13 = !DIDerivedType(tag: DW_TAG_volatile_type, baseType: !9)
+!14 = !{i32 7, !"Dwarf Version", i32 4}
+!15 = !{i32 2, !"Debug Info Version", i32 3}
+!16 = !{i32 1, !"wchar_size", i32 4}
+!17 = !{!"clang version 11.0.0"}
+!18 = distinct !DISubprogram(name: "main", scope: !3, file: !3, line: 11, type: !19, scopeLine: 11, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2, retainedNodes: !21)
+!19 = !DISubroutineType(types: !20)
+!20 = !{!9}
+!21 = !{!22, !23}
+!22 = !DILocalVariable(name: "r", scope: !18, file: !3, line: 12, type: !9)
+!23 = !DILocalVariable(name: "x", scope: !24, file: !3, line: 13, type: !9)
+!24 = distinct !DILexicalBlock(scope: !18, file: !3, line: 13, column: 3)
+!25 = !DILocation(line: 13, column: 3, scope: !26)
+!26 = !DILexicalBlockFile(scope: !24, file: !3, discriminator: 2)
+!27 = !DILocation(line: 17, column: 3, scope: !18)
+!28 = !DILocation(line: 14, column: 10, scope: !29)
+!29 = distinct !DILexicalBlock(scope: !30, file: !3, line: 13, column: 37)
+!30 = distinct !DILexicalBlock(scope: !24, file: !3, line: 13, column: 3)
+!31 = !DILocation(line: 14, column: 29, scope: !29)
+!32 = !DILocation(line: 14, column: 21, scope: !33)
+!33 = !DILexicalBlockFile(scope: !29, file: !3, discriminator: 2)
+!34 = !DILocation(line: 14, column: 19, scope: !29)
+!35 = !DILocation(line: 14, column: 7, scope: !29)
+!36 = !DILocation(line: 13, column: 33, scope: !37)
+!37 = !DILexicalBlockFile(scope: !30, file: !3, discriminator: 6)
+!38 = !DILocation(line: 13, column: 26, scope: !39)
+!39 = !DILexicalBlockFile(scope: !30, file: !3, discriminator: 2)
+!40 = distinct !DISubprogram(name: "funcA", linkageName: "_Z5funcAi", scope: !3, file: !3, line: 26, type: !7, scopeLine: 26, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
+!44 = !DILocation(line: 27, column: 22, scope: !40)
+!45 = !DILocation(line: 27, column: 11, scope: !40)
+!46 = !DILocation(line: 29, column: 3, scope: !40)
+!47 = distinct !DISubprogram(name: "funcB", linkageName: "_Z5funcBi", scope: !3, file: !3, line: 32, type: !7, scopeLine: 32, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
+!51 = !DILocation(line: 33, column: 22, scope: !47)
+!52 = !DILocation(line: 33, column: 11, scope: !47)
+!53 = !DILocation(line: 35, column: 3, scope: !47)
+!54 = distinct !DISubprogram(name: "funcLeaf", linkageName: "_Z8funcLeafi", scope: !3, file: !3, line: 48, type: !7, scopeLine: 48, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
+!57 = !DILocation(line: 49, column: 9, scope: !58)
+!58 = distinct !DILexicalBlock(scope: !54, file: !3, line: 49, column: 7)
+!59 = !DILocation(line: 49, column: 7, scope: !54)
+!60 = !DILocation(line: 58, column: 14, scope: !61)
+!61 = !DILexicalBlockFile(scope: !62, file: !3, discriminator: 2)
+!62 = distinct !DILexicalBlock(scope: !58, file: !3, line: 56, column: 8)
+!63 = !DILocation(line: 58, column: 5, scope: !61)
+!64 = !DILocation(line: 52, column: 16, scope: !65)
+!65 = distinct !DILexicalBlock(scope: !66, file: !3, line: 51, column: 19)
+!66 = distinct !DILexicalBlock(scope: !58, file: !3, line: 49, column: 14)
+!67 = !DILocation(line: 52, column: 12, scope: !65)
+!68 = !DILocation(line: 52, column: 9, scope: !65)
+!69 = !DILocation(line: 51, column: 14, scope: !70)
+!70 = !DILexicalBlockFile(scope: !66, file: !3, discriminator: 2)
+!71 = !DILocation(line: 51, column: 5, scope: !70)
+!72 = !DILocation(line: 59, column: 16, scope: !73)
+!73 = distinct !DILexicalBlock(scope: !62, file: !3, line: 58, column: 19)
+!74 = !DILocation(line: 59, column: 12, scope: !73)
+!75 = !DILocation(line: 59, column: 9, scope: !73)
+!76 = !DILocation(line: 63, column: 3, scope: !54)

From e8e45f52d0a8268fe3ee2a3a2afc80bc10a47280 Mon Sep 17 00:00:00 2001
From: Hongtao Yu <hoy@fb.com>
Date: Sun, 7 Feb 2021 22:49:20 -0800
Subject: [PATCH 143/318] [CSSPGO] Unblock optimizations with pseudo probe
 instrumentation.

The IR/MIR pseudo probe intrinsics don't get materialized into real machine instructions and therefore they don't incur runtime cost directly. However, they come with indirect cost by blocking certain optimizations. Some of the blocking are intentional (such as blocking code merge) for better counts quality while the others are accidental. This change unblocks perf-critical optimizations that do not affect counts quality. They include:

1. IR InstCombine, sinking load operation to shorten lifetimes.
2. MIR LiveRangeShrink, similar to #1
3. MIR TwoAddressInstructionPass, i.e, opeq transform
4. MIR function argument copy elision
5. IR stack protection. (though not perf-critical but nice to have).

Reviewed By: wmi

Differential Revision: https://reviews.llvm.org/D95982
---
 llvm/include/llvm/CodeGen/MachineInstr.h      |  7 ++
 llvm/include/llvm/IR/Instruction.h            |  3 +
 llvm/lib/CodeGen/LiveRangeShrink.cpp          |  3 +-
 llvm/lib/CodeGen/MachineInstr.cpp             |  3 +-
 .../SelectionDAG/SelectionDAGBuilder.cpp      |  5 +-
 llvm/lib/CodeGen/StackProtector.cpp           |  2 +-
 .../lib/CodeGen/TwoAddressInstructionPass.cpp |  8 +--
 llvm/lib/IR/Instruction.cpp                   |  4 ++
 llvm/lib/Transforms/IPO/FunctionAttrs.cpp     |  7 ++
 .../Transforms/InstCombine/InstCombinePHI.cpp |  8 ++-
 .../InstCombine/InstructionCombining.cpp      |  7 +-
 .../SampleProfile/pseudo-probe-instcombine.ll | 66 +++++++++++++++++++
 .../SampleProfile/pseudo-probe-instsched.ll   | 33 ++++++++++
 .../SampleProfile/pseudo-probe-peep.ll        | 29 ++++++++
 .../SampleProfile/pseudo-probe-twoaddr.ll     | 37 +++++++++++
 15 files changed, 209 insertions(+), 13 deletions(-)
 create mode 100644 llvm/test/Transforms/SampleProfile/pseudo-probe-instcombine.ll
 create mode 100644 llvm/test/Transforms/SampleProfile/pseudo-probe-instsched.ll
 create mode 100644 llvm/test/Transforms/SampleProfile/pseudo-probe-peep.ll
 create mode 100644 llvm/test/Transforms/SampleProfile/pseudo-probe-twoaddr.ll

diff --git a/llvm/include/llvm/CodeGen/MachineInstr.h b/llvm/include/llvm/CodeGen/MachineInstr.h
index 6bbe2d03f9e5..f8d97c2c07a6 100644
--- a/llvm/include/llvm/CodeGen/MachineInstr.h
+++ b/llvm/include/llvm/CodeGen/MachineInstr.h
@@ -1156,6 +1156,10 @@ class MachineInstr
     return getOpcode() == TargetOpcode::CFI_INSTRUCTION;
   }
 
+  bool isPseudoProbe() const {
+    return getOpcode() == TargetOpcode::PSEUDO_PROBE;
+  }
+  
   // True if the instruction represents a position in the function.
   bool isPosition() const { return isLabel() || isCFIInstruction(); }
 
@@ -1165,6 +1169,9 @@ class MachineInstr
   bool isDebugInstr() const {
     return isDebugValue() || isDebugLabel() || isDebugRef();
   }
+  bool isDebugOrPseudoInstr() const {
+    return isDebugInstr() || isPseudoProbe();
+  }
 
   bool isDebugOffsetImm() const { return getDebugOffset().isImm(); }
 
diff --git a/llvm/include/llvm/IR/Instruction.h b/llvm/include/llvm/IR/Instruction.h
index 85afaed5225e..b99dc62bbb9d 100644
--- a/llvm/include/llvm/IR/Instruction.h
+++ b/llvm/include/llvm/IR/Instruction.h
@@ -654,6 +654,9 @@ class Instruction : public User,
   /// llvm.lifetime.end marker.
   bool isLifetimeStartOrEnd() const;
 
+  /// Return true if the instruction is a DbgInfoIntrinsic or PseudoProbeInst.
+  bool isDebugOrPseudoInst() const;
+
   /// Return a pointer to the next non-debug instruction in the same basic
   /// block as 'this', or nullptr if no such instruction exists. Skip any pseudo
   /// operations if \c SkipPseudoOp is true.
diff --git a/llvm/lib/CodeGen/LiveRangeShrink.cpp b/llvm/lib/CodeGen/LiveRangeShrink.cpp
index 26439a656917..7fa14fd902ef 100644
--- a/llvm/lib/CodeGen/LiveRangeShrink.cpp
+++ b/llvm/lib/CodeGen/LiveRangeShrink.cpp
@@ -156,7 +156,8 @@ bool LiveRangeShrink::runOnMachineFunction(MachineFunction &MF) {
         // If MI has side effects, it should become a barrier for code motion.
         // IOM is rebuild from the next instruction to prevent later
         // instructions from being moved before this MI.
-        if (MI.hasUnmodeledSideEffects() && Next != MBB.end()) {
+        if (MI.hasUnmodeledSideEffects() && !MI.isPseudoProbe() &&
+            Next != MBB.end()) {
           BuildInstOrderMap(Next, IOM);
           SawStore = false;
         }
diff --git a/llvm/lib/CodeGen/MachineInstr.cpp b/llvm/lib/CodeGen/MachineInstr.cpp
index 59d98054e3a2..b6cfd7dcbfbc 100644
--- a/llvm/lib/CodeGen/MachineInstr.cpp
+++ b/llvm/lib/CodeGen/MachineInstr.cpp
@@ -1462,7 +1462,8 @@ bool MachineInstr::hasUnmodeledSideEffects() const {
 }
 
 bool MachineInstr::isLoadFoldBarrier() const {
-  return mayStore() || isCall() || hasUnmodeledSideEffects();
+  return mayStore() || isCall() ||
+         (hasUnmodeledSideEffects() && !isPseudoProbe());
 }
 
 /// allDefsAreDead - Return true if all the defs of this instruction are dead.
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 6638ff6a6358..a6bd774934ac 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -9660,8 +9660,9 @@ findArgumentCopyElisionCandidates(const DataLayout &DL,
       // We will look through cast uses, so ignore them completely.
       if (I.isCast())
         continue;
-      // Ignore debug info intrinsics, they don't escape or store to allocas.
-      if (isa<DbgInfoIntrinsic>(I))
+      // Ignore debug info and pseudo op intrinsics, they don't escape or store
+      // to allocas.
+      if (I.isDebugOrPseudoInst())
         continue;
       // This is an unknown instruction. Assume it escapes or writes to all
       // static alloca operands.
diff --git a/llvm/lib/CodeGen/StackProtector.cpp b/llvm/lib/CodeGen/StackProtector.cpp
index 0411faabbcc3..8d91afb6e99d 100644
--- a/llvm/lib/CodeGen/StackProtector.cpp
+++ b/llvm/lib/CodeGen/StackProtector.cpp
@@ -192,7 +192,7 @@ bool StackProtector::HasAddressTaken(const Instruction *AI,
       // Ignore intrinsics that do not become real instructions.
       // TODO: Narrow this to intrinsics that have store-like effects.
       const auto *CI = cast<CallInst>(I);
-      if (!isa<DbgInfoIntrinsic>(CI) && !CI->isLifetimeStartOrEnd())
+      if (!CI->isDebugOrPseudoInst() && !CI->isLifetimeStartOrEnd())
         return true;
       break;
     }
diff --git a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
index ecee4aed7f88..2a9132bd2fe0 100644
--- a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
+++ b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
@@ -801,8 +801,8 @@ bool TwoAddressInstructionPass::rescheduleMIBelowKill(
   MachineBasicBlock::iterator KillPos = KillMI;
   ++KillPos;
   for (MachineInstr &OtherMI : make_range(End, KillPos)) {
-    // Debug instructions cannot be counted against the limit.
-    if (OtherMI.isDebugInstr())
+    // Debug or pseudo instructions cannot be counted against the limit.
+    if (OtherMI.isDebugOrPseudoInstr())
       continue;
     if (NumVisited > 10)  // FIXME: Arbitrary limit to reduce compile time cost.
       return false;
@@ -974,8 +974,8 @@ bool TwoAddressInstructionPass::rescheduleKillAboveMI(
   unsigned NumVisited = 0;
   for (MachineInstr &OtherMI :
        make_range(mi, MachineBasicBlock::iterator(KillMI))) {
-    // Debug instructions cannot be counted against the limit.
-    if (OtherMI.isDebugInstr())
+    // Debug or pseudo instructions cannot be counted against the limit.
+    if (OtherMI.isDebugOrPseudoInstr())
       continue;
     if (NumVisited > 10)  // FIXME: Arbitrary limit to reduce compile time cost.
       return false;
diff --git a/llvm/lib/IR/Instruction.cpp b/llvm/lib/IR/Instruction.cpp
index 246180e72172..8e52dd3ddc71 100644
--- a/llvm/lib/IR/Instruction.cpp
+++ b/llvm/lib/IR/Instruction.cpp
@@ -651,6 +651,10 @@ bool Instruction::isLifetimeStartOrEnd() const {
   return ID == Intrinsic::lifetime_start || ID == Intrinsic::lifetime_end;
 }
 
+bool Instruction::isDebugOrPseudoInst() const {
+  return isa<DbgInfoIntrinsic>(this) || isa<PseudoProbeInst>(this);
+}
+
 const Instruction *
 Instruction::getNextNonDebugInstruction(bool SkipPseudoOp) const {
   for (const Instruction *I = getNextNode(); I; I = I->getNextNode())
diff --git a/llvm/lib/Transforms/IPO/FunctionAttrs.cpp b/llvm/lib/Transforms/IPO/FunctionAttrs.cpp
index 256acd7e1d17..6730824e860a 100644
--- a/llvm/lib/Transforms/IPO/FunctionAttrs.cpp
+++ b/llvm/lib/Transforms/IPO/FunctionAttrs.cpp
@@ -149,6 +149,13 @@ static MemoryAccessKind checkFunctionMemoryAccess(Function &F, bool ThisBody,
       if (isNoModRef(MRI))
         continue;
 
+      // A pseudo probe call shouldn't change any function attribute since it
+      // doesn't translate to a real instruction. It comes with a memory access
+      // tag to prevent itself being removed by optimizations and not block
+      // other instructions being optimized.
+      if (isa<PseudoProbeInst>(I))
+        continue;
+
       if (!AliasAnalysis::onlyAccessesArgPointees(MRB)) {
         // The call could access any memory. If that includes writes, note it.
         if (isModSet(MRI))
diff --git a/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp b/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp
index d687ec654438..b211b0813611 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp
@@ -592,8 +592,14 @@ static bool isSafeAndProfitableToSinkLoad(LoadInst *L) {
   BasicBlock::iterator BBI = L->getIterator(), E = L->getParent()->end();
 
   for (++BBI; BBI != E; ++BBI)
-    if (BBI->mayWriteToMemory())
+    if (BBI->mayWriteToMemory()) {
+      // Calls that only access inaccessible memory do not block sinking the
+      // load.
+      if (auto *CB = dyn_cast<CallBase>(BBI))
+        if (CB->onlyAccessesInaccessibleMemory())
+          continue;
       return false;
+    }
 
   // Check for non-address taken alloca.  If not address-taken already, it isn't
   // profitable to do this xform.
diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index 518e909e8ab4..828fd49524ec 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -3878,9 +3878,10 @@ static bool prepareICWorklistFromFunction(Function &F, const DataLayout &DL,
         }
       }
 
-      // Skip processing debug intrinsics in InstCombine. Processing these call instructions
-      // consumes non-trivial amount of time and provides no value for the optimization.
-      if (!isa<DbgInfoIntrinsic>(Inst)) {
+      // Skip processing debug and pseudo intrinsics in InstCombine. Processing
+      // these call instructions consumes non-trivial amount of time and
+      // provides no value for the optimization.
+      if (!Inst->isDebugOrPseudoInst()) {
         InstrsForInstCombineWorklist.push_back(Inst);
         SeenAliasScopes.analyse(Inst);
       }
diff --git a/llvm/test/Transforms/SampleProfile/pseudo-probe-instcombine.ll b/llvm/test/Transforms/SampleProfile/pseudo-probe-instcombine.ll
new file mode 100644
index 000000000000..e5bb7bc541c6
--- /dev/null
+++ b/llvm/test/Transforms/SampleProfile/pseudo-probe-instcombine.ll
@@ -0,0 +1,66 @@
+; RUN: opt -passes=instcombine -S < %s | FileCheck %s
+
+%struct.nonbonded = type { [2 x %struct.CompAtom*], [2 x %struct.CompAtomExt*], [2 x %struct.CompAtom*], [2 x %class.Vector*], [2 x %class.Vector*], [2 x i32], %class.Vector, double*, double*, %class.ComputeNonbondedWorkArrays*, %class.Pairlists*, i32, i32, double, double, i32, i32, i32, i32 }
+%struct.CompAtomExt = type { i32 }
+%struct.CompAtom = type { %class.Vector, float, i16, i8, i8 }
+%class.Vector = type { double, double, double }
+%class.ComputeNonbondedWorkArrays = type { %class.ResizeArray, %class.ResizeArray.0, %class.ResizeArray, %class.ResizeArray, %class.ResizeArray, %class.ResizeArray, %class.ResizeArray, %class.ResizeArray, %class.ResizeArray, %class.ResizeArray, %class.ResizeArray, %class.ResizeArray, %class.ResizeArray, %class.ResizeArray, %class.ResizeArray, %class.ResizeArray, %class.ResizeArray, %class.ResizeArray.2, %class.ResizeArray.2 }
+%class.ResizeArray.0 = type { i32 (...)**, %class.ResizeArrayRaw.1* }
+%class.ResizeArrayRaw.1 = type <{ double*, i8*, i32, i32, i32, float, i32, [4 x i8] }>
+%class.ResizeArray = type { i32 (...)**, %class.ResizeArrayRaw* }
+%class.ResizeArrayRaw = type <{ i16*, i8*, i32, i32, i32, float, i32, [4 x i8] }>
+%class.ResizeArray.2 = type { i32 (...)**, %class.ResizeArrayRaw.3* }
+%class.ResizeArrayRaw.3 = type <{ %class.Vector*, i8*, i32, i32, i32, float, i32, [4 x i8] }>
+%class.Pairlists = type { i16*, i32, i32 }
+
+;; Check the minPart4 and minPart assignments are merged.
+; CHECK-COUNT-1: getelementptr inbounds %struct.nonbonded, %struct.nonbonded* %params, i64 0, i32 16
+; CHECK-NOT: getelementptr inbounds %struct.nonbonded, %struct.nonbonded* %params, i64 0, i32 16
+
+define dso_local void @_ZN20ComputeNonbondedUtil9calc_pairEP9nonbonded(%struct.nonbonded* nocapture readonly %params) local_unnamed_addr align 2 {
+entry:
+  %savePairlists3 = getelementptr inbounds %struct.nonbonded, %struct.nonbonded* %params, i64 0, i32 11
+  %0 = load i32, i32* %savePairlists3, align 8
+  %usePairlists4 = getelementptr inbounds %struct.nonbonded, %struct.nonbonded* %params, i64 0, i32 12
+  %1 = load i32, i32* %usePairlists4, align 4
+  %tobool54.not = icmp eq i32 %0, 0
+  br i1 %tobool54.not, label %lor.lhs.false55, label %if.end109
+
+lor.lhs.false55:                                  ; preds = %entry
+  %tobool56.not = icmp eq i32 %1, 0
+  br i1 %tobool56.not, label %if.end109, label %if.end109.thread
+
+if.end109.thread:                                 ; preds = %lor.lhs.false55
+  %minPart4 = getelementptr inbounds %struct.nonbonded, %struct.nonbonded* %params, i64 0, i32 16
+  %2 = load i32, i32* %minPart4, align 4
+  call void @llvm.pseudoprobe(i64 -6172701105289426098, i64 2, i32 0, i64 -1)
+  br label %if.then138
+
+if.end109:                                        ; preds = %lor.lhs.false55, %entry
+  %minPart = getelementptr inbounds %struct.nonbonded, %struct.nonbonded* %params, i64 0, i32 16
+  %3 = load i32, i32* %minPart, align 4
+  call void @llvm.pseudoprobe(i64 -6172701105289426098, i64 3, i32 0, i64 -1)
+  %tobool116.not = icmp eq i32 %1, 0
+  br i1 %tobool116.not, label %if.then117, label %if.then138
+
+if.then117:                                       ; preds = %if.end109
+  ret void
+
+if.then138:                                       ; preds = %if.end109.thread, %if.end109
+  %4 = phi i32 [ %2, %if.end109.thread ], [ %3, %if.end109 ]
+  %tobool139.not = icmp eq i32 %4, 0
+  br i1 %tobool139.not, label %if.else147, label %if.then140
+
+if.then140:                                       ; preds = %if.then138
+  ret void
+
+if.else147:                                       ; preds = %if.then138
+  ret void
+}
+
+declare dso_local void @_ZN9Pairlists8addIndexEv() align 2
+
+; Function Attrs: inaccessiblememonly nounwind willreturn
+declare void @llvm.pseudoprobe(i64, i64, i32, i64) #0
+
+attributes #0 = { inaccessiblememonly nounwind willreturn }
diff --git a/llvm/test/Transforms/SampleProfile/pseudo-probe-instsched.ll b/llvm/test/Transforms/SampleProfile/pseudo-probe-instsched.ll
new file mode 100644
index 000000000000..609af90db610
--- /dev/null
+++ b/llvm/test/Transforms/SampleProfile/pseudo-probe-instsched.ll
@@ -0,0 +1,33 @@
+; PR1075
+; RUN: llc < %s -mcpu=generic -mtriple=x86_64-apple-darwin -pseudo-probe-for-profiling -O3 | FileCheck %s
+
+define float @foo(float %x) #0 {
+  %tmp1 = fmul float %x, 3.000000e+00
+  %tmp3 = fmul float %x, 5.000000e+00
+  %tmp5 = fmul float %x, 7.000000e+00
+  %tmp7 = fmul float %x, 1.100000e+01
+  call void @llvm.pseudoprobe(i64 6699318081062747564, i64 1, i32 0, i64 -1)
+  %tmp10 = fadd float %tmp1, %tmp3
+  %tmp12 = fadd float %tmp10, %tmp5
+  %tmp14 = fadd float %tmp12, %tmp7
+  ret float %tmp14
+; CHECK: mulss
+; CHECK: mulss
+; CHECK: addss
+; CHECK: mulss
+; CHECK: addss
+; CHECK: mulss
+; CHECK: addss
+; CHECK: ret
+}
+
+; Function Attrs: inaccessiblememonly nounwind willreturn
+declare void @llvm.pseudoprobe(i64, i64, i32, i64) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { inaccessiblememonly nounwind willreturn }
+
+!llvm.pseudo_probe_desc = !{!0}
+
+!0 = !{i64 6699318081062747564, i64 4294967295, !"foo", null}
+
diff --git a/llvm/test/Transforms/SampleProfile/pseudo-probe-peep.ll b/llvm/test/Transforms/SampleProfile/pseudo-probe-peep.ll
new file mode 100644
index 000000000000..a1fb25c95936
--- /dev/null
+++ b/llvm/test/Transforms/SampleProfile/pseudo-probe-peep.ll
@@ -0,0 +1,29 @@
+; RUN: llc -mtriple=x86_64-- -stop-after=peephole-opt -o - %s | FileCheck %s
+
+define internal i32 @arc_compare() {
+entry:
+  %0 = load i64, i64* undef, align 8
+  br i1 undef, label %return, label %if.end
+
+if.end:                                           ; preds = %entry
+; Chek a register copy has been sinked into the compare instruction.
+; CHECK: %[[#REG:]]:gr64 = IMPLICIT_DEF 
+; CHECK-NOT: %[[#]]:gr64 = MOV64rm %[[#REG]]
+; CHECK: PSEUDO_PROBE 5116412291814990879, 3, 0, 0
+; CHECK: CMP64mr %[[#REG]], 1
+  call void @llvm.pseudoprobe(i64 5116412291814990879, i64 3, i32 0, i64 -1)
+  %cmp4 = icmp slt i64 %0, undef
+  br i1 %cmp4, label %return, label %if.end6
+
+if.end6:                                          ; preds = %if.end
+  call void @llvm.pseudoprobe(i64 5116412291814990879, i64 5, i32 0, i64 -1)
+  br label %return
+
+return:                                           ; preds = %if.end6, %if.end, %entry
+  ret i32 undef
+}
+
+; Function Attrs: inaccessiblememonly nounwind willreturn
+declare void @llvm.pseudoprobe(i64, i64, i32, i64) #0
+
+attributes #0 = { inaccessiblememonly nounwind willreturn }
diff --git a/llvm/test/Transforms/SampleProfile/pseudo-probe-twoaddr.ll b/llvm/test/Transforms/SampleProfile/pseudo-probe-twoaddr.ll
new file mode 100644
index 000000000000..81f72d3c5871
--- /dev/null
+++ b/llvm/test/Transforms/SampleProfile/pseudo-probe-twoaddr.ll
@@ -0,0 +1,37 @@
+; RUN: llc -stop-after=twoaddressinstruction -mtriple=x86_64-- -o - %s | FileCheck %s
+
+
+define dso_local double @twoaddressinstruction() local_unnamed_addr {
+for.end:
+  %0 = load i64, i64* undef, align 8
+  br label %for.body14.preheader
+
+for.body14.preheader:                             ; preds = %for.end
+  br i1 undef, label %for.cond25.preheader.loopexit.unr-lcssa, label %for.body14.preheader.new
+
+for.body14.preheader.new:                         ; preds = %for.body14.preheader
+  %unroll_iter136 = and i64 %0, -4
+  br label %for.body14
+
+for.cond25.preheader.loopexit.unr-lcssa:          ; preds = %for.body14, %for.body14.preheader
+  %indvars.iv127.unr = phi i64 [ 1, %for.body14.preheader ], [ %indvars.iv.next128.3, %for.body14 ]
+  ret double undef
+
+for.body14:                                       ; preds = %for.body14, %for.body14.preheader.new
+  %indvars.iv127 = phi i64 [ 1, %for.body14.preheader.new ], [ %indvars.iv.next128.3, %for.body14 ]
+  %niter137 = phi i64 [ %unroll_iter136, %for.body14.preheader.new ], [ %niter137.nsub.3, %for.body14 ]
+  %indvars.iv.next128.3 = add nuw nsw i64 %indvars.iv127, 4
+; CHECK: PSEUDO_PROBE -6878943695821059507, 9, 0, 0
+  call void @llvm.pseudoprobe(i64 -6878943695821059507, i64 9, i32 0, i64 -1)
+;; Check an opeq form of instruction is created.
+; CHECK: %[[#REG:]]:gr64_nosp = COPY killed %[[#]]
+; CHECK: %[[#REG]]:gr64_nosp = nuw ADD64ri8 %[[#REG]], 4, implicit-def dead $eflags
+  %niter137.nsub.3 = add i64 %niter137, -4
+  %niter137.ncmp.3 = icmp eq i64 %niter137.nsub.3, 0
+  br i1 %niter137.ncmp.3, label %for.cond25.preheader.loopexit.unr-lcssa, label %for.body14
+}
+
+; Function Attrs: inaccessiblememonly nounwind willreturn
+declare void @llvm.pseudoprobe(i64, i64, i32, i64) #0
+
+attributes #0 = { inaccessiblememonly nounwind willreturn }
\ No newline at end of file

From 1a5bb1e4d540303554c0e891389f699956e5e03b Mon Sep 17 00:00:00 2001
From: Hongtao Yu <hoy@fb.com>
Date: Wed, 10 Feb 2021 14:40:47 -0800
Subject: [PATCH 144/318] [CSSPGO] Restrict pseudo probe tests to x86_64 only.

---
 llvm/test/Transforms/SampleProfile/pseudo-probe-instsched.ll | 4 ++--
 llvm/test/Transforms/SampleProfile/pseudo-probe-peep.ll      | 1 +
 llvm/test/Transforms/SampleProfile/pseudo-probe-twoaddr.ll   | 1 +
 3 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/llvm/test/Transforms/SampleProfile/pseudo-probe-instsched.ll b/llvm/test/Transforms/SampleProfile/pseudo-probe-instsched.ll
index 609af90db610..9d89cad43aa7 100644
--- a/llvm/test/Transforms/SampleProfile/pseudo-probe-instsched.ll
+++ b/llvm/test/Transforms/SampleProfile/pseudo-probe-instsched.ll
@@ -1,5 +1,5 @@
-; PR1075
-; RUN: llc < %s -mcpu=generic -mtriple=x86_64-apple-darwin -pseudo-probe-for-profiling -O3 | FileCheck %s
+; REQUIRES: x86_64-linux
+; RUN: llc < %s -mcpu=generic -mtriple=x86_64-- -pseudo-probe-for-profiling -O3 | FileCheck %s
 
 define float @foo(float %x) #0 {
   %tmp1 = fmul float %x, 3.000000e+00
diff --git a/llvm/test/Transforms/SampleProfile/pseudo-probe-peep.ll b/llvm/test/Transforms/SampleProfile/pseudo-probe-peep.ll
index a1fb25c95936..d94dac4de95d 100644
--- a/llvm/test/Transforms/SampleProfile/pseudo-probe-peep.ll
+++ b/llvm/test/Transforms/SampleProfile/pseudo-probe-peep.ll
@@ -1,3 +1,4 @@
+; REQUIRES: x86_64-linux
 ; RUN: llc -mtriple=x86_64-- -stop-after=peephole-opt -o - %s | FileCheck %s
 
 define internal i32 @arc_compare() {
diff --git a/llvm/test/Transforms/SampleProfile/pseudo-probe-twoaddr.ll b/llvm/test/Transforms/SampleProfile/pseudo-probe-twoaddr.ll
index 81f72d3c5871..31b471ea08fd 100644
--- a/llvm/test/Transforms/SampleProfile/pseudo-probe-twoaddr.ll
+++ b/llvm/test/Transforms/SampleProfile/pseudo-probe-twoaddr.ll
@@ -1,3 +1,4 @@
+; REQUIRES: x86_64-linux
 ; RUN: llc -stop-after=twoaddressinstruction -mtriple=x86_64-- -o - %s | FileCheck %s
 
 
From 1f5e2016be9a01e4294dcdd10b3c7b03826b26a1 Mon Sep 17 00:00:00 2001
From: Hongtao Yu <hoy@fb.com>
Date: Tue, 9 Feb 2021 09:17:20 -0800
Subject: [PATCH 145/318] [CSSPGO] Process functions in a top-down order on a
 dynamic call graph.

Functions are currently processed by the sample profiler loader in a top-down order defined by the static call graph. The order is being adjusted to be a top-down order based on the input context-sensitive profile. One benefit is that the processing order of caller and callee in one SCC would follow the context order in the profile to favor more inlining. Another benefit is that the processing order of caller and callee through an indirect call (which is not on the static call graph) can be honored which in turn allows for more inlining.

The profile top-down order for SCC is also extended to support non-CS profiles.

Two switches `-mllvm -use-profile-indirect-call-edges` and `-mllvm -use-profile-top-down-order` are being introduced.

Reviewed By: wmi

Differential Revision: https://reviews.llvm.org/D95988
---
 .../Transforms/IPO/SampleContextTracker.h     |  13 +-
 .../Transforms/IPO/SampleContextTracker.cpp   |  32 +++
 llvm/lib/Transforms/IPO/SampleProfile.cpp     | 137 ++++++++++++-
 .../Inputs/profile-context-order.prof         |  38 ++++
 .../Inputs/profile-topdown-order.prof         |  36 ++++
 .../SampleProfile/profile-context-order.ll    | 190 ++++++++++++++++++
 .../SampleProfile/profile-topdown-order.ll    | 179 +++++++++++++++++
 7 files changed, 620 insertions(+), 5 deletions(-)
 create mode 100644 llvm/test/Transforms/SampleProfile/Inputs/profile-context-order.prof
 create mode 100644 llvm/test/Transforms/SampleProfile/Inputs/profile-topdown-order.prof
 create mode 100644 llvm/test/Transforms/SampleProfile/profile-context-order.ll
 create mode 100644 llvm/test/Transforms/SampleProfile/profile-topdown-order.ll

diff --git a/llvm/include/llvm/Transforms/IPO/SampleContextTracker.h b/llvm/include/llvm/Transforms/IPO/SampleContextTracker.h
index 526e141838c4..da0bdae0eaee 100644
--- a/llvm/include/llvm/Transforms/IPO/SampleContextTracker.h
+++ b/llvm/include/llvm/Transforms/IPO/SampleContextTracker.h
@@ -18,6 +18,7 @@
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/StringRef.h"
+#include "llvm/Analysis/CallGraph.h"
 #include "llvm/IR/DebugInfoMetadata.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/ProfileData/SampleProf.h"
@@ -90,6 +91,8 @@ class ContextTrieNode {
 // calling context and the context is identified by path from root to the node.
 class SampleContextTracker {
 public:
+  using ContextSamplesTy = SmallSet<FunctionSamples *, 16>;
+
   SampleContextTracker(StringMap<FunctionSamples> &Profiles);
   // Query context profile for a specific callee with given name at a given
   // call-site. The full context is identified by location of call instruction.
@@ -103,6 +106,9 @@ class SampleContextTracker {
   FunctionSamples *getContextSamplesFor(const DILocation *DIL);
   // Query context profile for a given sample contxt of a function.
   FunctionSamples *getContextSamplesFor(const SampleContext &Context);
+  // Get all context profile for given function.
+  ContextSamplesTy &getAllContextSamplesFor(const Function &Func);
+  ContextSamplesTy &getAllContextSamplesFor(StringRef Name);
   // Query base profile for a given function. A base profile is a merged view
   // of all context profiles for contexts that are not inlined.
   FunctionSamples *getBaseSamplesFor(const Function &Func,
@@ -113,6 +119,9 @@ class SampleContextTracker {
   // This makes sure that inlined context profile will be excluded in
   // function's base profile.
   void markContextSamplesInlined(const FunctionSamples *InlinedSamples);
+  void promoteMergeContextSamplesTree(const Instruction &Inst,
+                                      StringRef CalleeName);
+  void addCallGraphEdges(CallGraph &CG, StringMap<Function *> &SymbolMap);
   // Dump the internal context profile trie.
   void dump();
 
@@ -126,8 +135,6 @@ class SampleContextTracker {
   ContextTrieNode *getTopLevelContextNode(StringRef FName);
   ContextTrieNode &addTopLevelContextNode(StringRef FName);
   ContextTrieNode &promoteMergeContextSamplesTree(ContextTrieNode &NodeToPromo);
-  void promoteMergeContextSamplesTree(const Instruction &Inst,
-                                      StringRef CalleeName);
   void mergeContextNode(ContextTrieNode &FromNode, ContextTrieNode &ToNode,
                         StringRef ContextStrToRemove);
   ContextTrieNode &promoteMergeContextSamplesTree(ContextTrieNode &FromNode,
@@ -135,7 +142,7 @@ class SampleContextTracker {
                                                   StringRef ContextStrToRemove);
 
   // Map from function name to context profiles (excluding base profile)
-  StringMap<SmallSet<FunctionSamples *, 16>> FuncToCtxtProfileSet;
+  StringMap<ContextSamplesTy> FuncToCtxtProfileSet;
 
   // Root node for context trie tree
   ContextTrieNode RootContext;
diff --git a/llvm/lib/Transforms/IPO/SampleContextTracker.cpp b/llvm/lib/Transforms/IPO/SampleContextTracker.cpp
index 41d7f363e1a4..158fa0771c3b 100644
--- a/llvm/lib/Transforms/IPO/SampleContextTracker.cpp
+++ b/llvm/lib/Transforms/IPO/SampleContextTracker.cpp
@@ -263,6 +263,17 @@ SampleContextTracker::getContextSamplesFor(const SampleContext &Context) {
   return Node->getFunctionSamples();
 }
 
+SampleContextTracker::ContextSamplesTy &
+SampleContextTracker::getAllContextSamplesFor(const Function &Func) {
+  StringRef CanonName = FunctionSamples::getCanonicalFnName(Func);
+  return FuncToCtxtProfileSet[CanonName];
+}
+
+SampleContextTracker::ContextSamplesTy &
+SampleContextTracker::getAllContextSamplesFor(StringRef Name) {
+  return FuncToCtxtProfileSet[Name];
+}
+
 FunctionSamples *SampleContextTracker::getBaseSamplesFor(const Function &Func,
                                                          bool MergeContext) {
   StringRef CanonName = FunctionSamples::getCanonicalFnName(Func);
@@ -550,4 +561,25 @@ ContextTrieNode &SampleContextTracker::promoteMergeContextSamplesTree(
   return *ToNode;
 }
 
+// Replace call graph edges with dynamic call edges from the profile.
+void SampleContextTracker::addCallGraphEdges(CallGraph &CG,
+                                             StringMap<Function *> &SymbolMap) {
+  // Add profile call edges to the call graph.
+  std::queue<ContextTrieNode *> NodeQueue;
+  NodeQueue.push(&RootContext);
+  while (!NodeQueue.empty()) {
+    ContextTrieNode *Node = NodeQueue.front();
+    NodeQueue.pop();
+    Function *F = SymbolMap.lookup(Node->getFuncName());
+    for (auto &I : Node->getAllChildContext()) {
+      ContextTrieNode *ChildNode = &I.second;
+      NodeQueue.push(ChildNode);
+      if (F && !F->isDeclaration()) {
+        Function *Callee = SymbolMap.lookup(ChildNode->getFuncName());
+        if (Callee && !Callee->isDeclaration())
+          CG[F]->addCalledFunction(nullptr, CG[Callee]);
+      }
+    }
+  }
+}
 } // namespace llvm
diff --git a/llvm/lib/Transforms/IPO/SampleProfile.cpp b/llvm/lib/Transforms/IPO/SampleProfile.cpp
index b2a9127773c3..a6a419bfe742 100644
--- a/llvm/lib/Transforms/IPO/SampleProfile.cpp
+++ b/llvm/lib/Transforms/IPO/SampleProfile.cpp
@@ -177,6 +177,16 @@ static cl::opt<bool> ProfileTopDownLoad(
              "order of call graph during sample profile loading. It only "
              "works for new pass manager. "));
 
+static cl::opt<bool> UseProfileIndirectCallEdges(
+    "use-profile-indirect-call-edges", cl::init(true), cl::Hidden,
+    cl::desc("Considering indirect call samples from profile when top-down "
+             "processing functions. Only CSSPGO is supported."));
+
+static cl::opt<bool> UseProfileTopDownOrder(
+    "use-profile-top-down-order", cl::init(false), cl::Hidden,
+    cl::desc("Process functions in one SCC in a top-down order "
+             "based on the input profile."));
+
 static cl::opt<bool> ProfileSizeInline(
     "sample-profile-inline-size", cl::Hidden, cl::init(false),
     cl::desc("Inline cold call sites in profile loader if it's beneficial "
@@ -458,6 +468,8 @@ class SampleProfileLoader {
   uint64_t visitEdge(Edge E, unsigned *NumUnknownEdges, Edge *UnknownEdge);
   void buildEdges(Function &F);
   std::vector<Function *> buildFunctionOrder(Module &M, CallGraph *CG);
+  void addCallGraphEdges(CallGraph &CG, const FunctionSamples &Samples);
+  void replaceCallGraphEdges(CallGraph &CG, StringMap<Function *> &SymbolMap);
   bool propagateThroughEdges(Function &F, bool UpdateBlockCount);
   void computeDominanceAndLoopInfo(Function &F);
   void clearFunctionData();
@@ -2278,6 +2290,45 @@ INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
 INITIALIZE_PASS_END(SampleProfileLoaderLegacyPass, "sample-profile",
                     "Sample Profile loader", false, false)
 
+// Add inlined profile call edges to the call graph.
+void SampleProfileLoader::addCallGraphEdges(CallGraph &CG,
+                                            const FunctionSamples &Samples) {
+  Function *Caller = SymbolMap.lookup(Samples.getFuncName());
+  if (!Caller || Caller->isDeclaration())
+    return;
+
+  // Skip non-inlined call edges which are not important since top down inlining
+  // for non-CS profile is to get more precise profile matching, not to enable
+  // more inlining.
+
+  for (const auto &CallsiteSamples : Samples.getCallsiteSamples()) {
+    for (const auto &InlinedSamples : CallsiteSamples.second) {
+      Function *Callee = SymbolMap.lookup(InlinedSamples.first);
+      if (Callee && !Callee->isDeclaration())
+        CG[Caller]->addCalledFunction(nullptr, CG[Callee]);
+      addCallGraphEdges(CG, InlinedSamples.second);
+    }
+  }
+}
+
+// Replace call graph edges with dynamic call edges from the profile.
+void SampleProfileLoader::replaceCallGraphEdges(
+    CallGraph &CG, StringMap<Function *> &SymbolMap) {
+  // Remove static call edges from the call graph except for the ones from the
+  // root which make the call graph connected.
+  for (const auto &Node : CG)
+    if (Node.second.get() != CG.getExternalCallingNode())
+      Node.second->removeAllCalledFunctions();
+
+  // Add profile call edges to the call graph.
+  if (ProfileIsCS) {
+    ContextTracker->addCallGraphEdges(CG, SymbolMap);
+  } else {
+    for (const auto &Samples : Reader->getProfiles())
+      addCallGraphEdges(CG, Samples.second);
+  }
+}
+
 std::vector<Function *>
 SampleProfileLoader::buildFunctionOrder(Module &M, CallGraph *CG) {
   std::vector<Function *> FunctionOrderList;
@@ -2300,16 +2351,97 @@ SampleProfileLoader::buildFunctionOrder(Module &M, CallGraph *CG) {
   }
 
   assert(&CG->getModule() == &M);
+
+  // Add indirect call edges from profile to augment the static call graph.
+  // Functions will be processed in a top-down order defined by the static call
+  // graph. Adjusting the order by considering indirect call edges from the
+  // profile (which don't exist in the static call graph) can enable the
+  // inlining of indirect call targets by processing the caller before them.
+  // TODO: enable this for non-CS profile and fix the counts returning logic to
+  // have a full support for indirect calls.
+  if (UseProfileIndirectCallEdges && ProfileIsCS) {
+    for (auto &Entry : *CG) {
+      const auto *F = Entry.first;
+      if (!F || F->isDeclaration() || !F->hasFnAttribute("use-sample-profile"))
+        continue;
+      auto &AllContexts = ContextTracker->getAllContextSamplesFor(F->getName());
+      if (AllContexts.empty())
+        continue;
+
+      for (const auto &BB : *F) {
+        for (const auto &I : BB.getInstList()) {
+          const auto *CB = dyn_cast<CallBase>(&I);
+          if (!CB || !CB->isIndirectCall())
+            continue;
+          const DebugLoc &DLoc = I.getDebugLoc();
+          if (!DLoc)
+            continue;
+          auto CallSite = FunctionSamples::getCallSiteIdentifier(DLoc);
+          for (FunctionSamples *Samples : AllContexts) {
+            if (auto CallTargets = Samples->findCallTargetMapAt(CallSite)) {
+              for (const auto &Target : CallTargets.get()) {
+                Function *Callee = SymbolMap.lookup(Target.first());
+                if (Callee && !Callee->isDeclaration())
+                  Entry.second->addCalledFunction(nullptr, (*CG)[Callee]);
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+
+  // Compute a top-down order the profile which is used to sort functions in
+  // one SCC later. The static processing order computed for an SCC may not
+  // reflect the call contexts in the context-sensitive profile, thus may cause
+  // potential inlining to be overlooked. The function order in one SCC is being
+  // adjusted to a top-down order based on the profile to favor more inlining.
+  DenseMap<Function *, uint64_t> ProfileOrderMap;
+  if (UseProfileTopDownOrder ||
+      (ProfileIsCS && !UseProfileTopDownOrder.getNumOccurrences())) {
+    // Create a static call graph. The call edges are not important since they
+    // will be replaced by dynamic edges from the profile.
+    CallGraph ProfileCG(M);
+    replaceCallGraphEdges(ProfileCG, SymbolMap);
+    scc_iterator<CallGraph *> CGI = scc_begin(&ProfileCG);
+    uint64_t I = 0;
+    while (!CGI.isAtEnd()) {
+      for (CallGraphNode *Node : *CGI) {
+        if (auto *F = Node->getFunction())
+          ProfileOrderMap[F] = ++I;
+      }
+      ++CGI;
+    }
+  }
+
   scc_iterator<CallGraph *> CGI = scc_begin(CG);
   while (!CGI.isAtEnd()) {
-    for (CallGraphNode *node : *CGI) {
-      auto F = node->getFunction();
+    uint64_t Start = FunctionOrderList.size();
+    for (CallGraphNode *Node : *CGI) {
+      auto *F = Node->getFunction();
       if (F && !F->isDeclaration() && F->hasFnAttribute("use-sample-profile"))
         FunctionOrderList.push_back(F);
     }
+
+    // Sort nodes in SCC based on the profile top-down order.
+    if (!ProfileOrderMap.empty()) {
+      std::stable_sort(FunctionOrderList.begin() + Start,
+                       FunctionOrderList.end(),
+                       [&ProfileOrderMap](Function *Left, Function *Right) {
+                         return ProfileOrderMap[Left] < ProfileOrderMap[Right];
+                       });
+    }
+
     ++CGI;
   }
 
+  LLVM_DEBUG({
+    dbgs() << "Function processing order:\n";
+    for (auto F : reverse(FunctionOrderList)) {
+      dbgs() << F->getName() << "\n";
+    }
+  });
+
   std::reverse(FunctionOrderList.begin(), FunctionOrderList.end());
   return FunctionOrderList;
 }
@@ -2461,6 +2593,7 @@ bool SampleProfileLoaderLegacyPass::runOnModule(Module &M) {
 }
 
 bool SampleProfileLoader::runOnFunction(Function &F, ModuleAnalysisManager *AM) {
+  LLVM_DEBUG(dbgs() << "\n\nProcessing Function " << F.getName() << "\n");
   DILocation2SampleMap.clear();
   // By default the entry count is initialized to -1, which will be treated
   // conservatively by getEntryCount as the same as unknown (None). This is
diff --git a/llvm/test/Transforms/SampleProfile/Inputs/profile-context-order.prof b/llvm/test/Transforms/SampleProfile/Inputs/profile-context-order.prof
new file mode 100644
index 000000000000..f941b5053ee6
--- /dev/null
+++ b/llvm/test/Transforms/SampleProfile/Inputs/profile-context-order.prof
@@ -0,0 +1,38 @@
+[main:3 @ _Z5funcAi:1 @ _Z8funcLeafi]:1467299:11
+ 0: 6
+ 1: 6
+ 3: 287884
+ 15: 23
+[main:3.1 @ _Z5funcBi:1 @ _Z8funcLeafi]:500853:20
+ 0: 15
+ 1: 15
+ 3: 74946
+ 10: 23324
+ 15: 11
+[main]:154:0
+ 2: 12
+ 3: 18 _Z5funcAi:11
+ 3.1: 18 _Z5funcBi:19
+[external:12 @ main]:154:12
+ 2: 12
+ 3: 10 _Z5funcAi:7
+ 3.1: 10 _Z5funcBi:11
+[main:3.1 @ _Z5funcBi]:120:19
+ 0: 19
+ 1: 19 _Z8funcLeafi:20
+ 3: 12
+[externalA:17 @ _Z5funcBi]:120:3
+ 0: 3
+ 1: 3
+[external:10 @ _Z5funcBi]:120:10
+ 0: 10
+ 1: 10
+[main:3 @ _Z5funcAi]:99:11
+ 0: 10
+ 1: 10 _Z8funcLeafi:11
+ 2: 287864 _Z3fibi:315608
+ 3: 24
+[main:3 @ _Z5funcAi:2 @ _Z3fibi]:287864:315608
+ 0: 362839
+ 1: 6
+ 3: 287884
\ No newline at end of file
diff --git a/llvm/test/Transforms/SampleProfile/Inputs/profile-topdown-order.prof b/llvm/test/Transforms/SampleProfile/Inputs/profile-topdown-order.prof
new file mode 100644
index 000000000000..fa8be3305de0
--- /dev/null
+++ b/llvm/test/Transforms/SampleProfile/Inputs/profile-topdown-order.prof
@@ -0,0 +1,36 @@
+_Z8funcLeafi:500853:20
+ 0: 15
+ 1: 15
+ 3: 74946
+ 10: 23324
+ 15: 11
+main:154:0
+ 2: 12
+ 3: 18 _Z5funcAi:11
+ 3.1: 18 _Z5funcBi:19
+main:154:12
+ 2: 12
+ 3: 10 _Z5funcAi:7
+ 3.1: 10 _Z5funcBi:11
+_Z5funcBi:120:19
+ 0: 19
+ 1: 19 _Z8funcLeafi:20
+ 3: 12
+_Z5funcBi:120:3
+ 0: 3
+ 1: 3
+_Z5funcBi:120:10
+ 0: 10
+ 1: 10
+_Z5funcAi:99:11
+ 0: 10
+ 1: _Z8funcLeafi:40
+   0: 6
+   1: 6
+   3: 2
+   15: 23
+ 2: 315608 _Z3fibi:362839
+   0: 315608
+   1: 6
+   3: 287884
+ 3: 24
diff --git a/llvm/test/Transforms/SampleProfile/profile-context-order.ll b/llvm/test/Transforms/SampleProfile/profile-context-order.ll
new file mode 100644
index 000000000000..a75dcc2179ca
--- /dev/null
+++ b/llvm/test/Transforms/SampleProfile/profile-context-order.ll
@@ -0,0 +1,190 @@
+;; Test for different function processing orders affecting inlining in sample profile loader.
+
+;; There is an SCC _Z5funcAi -> _Z8funcLeafi -> _Z5funcAi in the program.
+;; With -use-profile-top-down-order=0, the top-down processing order of
+;; that SCC is (_Z8funcLeafi, _Z5funcAi), which is determinined based on
+;; the static call graph. With -use-profile-top-down-order=1, call edges
+;; from profile are considered, thus the order becomes (_Z5funcAi, _Z8funcLeafi)
+;; which leads to _Z8funcLeafi inlined into _Z5funcAi.
+; RUN: opt < %s -passes=sample-profile -use-profile-top-down-order=1 -sample-profile-file=%S/Inputs/profile-context-order.prof -S | FileCheck %s -check-prefix=INLINE
+; RUN: opt < %s -passes=sample-profile -use-profile-top-down-order=0 -sample-profile-file=%S/Inputs/profile-context-order.prof -S | FileCheck %s -check-prefix=NOINLINE
+
+;; There is an indirect call _Z5funcAi -> _Z3fibi in the program.
+;; With -use-profile-indirect-call-edges=0, the processing order computed
+;; based on the static call graph is (_Z3fibi, _Z5funcAi). With 
+;; -use-profile-top-down-order=1, the indirect call edge from profile is
+;; considered, thus the order becomes (_Z5funcAi, _Z3fibi) which leads to
+;; _Z3fibi inlined into _Z5funcAi.
+; RUN: opt < %s -passes=sample-profile -use-profile-indirect-call-edges=1 -sample-profile-file=%S/Inputs/profile-context-order.prof -S | FileCheck %s -check-prefix=ICALL-INLINE
+; RUN: opt < %s -passes=sample-profile -use-profile-indirect-call-edges=0 -sample-profile-file=%S/Inputs/profile-context-order.prof -S | FileCheck %s -check-prefix=ICALL-NOINLINE
+
+@factor = dso_local global i32 3, align 4, !dbg !0
+@fp = dso_local global i32 (i32)* null, align 8
+
+define dso_local i32 @main() local_unnamed_addr #0 !dbg !18 {
+entry:
+  store i32 (i32)* @_Z3fibi, i32 (i32)** @fp, align 8, !dbg !25
+  br label %for.body, !dbg !25
+
+for.cond.cleanup:                                 ; preds = %for.body
+  ret i32 %add3, !dbg !27
+
+for.body:                                         ; preds = %for.body, %entry
+  %x.011 = phi i32 [ 300000, %entry ], [ %dec, %for.body ]
+  %r.010 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
+  %call = tail call i32 @_Z5funcBi(i32 %x.011), !dbg !32
+  %add = add nuw nsw i32 %x.011, 1, !dbg !31
+  %call1 = tail call i32 @_Z5funcAi(i32 %add), !dbg !28
+  %add2 = add i32 %call, %r.010, !dbg !34
+  %add3 = add i32 %add2, %call1, !dbg !35
+  %dec = add nsw i32 %x.011, -1, !dbg !36
+  %cmp = icmp eq i32 %x.011, 0, !dbg !38
+  br i1 %cmp, label %for.cond.cleanup, label %for.body, !dbg !25
+}
+
+; INLINE: define dso_local i32 @_Z5funcAi
+; INLINE-NOT: call i32 @_Z8funcLeafi
+; NOINLINE: define dso_local i32 @_Z5funcAi
+; NOINLINE: call i32 @_Z8funcLeafi
+; ICALL-INLINE: define dso_local i32 @_Z5funcAi
+; ICALL-INLINE: call i32 @_Z3foo
+; ICALL-NOINLINE: define dso_local i32 @_Z5funcAi
+; ICALL-NOINLINE-NO: call i32 @_Z3foo
+; ICALL-NOINLINE-NO: call i32 @_Z3fibi
+define dso_local i32 @_Z5funcAi(i32 %x) local_unnamed_addr #0 !dbg !40 {
+entry:
+  %add = add nsw i32 %x, 100000, !dbg !44
+  %0 = load i32 (i32)*, i32 (i32)** @fp, align 8
+  %call = call i32 %0(i32 8), !dbg !45
+  %call1 = tail call i32 @_Z8funcLeafi(i32 %add), !dbg !46
+  ret i32 %call, !dbg !46
+}
+
+; INLINE: define dso_local i32 @_Z8funcLeafi
+; NOINLINE: define dso_local i32 @_Z8funcLeafi
+; ICALL-INLINE: define dso_local i32 @_Z8funcLeafi
+; ICALL-NOINLINE: define dso_local i32 @_Z8funcLeafi
+define dso_local i32 @_Z8funcLeafi(i32 %x) local_unnamed_addr #1 !dbg !54 {
+entry:
+  %cmp = icmp sgt i32 %x, 0, !dbg !57
+  br i1 %cmp, label %while.body, label %while.cond2.preheader, !dbg !59
+
+while.cond2.preheader:                            ; preds = %entry
+  %cmp313 = icmp slt i32 %x, 0, !dbg !60
+  br i1 %cmp313, label %while.body4, label %if.end, !dbg !63
+
+while.body:                                       ; preds = %while.body, %entry
+  %x.addr.016 = phi i32 [ %sub, %while.body ], [ %x, %entry ]
+  %tmp = load volatile i32, i32* @factor, align 4, !dbg !64
+  %call = tail call i32 @_Z5funcAi(i32 %tmp), !dbg !67
+  %sub = sub nsw i32 %x.addr.016, %call, !dbg !68
+  %cmp1 = icmp sgt i32 %sub, 0, !dbg !69
+  br i1 %cmp1, label %while.body, label %if.end, !dbg !71
+
+while.body4:                                      ; preds = %while.body4, %while.cond2.preheader
+  %x.addr.114 = phi i32 [ %add, %while.body4 ], [ %x, %while.cond2.preheader ]
+  %tmp1 = load volatile i32, i32* @factor, align 4, !dbg !72
+  %call5 = tail call i32 @_Z5funcBi(i32 %tmp1), !dbg !74
+  %add = add nsw i32 %call5, %x.addr.114, !dbg !75
+  %cmp3 = icmp slt i32 %add, 0, !dbg !60
+  br i1 %cmp3, label %while.body4, label %if.end, !dbg !63
+
+if.end:                                           ; preds = %while.body4, %while.body, %while.cond2.preheader
+  %x.addr.2 = phi i32 [ 0, %while.cond2.preheader ], [ %sub, %while.body ], [ %add, %while.body4 ]
+  ret i32 %x.addr.2, !dbg !76
+}
+
+define dso_local i32 @_Z5funcBi(i32 %x) local_unnamed_addr #0 !dbg !47 {
+entry:
+  %sub = add nsw i32 %x, -100000, !dbg !51
+  %call = tail call i32 @_Z8funcLeafi(i32 %sub), !dbg !52
+  ret i32 %call, !dbg !53
+}
+
+define dso_local i32 @_Z3fibi(i32 %x) local_unnamed_addr #1 !dbg !77 {
+entry:
+  %sub = add nsw i32 %x, -100000, !dbg !78
+  %call = tail call i32 @_Z3foo(i32 %sub), !dbg !78
+  ret i32 %sub, !dbg !78
+}
+
+declare i32 @_Z3foo(i32)
+
+attributes #0 = { nofree noinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" "use-sample-profile" }
+attributes #1 = { nofree nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" "use-sample-profile" }
+
+!llvm.dbg.cu = !{!2}
+!llvm.module.flags = !{!14, !15, !16}
+!llvm.ident = !{!17}
+
+!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression())
+!1 = distinct !DIGlobalVariable(name: "factor", scope: !2, file: !3, line: 21, type: !13, isLocal: false, isDefinition: true)
+!2 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !3, producer: "clang version 11.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !4, retainedTypes: !5, globals: !12, splitDebugInlining: false, debugInfoForProfiling: true, nameTableKind: None)
+!3 = !DIFile(filename: "merged.cpp", directory: "/local/autofdo")
+!4 = !{}
+!5 = !{!6, !10, !11}
+!6 = !DISubprogram(name: "funcA", linkageName: "_Z5funcAi", scope: !3, file: !3, line: 6, type: !7, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized, retainedNodes: !4)
+!7 = !DISubroutineType(types: !8)
+!8 = !{!9, !9}
+!9 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!10 = !DISubprogram(name: "funcB", linkageName: "_Z5funcBi", scope: !3, file: !3, line: 7, type: !7, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized, retainedNodes: !4)
+!11 = !DISubprogram(name: "funcLeaf", linkageName: "_Z8funcLeafi", scope: !3, file: !3, line: 22, type: !7, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized, retainedNodes: !4)
+!12 = !{!0}
+!13 = !DIDerivedType(tag: DW_TAG_volatile_type, baseType: !9)
+!14 = !{i32 7, !"Dwarf Version", i32 4}
+!15 = !{i32 2, !"Debug Info Version", i32 3}
+!16 = !{i32 1, !"wchar_size", i32 4}
+!17 = !{!"clang version 11.0.0"}
+!18 = distinct !DISubprogram(name: "main", scope: !3, file: !3, line: 11, type: !19, scopeLine: 11, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2, retainedNodes: !21)
+!19 = !DISubroutineType(types: !20)
+!20 = !{!9}
+!21 = !{!22, !23}
+!22 = !DILocalVariable(name: "r", scope: !18, file: !3, line: 12, type: !9)
+!23 = !DILocalVariable(name: "x", scope: !24, file: !3, line: 13, type: !9)
+!24 = distinct !DILexicalBlock(scope: !18, file: !3, line: 13, column: 3)
+!25 = !DILocation(line: 13, column: 3, scope: !26)
+!26 = !DILexicalBlockFile(scope: !24, file: !3, discriminator: 2)
+!27 = !DILocation(line: 17, column: 3, scope: !18)
+!28 = !DILocation(line: 14, column: 10, scope: !29)
+!29 = distinct !DILexicalBlock(scope: !30, file: !3, line: 13, column: 37)
+!30 = distinct !DILexicalBlock(scope: !24, file: !3, line: 13, column: 3)
+!31 = !DILocation(line: 14, column: 29, scope: !29)
+!32 = !DILocation(line: 14, column: 21, scope: !33)
+!33 = !DILexicalBlockFile(scope: !29, file: !3, discriminator: 2)
+!34 = !DILocation(line: 14, column: 19, scope: !29)
+!35 = !DILocation(line: 14, column: 7, scope: !29)
+!36 = !DILocation(line: 13, column: 33, scope: !37)
+!37 = !DILexicalBlockFile(scope: !30, file: !3, discriminator: 6)
+!38 = !DILocation(line: 13, column: 26, scope: !39)
+!39 = !DILexicalBlockFile(scope: !30, file: !3, discriminator: 2)
+!40 = distinct !DISubprogram(name: "funcA", linkageName: "_Z5funcAi", scope: !3, file: !3, line: 26, type: !7, scopeLine: 26, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
+!44 = !DILocation(line: 26, column: 22, scope: !40)
+!45 = !DILocation(line: 28, column: 11, scope: !40)
+!46 = !DILocation(line: 27, column: 3, scope: !40)
+!47 = distinct !DISubprogram(name: "funcB", linkageName: "_Z5funcBi", scope: !3, file: !3, line: 32, type: !7, scopeLine: 32, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
+!51 = !DILocation(line: 33, column: 22, scope: !47)
+!52 = !DILocation(line: 33, column: 11, scope: !47)
+!53 = !DILocation(line: 35, column: 3, scope: !47)
+!54 = distinct !DISubprogram(name: "funcLeaf", linkageName: "_Z8funcLeafi", scope: !3, file: !3, line: 48, type: !7, scopeLine: 48, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
+!57 = !DILocation(line: 49, column: 9, scope: !58)
+!58 = distinct !DILexicalBlock(scope: !54, file: !3, line: 49, column: 7)
+!59 = !DILocation(line: 49, column: 7, scope: !54)
+!60 = !DILocation(line: 58, column: 14, scope: !61)
+!61 = !DILexicalBlockFile(scope: !62, file: !3, discriminator: 2)
+!62 = distinct !DILexicalBlock(scope: !58, file: !3, line: 56, column: 8)
+!63 = !DILocation(line: 58, column: 5, scope: !61)
+!64 = !DILocation(line: 52, column: 16, scope: !65)
+!65 = distinct !DILexicalBlock(scope: !66, file: !3, line: 51, column: 19)
+!66 = distinct !DILexicalBlock(scope: !58, file: !3, line: 49, column: 14)
+!67 = !DILocation(line: 52, column: 12, scope: !65)
+!68 = !DILocation(line: 52, column: 9, scope: !65)
+!69 = !DILocation(line: 51, column: 14, scope: !70)
+!70 = !DILexicalBlockFile(scope: !66, file: !3, discriminator: 2)
+!71 = !DILocation(line: 51, column: 5, scope: !70)
+!72 = !DILocation(line: 59, column: 16, scope: !73)
+!73 = distinct !DILexicalBlock(scope: !62, file: !3, line: 58, column: 19)
+!74 = !DILocation(line: 59, column: 12, scope: !73)
+!75 = !DILocation(line: 59, column: 9, scope: !73)
+!76 = !DILocation(line: 63, column: 3, scope: !54)
+!77 = distinct !DISubprogram(name: "funcB", linkageName: "_Z3fibi", scope: !3, file: !3, line: 32, type: !7, scopeLine: 32, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
+!78 = !DILocation(line: 33, column: 22, scope: !77)
diff --git a/llvm/test/Transforms/SampleProfile/profile-topdown-order.ll b/llvm/test/Transforms/SampleProfile/profile-topdown-order.ll
new file mode 100644
index 000000000000..3a343fa3cc8a
--- /dev/null
+++ b/llvm/test/Transforms/SampleProfile/profile-topdown-order.ll
@@ -0,0 +1,179 @@
+;; Test for different function processing orders affecting inlining in sample profile loader.
+
+;; There is an SCC _Z5funcAi -> _Z8funcLeafi -> _Z5funcAi in the program.
+;; With -use-profile-top-down-order=0, the top-down processing order of
+;; that SCC is (_Z8funcLeafi, _Z5funcAi), which is determinined based on
+;; the static call graph. With -use-profile-top-down-order=1, call edges
+;; from profile are considered, thus the order becomes (_Z5funcAi, _Z8funcLeafi).
+;; While _Z8funcLeafi is not supposed to be inlined, the outlined entry counts
+;; are affected.
+; RUN: opt < %s -passes=sample-profile -use-profile-top-down-order=0 -sample-profile-file=%S/Inputs/profile-topdown-order.prof -S | FileCheck %s -check-prefix=STATIC
+; RUN: opt < %s -passes=sample-profile -use-profile-top-down-order=1 -sample-profile-file=%S/Inputs/profile-topdown-order.prof -S | FileCheck %s -check-prefix=DYNAMIC
+
+
+; STATIC:  define dso_local i32 @_Z8funcLeafi{{.*}} !prof ![[#PROF:]]
+; STATIC:  ![[#PROF]] = !{!"function_entry_count", i64 21}
+; DYNAMIC: define dso_local i32 @_Z8funcLeafi{{.*}} !prof ![[#PROF:]]
+; DYNAMIC: ![[#PROF]] = !{!"function_entry_count", i64 27}
+
+@factor = dso_local global i32 3, align 4, !dbg !0
+@fp = dso_local global i32 (i32)* null, align 8
+
+define dso_local i32 @main() local_unnamed_addr #0 !dbg !18 {
+entry:
+  store i32 (i32)* @_Z3fibi, i32 (i32)** @fp, align 8, !dbg !25
+  br label %for.body, !dbg !25
+
+for.cond.cleanup:                                 ; preds = %for.body
+  ret i32 %add3, !dbg !27
+
+for.body:                                         ; preds = %for.body, %entry
+  %x.011 = phi i32 [ 300000, %entry ], [ %dec, %for.body ]
+  %r.010 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
+  %call = tail call i32 @_Z5funcBi(i32 %x.011), !dbg !32
+  %add = add nuw nsw i32 %x.011, 1, !dbg !31
+  %call1 = tail call i32 @_Z5funcAi(i32 %add), !dbg !28
+  %add2 = add i32 %call, %r.010, !dbg !34
+  %add3 = add i32 %add2, %call1, !dbg !35
+  %dec = add nsw i32 %x.011, -1, !dbg !36
+  %cmp = icmp eq i32 %x.011, 0, !dbg !38
+  br i1 %cmp, label %for.cond.cleanup, label %for.body, !dbg !25
+}
+
+define dso_local i32 @_Z5funcAi(i32 %x) local_unnamed_addr #0 !dbg !40 {
+entry:
+  %add = add nsw i32 %x, 100000, !dbg !44
+  %0 = load i32 (i32)*, i32 (i32)** @fp, align 8
+  %call = call i32 %0(i32 8), !dbg !45
+  %call1 = tail call i32 @_Z8funcLeafi(i32 %add), !dbg !46
+  ret i32 %call, !dbg !46
+}
+
+; INLINE: define dso_local i32 @_Z8funcLeafi
+; NOINLINE: define dso_local i32 @_Z8funcLeafi
+; ICALL-INLINE: define dso_local i32 @_Z8funcLeafi
+; ICALL-NOINLINE: define dso_local i32 @_Z8funcLeafi
+define dso_local i32 @_Z8funcLeafi(i32 %x) local_unnamed_addr #1 !dbg !54 {
+entry:
+  %cmp = icmp sgt i32 %x, 0, !dbg !57
+  br i1 %cmp, label %while.body, label %while.cond2.preheader, !dbg !59
+
+while.cond2.preheader:                            ; preds = %entry
+  %cmp313 = icmp slt i32 %x, 0, !dbg !60
+  br i1 %cmp313, label %while.body4, label %if.end, !dbg !63
+
+while.body:                                       ; preds = %while.body, %entry
+  %x.addr.016 = phi i32 [ %sub, %while.body ], [ %x, %entry ]
+  %tmp = load volatile i32, i32* @factor, align 4, !dbg !64
+  %call = tail call i32 @_Z5funcAi(i32 %tmp), !dbg !67
+  %sub = sub nsw i32 %x.addr.016, %call, !dbg !68
+  %cmp1 = icmp sgt i32 %sub, 0, !dbg !69
+  br i1 %cmp1, label %while.body, label %if.end, !dbg !71
+
+while.body4:                                      ; preds = %while.body4, %while.cond2.preheader
+  %x.addr.114 = phi i32 [ %add, %while.body4 ], [ %x, %while.cond2.preheader ]
+  %tmp1 = load volatile i32, i32* @factor, align 4, !dbg !72
+  %call5 = tail call i32 @_Z5funcBi(i32 %tmp1), !dbg !74
+  %add = add nsw i32 %call5, %x.addr.114, !dbg !75
+  %cmp3 = icmp slt i32 %add, 0, !dbg !60
+  br i1 %cmp3, label %while.body4, label %if.end, !dbg !63
+
+if.end:                                           ; preds = %while.body4, %while.body, %while.cond2.preheader
+  %x.addr.2 = phi i32 [ 0, %while.cond2.preheader ], [ %sub, %while.body ], [ %add, %while.body4 ]
+  ret i32 %x.addr.2, !dbg !76
+}
+
+define dso_local i32 @_Z5funcBi(i32 %x) local_unnamed_addr #0 !dbg !47 {
+entry:
+  %sub = add nsw i32 %x, -100000, !dbg !51
+  %call = tail call i32 @_Z8funcLeafi(i32 %sub), !dbg !52
+  ret i32 %call, !dbg !53
+}
+
+define dso_local i32 @_Z3fibi(i32 %x) local_unnamed_addr #1 !dbg !77 {
+entry:
+  %sub = add nsw i32 %x, -100000, !dbg !78
+  %call = tail call i32 @_Z3foo(i32 %sub), !dbg !78
+  ret i32 %sub, !dbg !78
+}
+
+declare i32 @_Z3foo(i32)
+
+attributes #0 = { nofree noinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" "use-sample-profile" }
+attributes #1 = { nofree nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" "use-sample-profile" }
+
+!llvm.dbg.cu = !{!2}
+!llvm.module.flags = !{!14, !15, !16}
+!llvm.ident = !{!17}
+
+!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression())
+!1 = distinct !DIGlobalVariable(name: "factor", scope: !2, file: !3, line: 21, type: !13, isLocal: false, isDefinition: true)
+!2 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !3, producer: "clang version 11.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !4, retainedTypes: !5, globals: !12, splitDebugInlining: false, debugInfoForProfiling: true, nameTableKind: None)
+!3 = !DIFile(filename: "merged.cpp", directory: "/local/autofdo")
+!4 = !{}
+!5 = !{!6, !10, !11}
+!6 = !DISubprogram(name: "funcA", linkageName: "_Z5funcAi", scope: !3, file: !3, line: 6, type: !7, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized, retainedNodes: !4)
+!7 = !DISubroutineType(types: !8)
+!8 = !{!9, !9}
+!9 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!10 = !DISubprogram(name: "funcB", linkageName: "_Z5funcBi", scope: !3, file: !3, line: 7, type: !7, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized, retainedNodes: !4)
+!11 = !DISubprogram(name: "funcLeaf", linkageName: "_Z8funcLeafi", scope: !3, file: !3, line: 22, type: !7, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized, retainedNodes: !4)
+!12 = !{!0}
+!13 = !DIDerivedType(tag: DW_TAG_volatile_type, baseType: !9)
+!14 = !{i32 7, !"Dwarf Version", i32 4}
+!15 = !{i32 2, !"Debug Info Version", i32 3}
+!16 = !{i32 1, !"wchar_size", i32 4}
+!17 = !{!"clang version 11.0.0"}
+!18 = distinct !DISubprogram(name: "main", scope: !3, file: !3, line: 11, type: !19, scopeLine: 11, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2, retainedNodes: !21)
+!19 = !DISubroutineType(types: !20)
+!20 = !{!9}
+!21 = !{!22, !23}
+!22 = !DILocalVariable(name: "r", scope: !18, file: !3, line: 12, type: !9)
+!23 = !DILocalVariable(name: "x", scope: !24, file: !3, line: 13, type: !9)
+!24 = distinct !DILexicalBlock(scope: !18, file: !3, line: 13, column: 3)
+!25 = !DILocation(line: 13, column: 3, scope: !26)
+!26 = !DILexicalBlockFile(scope: !24, file: !3, discriminator: 2)
+!27 = !DILocation(line: 17, column: 3, scope: !18)
+!28 = !DILocation(line: 14, column: 10, scope: !29)
+!29 = distinct !DILexicalBlock(scope: !30, file: !3, line: 13, column: 37)
+!30 = distinct !DILexicalBlock(scope: !24, file: !3, line: 13, column: 3)
+!31 = !DILocation(line: 14, column: 29, scope: !29)
+!32 = !DILocation(line: 14, column: 21, scope: !33)
+!33 = !DILexicalBlockFile(scope: !29, file: !3, discriminator: 2)
+!34 = !DILocation(line: 14, column: 19, scope: !29)
+!35 = !DILocation(line: 14, column: 7, scope: !29)
+!36 = !DILocation(line: 13, column: 33, scope: !37)
+!37 = !DILexicalBlockFile(scope: !30, file: !3, discriminator: 6)
+!38 = !DILocation(line: 13, column: 26, scope: !39)
+!39 = !DILexicalBlockFile(scope: !30, file: !3, discriminator: 2)
+!40 = distinct !DISubprogram(name: "funcA", linkageName: "_Z5funcAi", scope: !3, file: !3, line: 26, type: !7, scopeLine: 26, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
+!44 = !DILocation(line: 26, column: 22, scope: !40)
+!45 = !DILocation(line: 28, column: 11, scope: !40)
+!46 = !DILocation(line: 27, column: 3, scope: !40)
+!47 = distinct !DISubprogram(name: "funcB", linkageName: "_Z5funcBi", scope: !3, file: !3, line: 32, type: !7, scopeLine: 32, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
+!51 = !DILocation(line: 33, column: 22, scope: !47)
+!52 = !DILocation(line: 33, column: 11, scope: !47)
+!53 = !DILocation(line: 35, column: 3, scope: !47)
+!54 = distinct !DISubprogram(name: "funcLeaf", linkageName: "_Z8funcLeafi", scope: !3, file: !3, line: 48, type: !7, scopeLine: 48, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
+!57 = !DILocation(line: 49, column: 9, scope: !58)
+!58 = distinct !DILexicalBlock(scope: !54, file: !3, line: 49, column: 7)
+!59 = !DILocation(line: 49, column: 7, scope: !54)
+!60 = !DILocation(line: 58, column: 14, scope: !61)
+!61 = !DILexicalBlockFile(scope: !62, file: !3, discriminator: 2)
+!62 = distinct !DILexicalBlock(scope: !58, file: !3, line: 56, column: 8)
+!63 = !DILocation(line: 58, column: 5, scope: !61)
+!64 = !DILocation(line: 52, column: 16, scope: !65)
+!65 = distinct !DILexicalBlock(scope: !66, file: !3, line: 51, column: 19)
+!66 = distinct !DILexicalBlock(scope: !58, file: !3, line: 49, column: 14)
+!67 = !DILocation(line: 52, column: 12, scope: !65)
+!68 = !DILocation(line: 52, column: 9, scope: !65)
+!69 = !DILocation(line: 51, column: 14, scope: !70)
+!70 = !DILexicalBlockFile(scope: !66, file: !3, discriminator: 2)
+!71 = !DILocation(line: 51, column: 5, scope: !70)
+!72 = !DILocation(line: 59, column: 16, scope: !73)
+!73 = distinct !DILexicalBlock(scope: !62, file: !3, line: 58, column: 19)
+!74 = !DILocation(line: 59, column: 12, scope: !73)
+!75 = !DILocation(line: 59, column: 9, scope: !73)
+!76 = !DILocation(line: 63, column: 3, scope: !54)
+!77 = distinct !DISubprogram(name: "funcB", linkageName: "_Z3fibi", scope: !3, file: !3, line: 32, type: !7, scopeLine: 32, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
+!78 = !DILocation(line: 33, column: 22, scope: !77)

From 989b5c9571922ddfecae78a1351d0c801bfbf97b Mon Sep 17 00:00:00 2001
From: Hongtao Yu <hoy@fb.com>
Date: Thu, 11 Feb 2021 14:51:47 -0800
Subject: [PATCH 146/318] Remove test code that cause MSAN failure.

Summary:
The negative test (with the feature being added disabled) caused MSAN failure and that's the added feature is supposed to fix. Therefore the negative test code is being removed.
---
 llvm/test/Transforms/SampleProfile/profile-context-order.ll | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/llvm/test/Transforms/SampleProfile/profile-context-order.ll b/llvm/test/Transforms/SampleProfile/profile-context-order.ll
index a75dcc2179ca..c99cc15850b7 100644
--- a/llvm/test/Transforms/SampleProfile/profile-context-order.ll
+++ b/llvm/test/Transforms/SampleProfile/profile-context-order.ll
@@ -16,7 +16,6 @@
 ;; considered, thus the order becomes (_Z5funcAi, _Z3fibi) which leads to
 ;; _Z3fibi inlined into _Z5funcAi.
 ; RUN: opt < %s -passes=sample-profile -use-profile-indirect-call-edges=1 -sample-profile-file=%S/Inputs/profile-context-order.prof -S | FileCheck %s -check-prefix=ICALL-INLINE
-; RUN: opt < %s -passes=sample-profile -use-profile-indirect-call-edges=0 -sample-profile-file=%S/Inputs/profile-context-order.prof -S | FileCheck %s -check-prefix=ICALL-NOINLINE
 
 @factor = dso_local global i32 3, align 4, !dbg !0
 @fp = dso_local global i32 (i32)* null, align 8
@@ -48,9 +47,6 @@ for.body:                                         ; preds = %for.body, %entry
 ; NOINLINE: call i32 @_Z8funcLeafi
 ; ICALL-INLINE: define dso_local i32 @_Z5funcAi
 ; ICALL-INLINE: call i32 @_Z3foo
-; ICALL-NOINLINE: define dso_local i32 @_Z5funcAi
-; ICALL-NOINLINE-NO: call i32 @_Z3foo
-; ICALL-NOINLINE-NO: call i32 @_Z3fibi
 define dso_local i32 @_Z5funcAi(i32 %x) local_unnamed_addr #0 !dbg !40 {
 entry:
   %add = add nsw i32 %x, 100000, !dbg !44

From beb80ffee6a1a816cfeb4047926f412c1a2456d9 Mon Sep 17 00:00:00 2001
From: wlei <wlei@fb.com>
Date: Wed, 3 Feb 2021 14:13:06 -0800
Subject: [PATCH 147/318] [CSSPGO][llvm-profgen] Add brackets for context id to
 support extended binary format

To align with https://reviews.llvm.org/D95547, we need to add brackets for context id before initializing the `SampleContext`.

Also added test cases for extended binary format from llvm-profgen side.

Differential Revision: https://reviews.llvm.org/D95929
---
 llvm/lib/ProfileData/SampleProfWriter.cpp     |  5 +-
 .../test/tools/llvm-profgen/cs-extbinary.test | 14 +++++
 .../tools/llvm-profgen/inline-cs-noprobe.test | 11 ++--
 .../llvm-profgen/noinline-cs-noprobe.test     |  2 +
 .../recursion-compression-noprobe.test        | 11 ++--
 .../recursion-compression-pseudoprobe.test    | 63 ++++++++++---------
 llvm/tools/llvm-profgen/ProfileGenerator.cpp  | 58 ++++++++++-------
 llvm/tools/llvm-profgen/ProfileGenerator.h    | 12 ++--
 llvm/tools/llvm-profgen/ProfiledBinary.cpp    |  4 +-
 9 files changed, 104 insertions(+), 76 deletions(-)
 create mode 100644 llvm/test/tools/llvm-profgen/cs-extbinary.test

diff --git a/llvm/lib/ProfileData/SampleProfWriter.cpp b/llvm/lib/ProfileData/SampleProfWriter.cpp
index b388b78dfaca..8017f2a82804 100644
--- a/llvm/lib/ProfileData/SampleProfWriter.cpp
+++ b/llvm/lib/ProfileData/SampleProfWriter.cpp
@@ -360,10 +360,7 @@ std::error_code SampleProfileWriterCompactBinary::write(
 /// it needs to be parsed by the SampleProfileReaderText class.
 std::error_code SampleProfileWriterText::writeSample(const FunctionSamples &S) {
   auto &OS = *OutputStream;
-  if (FunctionSamples::ProfileIsCS)
-    OS << "[" << S.getNameWithContext() << "]:" << S.getTotalSamples();
-  else
-    OS << S.getName() << ":" << S.getTotalSamples();
+  OS << S.getNameWithContext(true) << ":" << S.getTotalSamples();
   if (Indent == 0)
     OS << ":" << S.getHeadSamples();
   OS << "\n";
diff --git a/llvm/test/tools/llvm-profgen/cs-extbinary.test b/llvm/test/tools/llvm-profgen/cs-extbinary.test
new file mode 100644
index 000000000000..8acce173d405
--- /dev/null
+++ b/llvm/test/tools/llvm-profgen/cs-extbinary.test
@@ -0,0 +1,14 @@
+; test for dwarf-based cs profile
+; RUN: llvm-profgen --format=extbinary --perfscript=%S/Inputs/recursion-compression-noprobe.perfscript --binary=%S/Inputs/recursion-compression-noprobe.perfbin --output=%t1 --csprof-cold-thres=0
+; RUN: llvm-profdata merge --sample --text --output=%t2 %t1
+; RUN: FileCheck %S/recursion-compression-noprobe.test --input-file %t2
+; RUN: llvm-profdata merge --sample --extbinary --output=%t3 %t2 && llvm-profdata merge --sample --text --output=%t4 %t3
+; RUN: diff -b %t2 %t4
+
+
+; test for probe-based cs profile
+; RUN: llvm-profgen --format=extbinary --perfscript=%S/Inputs/recursion-compression-pseudoprobe.perfscript --binary=%S/Inputs/recursion-compression-pseudoprobe.perfbin --output=%t5 --csprof-cold-thres=0
+; RUN: llvm-profdata merge --sample --text --output=%t6 %t5
+; RUN: FileCheck %S/recursion-compression-pseudoprobe.test --input-file %t6
+; RUN: llvm-profdata merge --sample --extbinary --output=%t7 %t6 && llvm-profdata merge --sample --text --output=%t8 %t7
+; RUN: diff -b %t6 %t8
diff --git a/llvm/test/tools/llvm-profgen/inline-cs-noprobe.test b/llvm/test/tools/llvm-profgen/inline-cs-noprobe.test
index 943832ebef10..d8cc1932f877 100644
--- a/llvm/test/tools/llvm-profgen/inline-cs-noprobe.test
+++ b/llvm/test/tools/llvm-profgen/inline-cs-noprobe.test
@@ -2,11 +2,11 @@
 ; RUN: FileCheck %s --input-file %t
 
 ; CHECK:[main:1 @ foo]:44:0
-; CHECK: 2.2: 14
+; CHECK: 2.1: 14
 ; CHECK: 3: 15
-; CHECK: 3.2: 14 bar:14
-; CHECK: 3.4: 1
-; CHECK:[main:1 @ foo:3.2 @ bar]:14:0
+; CHECK: 3.1: 14 bar:14
+; CHECK: 3.2: 1
+; CHECK:[main:1 @ foo:3.1 @ bar]:14:0
 ; CHECK: 1: 14
 
 ; CHECK-UNWINDER: Binary(inline-cs-noprobe.perfbin)'s Range Counter:
@@ -15,10 +15,9 @@
 ; CHECK-UNWINDER:   (67e, 69b): 1
 ; CHECK-UNWINDER:   (67e, 6ad): 13
 ; CHECK-UNWINDER:   (6bd, 6c8): 14
-; CHECK-UNWINDER: main:1 @ foo:3.2 @ bar
+; CHECK-UNWINDER: main:1 @ foo:3.1 @ bar
 ; CHECK-UNWINDER:   (6af, 6bb): 14
 
-
 ; CHECK-UNWINDER: Binary(inline-cs-noprobe.perfbin)'s Branch Counter:
 ; CHECK-UNWINDER: main:1 @ foo
 ; CHECK-UNWINDER:   (69b, 670): 1
diff --git a/llvm/test/tools/llvm-profgen/noinline-cs-noprobe.test b/llvm/test/tools/llvm-profgen/noinline-cs-noprobe.test
index 2e60883afa62..9d5c787e7f92 100644
--- a/llvm/test/tools/llvm-profgen/noinline-cs-noprobe.test
+++ b/llvm/test/tools/llvm-profgen/noinline-cs-noprobe.test
@@ -36,6 +36,8 @@
 
 
+
+
 ; original code:
 ; clang -O0 -g test.c -o a.out
 #include <stdio.h>
diff --git a/llvm/test/tools/llvm-profgen/recursion-compression-noprobe.test b/llvm/test/tools/llvm-profgen/recursion-compression-noprobe.test
index 43f495398bb0..03bab8407435 100644
--- a/llvm/test/tools/llvm-profgen/recursion-compression-noprobe.test
+++ b/llvm/test/tools/llvm-profgen/recursion-compression-noprobe.test
@@ -10,16 +10,17 @@
 ; CHECK-UNCOMPRESS:[main:1 @ foo:3 @ fa:2 @ fb]:12:0
 ; CHECK-UNCOMPRESS: 1: 11
 ; CHECK-UNCOMPRESS: 2: 1 fa:1
-; CHECK-UNCOMPRESS:[main:1 @ foo:3 @ fa:2 @ fb:2 @ fa]:3:0
-; CHECK-UNCOMPRESS: 1: 1
-; CHECK-UNCOMPRESS: 2: 2 fb:1
 ; CHECK-UNCOMPRESS:[main:1 @ foo]:3:0
 ; CHECK-UNCOMPRESS: 2: 1
 ; CHECK-UNCOMPRESS: 3: 2 fa:1
-; CHECK-UNCOMPRESS:[main:1 @ foo:3 @ fa:2 @ fb:2 @ fa:2 @ fb:2 @ fa]:1:0
-; CHECK-UNCOMPRESS: 4: 1
+; CHECK-UNCOMPRESS:[main:1 @ foo:3 @ fa:2 @ fb:2 @ fa]:3:0
+; CHECK-UNCOMPRESS: 1: 1
+; CHECK-UNCOMPRESS: 2: 2 fb:1
 ; CHECK-UNCOMPRESS:[main:1 @ foo:3 @ fa:2 @ fb:2 @ fa:2 @ fb]:1:0
 ; CHECK-UNCOMPRESS: 2: 1 fa:1
+; CHECK-UNCOMPRESS:[main:1 @ foo:3 @ fa:2 @ fb:2 @ fa:2 @ fb:2 @ fa]:1:0
+; CHECK-UNCOMPRESS: 4: 1
+
 
 ; CHECK: [main:1 @ foo:3 @ fa]:14:0
 ; CHECK:  1: 1
diff --git a/llvm/test/tools/llvm-profgen/recursion-compression-pseudoprobe.test b/llvm/test/tools/llvm-profgen/recursion-compression-pseudoprobe.test
index 0d4e7dbb1dd4..0936e5d615ca 100644
--- a/llvm/test/tools/llvm-profgen/recursion-compression-pseudoprobe.test
+++ b/llvm/test/tools/llvm-profgen/recursion-compression-pseudoprobe.test
@@ -4,11 +4,11 @@
 ; RUN: llvm-profgen --perfscript=%S/Inputs/recursion-compression-pseudoprobe.perfscript --binary=%S/Inputs/recursion-compression-pseudoprobe.perfbin --output=%t --show-unwinder-output --csprof-cold-thres=0 | FileCheck %s --check-prefix=CHECK-UNWINDER
 ; RUN: FileCheck %s --input-file %t
 
-; CHECK-UNCOMPRESS: [main:2 @ foo:5 @ fa:8 @ fa:7 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:6 @ fa:8 @ fa:7 @ fb:6 @ fa]:4:1
+; CHECK-UNCOMPRESS: [main:2 @ foo:5 @ fa:8 @ fa:7 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:6 @ fa]:4:1
 ; CHECK-UNCOMPRESS:  1: 1
 ; CHECK-UNCOMPRESS:  3: 1
-; CHECK-UNCOMPRESS:  4: 1
-; CHECK-UNCOMPRESS:  7: 1 fb:1
+; CHECK-UNCOMPRESS:  5: 1
+; CHECK-UNCOMPRESS:  8: 1 fa:1
 ; CHECK-UNCOMPRESS:  !CFGChecksum: 120515930909
 ; CHECK-UNCOMPRESS: [main:2 @ foo:5 @ fa:8 @ fa:7 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:6 @ fa:8 @ fa]:4:1
 ; CHECK-UNCOMPRESS:  1: 1
@@ -16,28 +16,13 @@
 ; CHECK-UNCOMPRESS:  4: 1
 ; CHECK-UNCOMPRESS:  7: 1 fb:1
 ; CHECK-UNCOMPRESS:  !CFGChecksum: 120515930909
-; CHECK-UNCOMPRESS: [main:2 @ foo:5 @ fa:8 @ fa:7 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:6 @ fa]:4:1
+; CHECK-UNCOMPRESS: [main:2 @ foo:5 @ fa:8 @ fa:7 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:6 @ fa:8 @ fa:7 @ fb:6 @ fa]:4:1
 ; CHECK-UNCOMPRESS:  1: 1
 ; CHECK-UNCOMPRESS:  3: 1
-; CHECK-UNCOMPRESS:  5: 1
-; CHECK-UNCOMPRESS:  8: 1 fa:1
+; CHECK-UNCOMPRESS:  4: 1
+; CHECK-UNCOMPRESS:  7: 1 fb:1
 ; CHECK-UNCOMPRESS:  !CFGChecksum: 120515930909
-; CHECK-UNCOMPRESS: [main:2 @ foo:5 @ fa:8 @ fa:7 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:6 @ fa:8 @ fa:7 @ fb:6 @ fa:7 @ fb]:3:1
-; CHECK-UNCOMPRESS:  1: 1
-; CHECK-UNCOMPRESS:  3: 1
-; CHECK-UNCOMPRESS:  6: 1 fa:1
-; CHECK-UNCOMPRESS:  !CFGChecksum: 72617220756
-; CHECK-UNCOMPRESS: [main:2 @ foo:5 @ fa:8 @ fa:7 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:6 @ fa:8 @ fa:7 @ fb]:3:1
-; CHECK-UNCOMPRESS:  1: 1
-; CHECK-UNCOMPRESS:  3: 1
-; CHECK-UNCOMPRESS:  6: 1 fa:1
-; CHECK-UNCOMPRESS:  !CFGChecksum: 72617220756
-; CHECK-UNCOMPRESS: [main:2 @ foo:5 @ fa:8 @ fa:7 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb]:3:1
-; CHECK-UNCOMPRESS:  1: 1
-; CHECK-UNCOMPRESS:  3: 1
-; CHECK-UNCOMPRESS:  6: 1 fa:1
-; CHECK-UNCOMPRESS:  !CFGChecksum: 72617220756
-; CHECK-UNCOMPRESS: [main:2 @ foo:5 @ fa:8 @ fa:7 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb]:3:1
+; CHECK-UNCOMPRESS: [main:2 @ foo:5 @ fa:8 @ fa:7 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb]:3:1
 ; CHECK-UNCOMPRESS:  1: 1
 ; CHECK-UNCOMPRESS:  2: 1
 ; CHECK-UNCOMPRESS:  5: 1 fb:1
@@ -47,11 +32,26 @@
 ; CHECK-UNCOMPRESS:  2: 1
 ; CHECK-UNCOMPRESS:  5: 1 fb:1
 ; CHECK-UNCOMPRESS:  !CFGChecksum: 72617220756
-; CHECK-UNCOMPRESS: [main:2 @ foo:5 @ fa:8 @ fa:7 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb]:3:1
+; CHECK-UNCOMPRESS: [main:2 @ foo:5 @ fa:8 @ fa:7 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb]:3:1
 ; CHECK-UNCOMPRESS:  1: 1
 ; CHECK-UNCOMPRESS:  2: 1
 ; CHECK-UNCOMPRESS:  5: 1 fb:1
 ; CHECK-UNCOMPRESS:  !CFGChecksum: 72617220756
+; CHECK-UNCOMPRESS: [main:2 @ foo:5 @ fa:8 @ fa:7 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb]:3:1
+; CHECK-UNCOMPRESS:  1: 1
+; CHECK-UNCOMPRESS:  3: 1
+; CHECK-UNCOMPRESS:  6: 1 fa:1
+; CHECK-UNCOMPRESS:  !CFGChecksum: 72617220756
+; CHECK-UNCOMPRESS: [main:2 @ foo:5 @ fa:8 @ fa:7 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:6 @ fa:8 @ fa:7 @ fb]:3:1
+; CHECK-UNCOMPRESS:  1: 1
+; CHECK-UNCOMPRESS:  3: 1
+; CHECK-UNCOMPRESS:  6: 1 fa:1
+; CHECK-UNCOMPRESS:  !CFGChecksum: 72617220756
+; CHECK-UNCOMPRESS: [main:2 @ foo:5 @ fa:8 @ fa:7 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:6 @ fa:8 @ fa:7 @ fb:6 @ fa:7 @ fb]:3:1
+; CHECK-UNCOMPRESS:  1: 1
+; CHECK-UNCOMPRESS:  3: 1
+; CHECK-UNCOMPRESS:  6: 1 fa:1
+; CHECK-UNCOMPRESS:  !CFGChecksum: 72617220756
 ; CHECK-UNCOMPRESS: [main:2 @ foo:5 @ fa:8 @ fa:7 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:6 @ fa:8 @ fa:7 @ fb:6 @ fa:7 @ fb:6 @ fa]:2:1
 ; CHECK-UNCOMPRESS:  1: 1
 ; CHECK-UNCOMPRESS:  3: 1
@@ -74,30 +74,31 @@
 ; CHECK:  4: 1
 ; CHECK:  7: 1 fb:1
 ; CHECK:  !CFGChecksum: 120515930909
-; CHECK: [main:2 @ foo:5 @ fa:8 @ fa:7 @ fb:5 @ fb:6 @ fa:8 @ fa]:4:1
-; CHECK:  1: 1
-; CHECK:  3: 1
-; CHECK:  4: 1
-; CHECK:  7: 1 fb:1
-; CHECK:  !CFGChecksum: 120515930909
 ; CHECK: [main:2 @ foo:5 @ fa:8 @ fa:7 @ fb:5 @ fb:6 @ fa]:4:1
 ; CHECK:  1: 1
 ; CHECK:  3: 1
 ; CHECK:  5: 1
 ; CHECK:  8: 1 fa:1
 ; CHECK:  !CFGChecksum: 120515930909
-; CHECK: [main:2 @ foo:5 @ fa:8 @ fa:7 @ fb:5 @ fb:6 @ fa:8 @ fa:7 @ fb:6 @ fa:7 @ fb]:3:1
+; CHECK: [main:2 @ foo:5 @ fa:8 @ fa:7 @ fb:5 @ fb:6 @ fa:8 @ fa]:4:1
+; CHECK:  1: 1
+; CHECK:  3: 1
+; CHECK:  4: 1
+; CHECK:  7: 1 fb:1
+; CHECK:  !CFGChecksum: 120515930909
+; CHECK: [main:2 @ foo:5 @ fa:8 @ fa:7 @ fb:5 @ fb:6 @ fa:8 @ fa:7 @ fb]:3:1
 ; CHECK:  1: 1
 ; CHECK:  3: 1
 ; CHECK:  6: 1 fa:1
 ; CHECK:  !CFGChecksum: 72617220756
-; CHECK: [main:2 @ foo:5 @ fa:8 @ fa:7 @ fb:5 @ fb:6 @ fa:8 @ fa:7 @ fb]:3:1
+; CHECK: [main:2 @ foo:5 @ fa:8 @ fa:7 @ fb:5 @ fb:6 @ fa:8 @ fa:7 @ fb:6 @ fa:7 @ fb]:3:1
 ; CHECK:  1: 1
 ; CHECK:  3: 1
 ; CHECK:  6: 1 fa:1
 ; CHECK:  !CFGChecksum: 72617220756
 
 
+
 ; CHECK-UNWINDER: Binary(recursion-compression-pseudoprobe.perfbin)'s Range Counter:
 ; CHECK-UNWINDER: main:2 @ foo:5 @ fa:8 @ fa:7 @ fb:5
 ; CHECK-UNWINDER:   (7a0, 7a7): 1
diff --git a/llvm/tools/llvm-profgen/ProfileGenerator.cpp b/llvm/tools/llvm-profgen/ProfileGenerator.cpp
index b2a8d60d5caf..0a7dddc06bfc 100644
--- a/llvm/tools/llvm-profgen/ProfileGenerator.cpp
+++ b/llvm/tools/llvm-profgen/ProfileGenerator.cpp
@@ -76,13 +76,16 @@ ProfileGenerator::create(const BinarySampleCounterMap &BinarySampleCounters,
   return ProfileGenerator;
 }
 
+void ProfileGenerator::write(std::unique_ptr<SampleProfileWriter> Writer,
+                             StringMap<FunctionSamples> &ProfileMap) {
+  Writer->write(ProfileMap);
+}
+
 void ProfileGenerator::write() {
   auto WriterOrErr = SampleProfileWriter::create(OutputFilename, OutputFormat);
   if (std::error_code EC = WriterOrErr.getError())
     exitWithError(EC, OutputFilename);
-  auto Writer = std::move(WriterOrErr.get());
-  mergeAndTrimColdProfile(ProfileMap);
-  Writer->write(ProfileMap);
+  write(std::move(WriterOrErr.get()), ProfileMap);
 }
 
 void ProfileGenerator::findDisjointRanges(RangeSample &DisjointRanges,
@@ -188,7 +191,6 @@ CSProfileGenerator::getFunctionProfileForContext(StringRef ContextStr) {
   if (Ret.second) {
     SampleContext FContext(Ret.first->first(), RawContext);
     FunctionSamples &FProfile = Ret.first->second;
-    FProfile.setName(FContext.getNameWithoutContext());
     FProfile.setContext(FContext);
   }
   return Ret.first->second;
@@ -268,16 +270,15 @@ void CSProfileGenerator::populateFunctionBoundarySamples(
                                            CalleeName, Count);
 
     // Record head sample for called target(callee)
-    // TODO: Cleanup ' @ '
-    std::string CalleeContextId =
-        getCallSite(LeafLoc) + " @ " + CalleeName.str();
+    std::ostringstream OCalleeCtxStr;
     if (ContextId.find(" @ ") != StringRef::npos) {
-      CalleeContextId =
-          ContextId.rsplit(" @ ").first.str() + " @ " + CalleeContextId;
+      OCalleeCtxStr << ContextId.rsplit(" @ ").first.str();
+      OCalleeCtxStr << " @ ";
     }
+    OCalleeCtxStr << getCallSite(LeafLoc) << " @ " << CalleeName.str();
 
     FunctionSamples &CalleeProfile =
-        getFunctionProfileForContext(CalleeContextId);
+        getFunctionProfileForContext(OCalleeCtxStr.str());
     assert(Count != 0 && "Unexpected zero weight branch");
     CalleeProfile.addHeadSamples(Count);
   }
@@ -334,8 +335,8 @@ void CSProfileGenerator::populateInferredFunctionSamples() {
       EstimatedCallCount = 1;
     CallerProfile.addCalledTargetSamples(
         CallerLeafFrameLoc.second.LineOffset,
-        CallerLeafFrameLoc.second.Discriminator, CalleeProfile.getName(),
-        EstimatedCallCount);
+        CallerLeafFrameLoc.second.Discriminator,
+        CalleeProfile.getContext().getNameWithoutContext(), EstimatedCallCount);
     CallerProfile.addBodySamples(CallerLeafFrameLoc.second.LineOffset,
                                  CallerLeafFrameLoc.second.Discriminator,
                                  EstimatedCallCount);
@@ -362,8 +363,8 @@ void CSProfileGenerator::mergeAndTrimColdProfile(
   // Remove the code profile from ProfileMap and merge them into BaseProileMap
   StringMap<FunctionSamples> BaseProfileMap;
   for (const auto &I : ToRemoveVec) {
-    auto Ret =
-        BaseProfileMap.try_emplace(I.second->getName(), FunctionSamples());
+    auto Ret = BaseProfileMap.try_emplace(
+        I.second->getContext().getNameWithoutContext(), FunctionSamples());
     FunctionSamples &BaseProfile = Ret.first->second;
     BaseProfile.merge(*I.second);
     ProfileMap.erase(I.first);
@@ -378,14 +379,27 @@ void CSProfileGenerator::mergeAndTrimColdProfile(
     // Merge the profile if the original profile exists, otherwise just insert
     // as a new profile
     FunctionSamples &OrigProfile = getFunctionProfileForContext(I.getKey());
-    StringRef TmpName = OrigProfile.getName();
     OrigProfile.merge(I.second);
-    // Should use the name ref from ProfileMap's key to avoid name being freed
-    // from BaseProfileMap
-    OrigProfile.setName(TmpName);
   }
 }
 
+void CSProfileGenerator::write(std::unique_ptr<SampleProfileWriter> Writer,
+                               StringMap<FunctionSamples> &ProfileMap) {
+  mergeAndTrimColdProfile(ProfileMap);
+  // Add bracket for context key to support different profile binary format
+  StringMap<FunctionSamples> CxtWithBracketPMap;
+  for (const auto &Item : ProfileMap) {
+    std::string ContextWithBracket = "[" + Item.first().str() + "]";
+    auto Ret = CxtWithBracketPMap.try_emplace(ContextWithBracket, Item.second);
+    assert(Ret.second && "Must be a unique context");
+    SampleContext FContext(Ret.first->first(), RawContext);
+    FunctionSamples &FProfile = Ret.first->second;
+    FProfile.setName(FContext.getNameWithContext(true));
+    FProfile.setContext(FContext);
+  }
+  Writer->write(CxtWithBracketPMap);
+}
+
 // Helper function to extract context prefix string stack
 // Extract context stack for reusing, leaf context stack will
 // be added compressed while looking up function profile
@@ -399,8 +413,7 @@ extractPrefixContextStack(SmallVectorImpl<std::string> &ContextStrStack,
 }
 
 void PseudoProbeCSProfileGenerator::generateProfile() {
-  // Enable CS and pseudo probe functionalities in SampleProf
-  FunctionSamples::ProfileIsCS = true;
+  // Enable pseudo probe functionalities in SampleProf
   FunctionSamples::ProfileIsProbeBased = true;
   for (const auto &BI : BinarySampleCounters) {
     ProfiledBinary *Binary = BI.first;
@@ -495,8 +508,9 @@ void PseudoProbeCSProfileGenerator::populateBodySamplesWithProbes(
         CallerProfile.setFunctionHash(InlinerDesc->FuncHash);
         CallerProfile.addBodySamples(CallerIndex, 0, Count);
         CallerProfile.addTotalSamples(Count);
-        CallerProfile.addCalledTargetSamples(CallerIndex, 0,
-                                             FunctionProfile.getName(), Count);
+        CallerProfile.addCalledTargetSamples(
+            CallerIndex, 0,
+            FunctionProfile.getContext().getNameWithoutContext(), Count);
       }
     }
   }
diff --git a/llvm/tools/llvm-profgen/ProfileGenerator.h b/llvm/tools/llvm-profgen/ProfileGenerator.h
index 9cb04c4de34d..ff014ed79ce1 100644
--- a/llvm/tools/llvm-profgen/ProfileGenerator.h
+++ b/llvm/tools/llvm-profgen/ProfileGenerator.h
@@ -28,11 +28,9 @@ class ProfileGenerator {
   create(const BinarySampleCounterMap &BinarySampleCounters,
          enum PerfScriptType SampleType);
   virtual void generateProfile() = 0;
-  // Merge and trim profile with cold context before serialization,
-  // only eligible for CS profile
-  virtual void
-  mergeAndTrimColdProfile(StringMap<FunctionSamples> &ProfileMap){};
   // Use SampleProfileWriter to serialize profile map
+  virtual void write(std::unique_ptr<SampleProfileWriter> Writer,
+                     StringMap<FunctionSamples> &ProfileMap);
   void write();
 
 protected:
@@ -68,8 +66,6 @@ class CSProfileGenerator : public ProfileGenerator {
 
 public:
   void generateProfile() override {
-    // Enable context-sensitive functionalities in SampleProf
-    FunctionSamples::ProfileIsCS = true;
     for (const auto &BI : BinarySampleCounters) {
       ProfiledBinary *Binary = BI.first;
       for (const auto &CI : BI.second) {
@@ -205,7 +201,9 @@ class CSProfileGenerator : public ProfileGenerator {
   FunctionSamples &getFunctionProfileForContext(StringRef ContextId);
   // Merge cold context profile whose total sample is below threshold
   // into base profile.
-  void mergeAndTrimColdProfile(StringMap<FunctionSamples> &ProfileMap) override;
+  void mergeAndTrimColdProfile(StringMap<FunctionSamples> &ProfileMap);
+  void write(std::unique_ptr<SampleProfileWriter> Writer,
+             StringMap<FunctionSamples> &ProfileMap) override;
 
 private:
   // Helper function for updating body sample for a leaf location in
diff --git a/llvm/tools/llvm-profgen/ProfiledBinary.cpp b/llvm/tools/llvm-profgen/ProfiledBinary.cpp
index 2c6cedf57649..e1549b14bf05 100644
--- a/llvm/tools/llvm-profgen/ProfiledBinary.cpp
+++ b/llvm/tools/llvm-profgen/ProfiledBinary.cpp
@@ -11,6 +11,7 @@
 #include "ProfileGenerator.h"
 #include "llvm/ADT/Triple.h"
 #include "llvm/Demangle/Demangle.h"
+#include "llvm/IR/DebugInfoMetadata.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Format.h"
 #include "llvm/Support/TargetRegistry.h"
@@ -393,7 +394,8 @@ FrameLocationStack ProfiledBinary::symbolize(const InstructionPointer &IP,
     if (UseCanonicalFnName)
       FunctionName = FunctionSamples::getCanonicalFnName(FunctionName);
     LineLocation Line(CallerFrame.Line - CallerFrame.StartLine,
-                      CallerFrame.Discriminator);
+                      DILocation::getBaseDiscriminatorFromDiscriminator(
+                          CallerFrame.Discriminator));
     FrameLocation Callsite(FunctionName.str(), Line);
     CallStack.push_back(Callsite);
   }

From 66873fb695370f5bd333e327ec77e4710c7891c2 Mon Sep 17 00:00:00 2001
From: wlei <wlei@fb.com>
Date: Tue, 9 Feb 2021 16:41:44 -0800
Subject: [PATCH 148/318] [CSSPGO][llvm-profgen] Renovate perfscript check and
 command line input validation

This include some changes related with PerfReader's the input check and command line change:

1) It appears there might be thousands of leading MMAP-Event line in the perfscript for large workload. For this case, the 4k threshold is not eligible to determine it's a hybrid sample. This change renovated the `isHybridPerfScript` by going through the script without threshold limitation checking whether there is a non-empty call stack immediately followed by a LBR sample. It will stop once it find a valid one.

2) Added several input validations for the command line switches in PerfReader.

3) Changed the command line `show-disassembly` to `show-disassembly-only`, it will print to stdout and exit early which leave an empty output profile.

Reviewed By: hoy, wenlei

Differential Revision: https://reviews.llvm.org/D96387
---
 llvm/test/tools/llvm-profgen/disassemble.s    |  2 +-
 .../llvm-profgen/invalid-perfscript.test      |  9 +++
 .../llvm-profgen/pseudoprobe-decoding.test    |  2 +-
 llvm/test/tools/llvm-profgen/symbolize.ll     |  2 +-
 llvm/tools/llvm-profgen/PerfReader.cpp        | 68 ++++++++++++------
 llvm/tools/llvm-profgen/PerfReader.h          | 69 ++++++++++++++-----
 llvm/tools/llvm-profgen/ProfileGenerator.cpp  |  2 +
 llvm/tools/llvm-profgen/ProfiledBinary.cpp    | 26 ++++---
 llvm/tools/llvm-profgen/llvm-profgen.cpp      |  6 +-
 9 files changed, 131 insertions(+), 55 deletions(-)
 create mode 100644 llvm/test/tools/llvm-profgen/invalid-perfscript.test

diff --git a/llvm/test/tools/llvm-profgen/disassemble.s b/llvm/test/tools/llvm-profgen/disassemble.s
index fc85fbe967e0..be03b5a6892b 100644
--- a/llvm/test/tools/llvm-profgen/disassemble.s
+++ b/llvm/test/tools/llvm-profgen/disassemble.s
@@ -1,6 +1,6 @@
 # REQUIRES: x86-registered-target
 # RUN: llvm-mc -filetype=obj -triple=x86_64 %s -o %t
-# RUN: llvm-profgen --binary=%t --perfscript=%s --output=%t1 -show-disassembly -x86-asm-syntax=intel | FileCheck %s --match-full-lines
+# RUN: llvm-profgen --binary=%t --perfscript=%s --output=%t1 -show-disassembly-only -x86-asm-syntax=intel | FileCheck %s --match-full-lines
 
 # CHECK: Disassembly of section .text [0x0, 0x66]:
 # CHECK: <foo1>:
diff --git a/llvm/test/tools/llvm-profgen/invalid-perfscript.test b/llvm/test/tools/llvm-profgen/invalid-perfscript.test
new file mode 100644
index 000000000000..d795f85b1ea3
--- /dev/null
+++ b/llvm/test/tools/llvm-profgen/invalid-perfscript.test
@@ -0,0 +1,9 @@
+; RUN: llvm-profgen --perfscript=%s --binary=%S/Inputs/noinline-cs-noprobe.perfbin --output=%t 2>%t1
+; RUN: FileCheck %s --input-file %t1
+
+	          4005dc
+	          400634
+	          400684
+	    7f68c5788793
+
+; XFAIL: *
diff --git a/llvm/test/tools/llvm-profgen/pseudoprobe-decoding.test b/llvm/test/tools/llvm-profgen/pseudoprobe-decoding.test
index 5feaa97032ab..1d93a06d8e42 100644
--- a/llvm/test/tools/llvm-profgen/pseudoprobe-decoding.test
+++ b/llvm/test/tools/llvm-profgen/pseudoprobe-decoding.test
@@ -1,4 +1,4 @@
-; RUN: llvm-profgen --perfscript=%s  --binary=%S/Inputs/inline-cs-pseudoprobe.perfbin --output=%t --show-pseudo-probe --show-disassembly | FileCheck %s
+; RUN: llvm-profgen --perfscript=%s  --binary=%S/Inputs/inline-cs-pseudoprobe.perfbin --output=%t --show-pseudo-probe --show-disassembly-only | FileCheck %s
 
 PERF_RECORD_MMAP2 2854748/2854748: [0x400000(0x1000) @ 0 00:1d 123291722 526021]: r-xp /home/inline-cs-pseudoprobe.perfbin
 
diff --git a/llvm/test/tools/llvm-profgen/symbolize.ll b/llvm/test/tools/llvm-profgen/symbolize.ll
index 2fbc59e3d00d..9a436dec4c20 100644
--- a/llvm/test/tools/llvm-profgen/symbolize.ll
+++ b/llvm/test/tools/llvm-profgen/symbolize.ll
@@ -1,6 +1,6 @@
 ; REQUIRES: x86-registered-target
 ; RUN: llc -filetype=obj %s -o %t
-; RUN: llvm-profgen --binary=%t --perfscript=%s --output=%t1 --show-disassembly -x86-asm-syntax=intel --show-source-locations | FileCheck %s --match-full-lines
+; RUN: llvm-profgen --binary=%t --perfscript=%s --output=%t1 --show-disassembly-only -x86-asm-syntax=intel --show-source-locations | FileCheck %s --match-full-lines
 
 ; CHECK: Disassembly of section .text [0x0, 0x4a]:
 ; CHECK: <funcA>:
diff --git a/llvm/tools/llvm-profgen/PerfReader.cpp b/llvm/tools/llvm-profgen/PerfReader.cpp
index e59d8d93381b..2e0b71f38e6d 100644
--- a/llvm/tools/llvm-profgen/PerfReader.cpp
+++ b/llvm/tools/llvm-profgen/PerfReader.cpp
@@ -17,6 +17,9 @@ static cl::opt<bool> ShowUnwinderOutput("show-unwinder-output",
                                         cl::ZeroOrMore,
                                         cl::desc("Print unwinder output"));
 
+extern cl::opt<bool> ShowDisassemblyOnly;
+extern cl::opt<bool> ShowSourceLocations;
+
 namespace llvm {
 namespace sampleprof {
 
@@ -230,7 +233,44 @@ bool VirtualUnwinder::unwind(const HybridSample *Sample, uint64_t Repeat) {
   return true;
 }
 
-PerfReader::PerfReader(cl::list<std::string> &BinaryFilenames) {
+void PerfReader::validateCommandLine(
+    cl::list<std::string> &BinaryFilenames,
+    cl::list<std::string> &PerfTraceFilenames) {
+  // Allow the invalid perfscript if we only use to show binary disassembly
+  if (!ShowDisassemblyOnly) {
+    for (auto &File : PerfTraceFilenames) {
+      if (!llvm::sys::fs::exists(File)) {
+        std::string Msg = "Input perf script(" + File + ") doesn't exist!";
+        exitWithError(Msg);
+      }
+    }
+  }
+  if (BinaryFilenames.size() > 1) {
+    // TODO: remove this if everything is ready to support multiple binaries.
+    exitWithError(
+        "Currently only support one input binary, multiple binaries' "
+        "profile will be merged in one profile and make profile "
+        "summary info inaccurate. Please use `llvm-perfdata` to merge "
+        "profiles from multiple binaries.");
+  }
+  for (auto &Binary : BinaryFilenames) {
+    if (!llvm::sys::fs::exists(Binary)) {
+      std::string Msg = "Input binary(" + Binary + ") doesn't exist!";
+      exitWithError(Msg);
+    }
+  }
+  if (CSProfileGenerator::MaxCompressionSize < -1) {
+    exitWithError("Value of --compress-recursion should >= -1");
+  }
+  if (ShowSourceLocations && !ShowDisassemblyOnly) {
+    exitWithError("--show-source-locations should work together with "
+                  "--show-disassembly-only!");
+  }
+}
+
+PerfReader::PerfReader(cl::list<std::string> &BinaryFilenames,
+                       cl::list<std::string> &PerfTraceFilenames) {
+  validateCommandLine(BinaryFilenames, PerfTraceFilenames);
   // Load the binaries.
   for (auto Filename : BinaryFilenames)
     loadBinary(Filename, /*AllowNameConflict*/ false);
@@ -591,27 +631,13 @@ void PerfReader::parseAndAggregateTrace(StringRef Filename) {
 
 void PerfReader::checkAndSetPerfType(
     cl::list<std::string> &PerfTraceFilenames) {
-  bool HasHybridPerf = true;
   for (auto FileName : PerfTraceFilenames) {
-    if (!isHybridPerfScript(FileName)) {
-      HasHybridPerf = false;
-      break;
-    }
-  }
-
-  if (HasHybridPerf) {
-    PerfType = PERF_LBR_STACK;
-  } else {
-    // TODO: Support other type of perf script
-    PerfType = PERF_INVILID;
-  }
-
-  if (BinaryTable.size() > 1) {
-    // TODO: remove this if everything is ready to support multiple binaries.
-    exitWithError("Currently only support one input binary, multiple binaries' "
-                  "profile will be merged in one profile and make profile "
-                  "summary info inaccurate. Please use `perfdata` to merge "
-                  "profiles from multiple binaries.");
+    PerfScriptType Type = checkPerfScriptType(FileName);
+    if (Type == PERF_INVALID)
+      exitWithError("Invalid perf script input!");
+    if (PerfType != PERF_UNKNOWN && PerfType != Type)
+      exitWithError("Inconsistent sample among different perf scripts");
+    PerfType = Type;
   }
 }
 
diff --git a/llvm/tools/llvm-profgen/PerfReader.h b/llvm/tools/llvm-profgen/PerfReader.h
index 7eaa4b846259..b802c212eb46 100644
--- a/llvm/tools/llvm-profgen/PerfReader.h
+++ b/llvm/tools/llvm-profgen/PerfReader.h
@@ -59,9 +59,10 @@ class TraceStream {
 
 // The type of perfscript
 enum PerfScriptType {
-  PERF_INVILID = 0,
-  PERF_LBR = 1,       // Only LBR sample
-  PERF_LBR_STACK = 2, // Hybrid sample including call stack and LBR stack.
+  PERF_UNKNOWN = 0,
+  PERF_INVALID = 1,
+  PERF_LBR = 2,       // Only LBR sample
+  PERF_LBR_STACK = 3, // Hybrid sample including call stack and LBR stack.
 };
 
 // The parsed LBR sample entry.
@@ -502,19 +503,52 @@ using BinarySampleCounterMap =
 class PerfReader {
 
 public:
-  PerfReader(cl::list<std::string> &BinaryFilenames);
-
-  // Hybrid sample(call stack + LBRs) profile traces are seprated by double line
-  // break, search for that within the first 4k charactors to avoid going
-  // through the whole file.
-  static bool isHybridPerfScript(StringRef FileName) {
-    auto BufOrError = MemoryBuffer::getFileOrSTDIN(FileName, 4000);
-    if (!BufOrError)
-      exitWithError(BufOrError.getError(), FileName);
-    auto Buffer = std::move(BufOrError.get());
-    if (Buffer->getBuffer().find("\n\n") == StringRef::npos)
+  PerfReader(cl::list<std::string> &BinaryFilenames,
+             cl::list<std::string> &PerfTraceFilenames);
+
+  // A LBR sample is like:
+  // 0x5c6313f/0x5c63170/P/-/-/0  0x5c630e7/0x5c63130/P/-/-/0 ...
+  // A heuristic for fast detection by checking whether a
+  // leading "  0x" and the '/' exist.
+  static bool isLBRSample(StringRef Line) {
+    if (!Line.startswith(" 0x"))
       return false;
-    return true;
+    if (Line.find('/') != StringRef::npos)
+      return true;
+    return false;
+  }
+
+  // The raw hybird sample is like
+  // e.g.
+  // 	          4005dc    # call stack leaf
+  //	          400634
+  //	          400684    # call stack root
+  // 0x4005c8/0x4005dc/P/-/-/0   0x40062f/0x4005b0/P/-/-/0 ...
+  //          ... 0x4005c8/0x4005dc/P/-/-/0    # LBR Entries
+  // Determine the perfscript contains hybrid samples(call stack + LBRs) by
+  // checking whether there is a non-empty call stack immediately followed by
+  // a LBR sample
+  static PerfScriptType checkPerfScriptType(StringRef FileName) {
+    TraceStream TraceIt(FileName);
+    uint64_t FrameAddr = 0;
+    while (!TraceIt.isAtEoF()) {
+      int32_t Count = 0;
+      while (!TraceIt.isAtEoF() &&
+             !TraceIt.getCurrentLine().ltrim().getAsInteger(16, FrameAddr)) {
+        Count++;
+        TraceIt.advance();
+      }
+      if (!TraceIt.isAtEoF()) {
+        if (isLBRSample(TraceIt.getCurrentLine())) {
+          if (Count > 0)
+            return PERF_LBR_STACK;
+          else
+            return PERF_LBR;
+        }
+        TraceIt.advance();
+      }
+    }
+    return PERF_INVALID;
   }
 
   // The parsed MMap event
@@ -540,6 +574,9 @@ class PerfReader {
   }
 
 private:
+  /// Validate the command line input
+  void validateCommandLine(cl::list<std::string> &BinaryFilenames,
+                           cl::list<std::string> &PerfTraceFilenames);
   /// Parse a single line of a PERF_RECORD_MMAP2 event looking for a
   /// mapping between the binary name and its memory layout.
   ///
@@ -574,7 +611,7 @@ class PerfReader {
   BinarySampleCounterMap BinarySampleCounters;
   // Samples with the repeating time generated by the perf reader
   AggregatedCounter AggregatedSamples;
-  PerfScriptType PerfType;
+  PerfScriptType PerfType = PERF_UNKNOWN;
 };
 
 } // end namespace sampleprof
diff --git a/llvm/tools/llvm-profgen/ProfileGenerator.cpp b/llvm/tools/llvm-profgen/ProfileGenerator.cpp
index 0a7dddc06bfc..553ea71ea1fc 100644
--- a/llvm/tools/llvm-profgen/ProfileGenerator.cpp
+++ b/llvm/tools/llvm-profgen/ProfileGenerator.cpp
@@ -11,6 +11,8 @@
 static cl::opt<std::string> OutputFilename("output", cl::value_desc("output"),
                                            cl::Required,
                                            cl::desc("Output profile file"));
+static cl::alias OutputA("o", cl::desc("Alias for --output"),
+                         cl::aliasopt(OutputFilename));
 
 static cl::opt<SampleProfileFormat> OutputFormat(
     "format", cl::desc("Format of output profile"), cl::init(SPF_Text),
diff --git a/llvm/tools/llvm-profgen/ProfiledBinary.cpp b/llvm/tools/llvm-profgen/ProfiledBinary.cpp
index e1549b14bf05..d7588d680cca 100644
--- a/llvm/tools/llvm-profgen/ProfiledBinary.cpp
+++ b/llvm/tools/llvm-profgen/ProfiledBinary.cpp
@@ -22,16 +22,15 @@
 using namespace llvm;
 using namespace sampleprof;
 
-static cl::opt<bool> ShowDisassembly("show-disassembly", cl::ReallyHidden,
-                                     cl::init(false), cl::ZeroOrMore,
-                                     cl::desc("Print disassembled code."));
+cl::opt<bool> ShowDisassemblyOnly("show-disassembly-only", cl::ReallyHidden,
+                                  cl::init(false), cl::ZeroOrMore,
+                                  cl::desc("Print disassembled code."));
 
-static cl::opt<bool> ShowSourceLocations("show-source-locations",
-                                         cl::ReallyHidden, cl::init(false),
-                                         cl::ZeroOrMore,
-                                         cl::desc("Print source locations."));
+cl::opt<bool> ShowSourceLocations("show-source-locations", cl::ReallyHidden,
+                                  cl::init(false), cl::ZeroOrMore,
+                                  cl::desc("Print source locations."));
 
-static cl::opt<bool> ShowPseudoProbe(
+cl::opt<bool> ShowPseudoProbe(
     "show-pseudo-probe", cl::ReallyHidden, cl::init(false), cl::ZeroOrMore,
     cl::desc("Print pseudo probe section and disassembled info."));
 
@@ -199,7 +198,6 @@ void ProfiledBinary::decodePseudoProbe(const ELFObjectFileBase *Obj) {
 bool ProfiledBinary::dissassembleSymbol(std::size_t SI, ArrayRef<uint8_t> Bytes,
                                         SectionSymbolsTy &Symbols,
                                         const SectionRef &Section) {
-
   std::size_t SE = Symbols.size();
   uint64_t SectionOffset = Section.getAddress() - PreferredBaseAddress;
   uint64_t SectSize = Section.getSize();
@@ -211,7 +209,7 @@ bool ProfiledBinary::dissassembleSymbol(std::size_t SI, ArrayRef<uint8_t> Bytes,
     return true;
 
   std::string &&SymbolName = Symbols[SI].Name.str();
-  if (ShowDisassembly)
+  if (ShowDisassemblyOnly)
     outs() << '<' << SymbolName << ">:\n";
 
   uint64_t Offset = StartOffset;
@@ -223,7 +221,7 @@ bool ProfiledBinary::dissassembleSymbol(std::size_t SI, ArrayRef<uint8_t> Bytes,
                                 Offset + PreferredBaseAddress, nulls()))
       return false;
 
-    if (ShowDisassembly) {
+    if (ShowDisassemblyOnly) {
       if (ShowPseudoProbe) {
         ProbeDecoder.printProbeForAddress(outs(),
                                           Offset + PreferredBaseAddress);
@@ -257,7 +255,7 @@ bool ProfiledBinary::dissassembleSymbol(std::size_t SI, ArrayRef<uint8_t> Bytes,
     Offset += Size;
   }
 
-  if (ShowDisassembly)
+  if (ShowDisassemblyOnly)
     outs() << "\n";
 
   FuncStartAddrMap[StartOffset] = Symbols[SI].Name.str();
@@ -323,7 +321,7 @@ void ProfiledBinary::disassemble(const ELFObjectFileBase *Obj) {
   for (std::pair<const SectionRef, SectionSymbolsTy> &SecSyms : AllSymbols)
     stable_sort(SecSyms.second);
 
-  if (ShowDisassembly)
+  if (ShowDisassemblyOnly)
     outs() << "\nDisassembly of " << FileName << ":\n";
 
   // Dissassemble a text section.
@@ -342,7 +340,7 @@ void ProfiledBinary::disassemble(const ELFObjectFileBase *Obj) {
     // Register the text section.
     TextSections.insert({SectionOffset, SectSize});
 
-    if (ShowDisassembly) {
+    if (ShowDisassemblyOnly) {
       StringRef SectionName = unwrapOrError(Section.getName(), FileName);
       outs() << "\nDisassembly of section " << SectionName;
       outs() << " [" << format("0x%" PRIx64, SectionOffset) << ", "
diff --git a/llvm/tools/llvm-profgen/llvm-profgen.cpp b/llvm/tools/llvm-profgen/llvm-profgen.cpp
index 0f4d8f015439..081f1bb4fcf4 100644
--- a/llvm/tools/llvm-profgen/llvm-profgen.cpp
+++ b/llvm/tools/llvm-profgen/llvm-profgen.cpp
@@ -29,6 +29,8 @@ static cl::list<std::string>
                     llvm::cl::MiscFlags::CommaSeparated,
                     cl::desc("Path of profiled binary files"));
 
+extern cl::opt<bool> ShowDisassemblyOnly;
+
 using namespace llvm;
 using namespace sampleprof;
 
@@ -43,7 +45,9 @@ int main(int argc, const char *argv[]) {
   cl::ParseCommandLineOptions(argc, argv, "llvm SPGO profile generator\n");
 
   // Load binaries and parse perf events and samples
-  PerfReader Reader(BinaryFilenames);
+  PerfReader Reader(BinaryFilenames, PerfTraceFilenames);
+  if (ShowDisassemblyOnly)
+    return EXIT_SUCCESS;
   Reader.parsePerfTraces(PerfTraceFilenames);
 
   std::unique_ptr<ProfileGenerator> Generator = ProfileGenerator::create(

From 610b51c04d3ca6b58555fa30ae52ad9762f9cf86 Mon Sep 17 00:00:00 2001
From: wlei <wlei@fb.com>
Date: Wed, 10 Feb 2021 10:04:39 -0800
Subject: [PATCH 149/318] [CSSPGO][llvm-profgen] Filter out the instructions
 without location info for symbolizer

It appears some instructions doesn't have the debug location info and the symbolizer will return an empty call stack for them which will cause some crash later in profile unwinding. Actually we do not record the sample info for them, so this change just filter out those instruction.

As those instruction would appears at the begin and end of the instruction list, without them we need to add the boundary check for IP `advance` and `backward`.

Also for pseudo probe based profile, we actually don't need the symbolized location info, so here just change to use an empty stack for it. This could save half of the binary loading time.

Differential Revision: https://reviews.llvm.org/D96434
---
 .../tools/llvm-profgen/inline-cs-noprobe.test |  4 +--
 .../llvm-profgen/noinline-cs-noprobe.test     |  8 ++---
 .../recursion-compression-noprobe.test        | 30 +++++++++----------
 llvm/tools/llvm-profgen/PerfReader.cpp        |  4 +++
 llvm/tools/llvm-profgen/ProfileGenerator.cpp  | 21 ++++++++-----
 llvm/tools/llvm-profgen/ProfiledBinary.cpp    | 18 ++++++++---
 llvm/tools/llvm-profgen/ProfiledBinary.h      |  9 ++++--
 7 files changed, 58 insertions(+), 36 deletions(-)

diff --git a/llvm/test/tools/llvm-profgen/inline-cs-noprobe.test b/llvm/test/tools/llvm-profgen/inline-cs-noprobe.test
index d8cc1932f877..cb562e347a3e 100644
--- a/llvm/test/tools/llvm-profgen/inline-cs-noprobe.test
+++ b/llvm/test/tools/llvm-profgen/inline-cs-noprobe.test
@@ -1,12 +1,12 @@
 ; RUN: llvm-profgen --perfscript=%S/Inputs/inline-cs-noprobe.perfscript --binary=%S/Inputs/inline-cs-noprobe.perfbin --output=%t --show-unwinder-output --csprof-cold-thres=0 | FileCheck %s --check-prefix=CHECK-UNWINDER
 ; RUN: FileCheck %s --input-file %t
 
-; CHECK:[main:1 @ foo]:44:0
+; CHECK:[main:1 @ foo]:309:0
 ; CHECK: 2.1: 14
 ; CHECK: 3: 15
 ; CHECK: 3.1: 14 bar:14
 ; CHECK: 3.2: 1
-; CHECK:[main:1 @ foo:3.1 @ bar]:14:0
+; CHECK:[main:1 @ foo:3.1 @ bar]:84:0
 ; CHECK: 1: 14
 
 ; CHECK-UNWINDER: Binary(inline-cs-noprobe.perfbin)'s Range Counter:
diff --git a/llvm/test/tools/llvm-profgen/noinline-cs-noprobe.test b/llvm/test/tools/llvm-profgen/noinline-cs-noprobe.test
index 9d5c787e7f92..c5e6dc1111ca 100644
--- a/llvm/test/tools/llvm-profgen/noinline-cs-noprobe.test
+++ b/llvm/test/tools/llvm-profgen/noinline-cs-noprobe.test
@@ -1,15 +1,15 @@
 ; RUN: llvm-profgen --perfscript=%S/Inputs/noinline-cs-noprobe.perfscript --binary=%S/Inputs/noinline-cs-noprobe.perfbin --output=%t --show-unwinder-output --csprof-cold-thres=0 | FileCheck %s --check-prefix=CHECK-UNWINDER
 ; RUN: FileCheck %s --input-file %t
 
-; CHECK:[main:1 @ foo:3 @ bar]:12:3
+; CHECK:[main:1 @ foo]:54:0
+; CHECK: 2: 3
+; CHECK: 3: 3 bar:3
+; CHECK:[main:1 @ foo:3 @ bar]:50:3
 ; CHECK: 0: 3
 ; CHECK: 1: 3
 ; CHECK: 2: 2
 ; CHECK: 4: 1
 ; CHECK: 5: 3
-; CHECK:[main:1 @ foo]:6:0
-; CHECK: 2: 3
-; CHECK: 3: 3 bar:3
 
 ; CHECK-UNWINDER: Binary(noinline-cs-noprobe.perfbin)'s Range Counter:
 ; CHECK-UNWINDER: main:1 @ foo
diff --git a/llvm/test/tools/llvm-profgen/recursion-compression-noprobe.test b/llvm/test/tools/llvm-profgen/recursion-compression-noprobe.test
index 03bab8407435..15bdd870879e 100644
--- a/llvm/test/tools/llvm-profgen/recursion-compression-noprobe.test
+++ b/llvm/test/tools/llvm-profgen/recursion-compression-noprobe.test
@@ -4,38 +4,38 @@
 ; RUN: llvm-profgen --perfscript=%S/Inputs/recursion-compression-noprobe.perfscript --binary=%S/Inputs/recursion-compression-noprobe.perfbin --output=%t --csprof-cold-thres=0
 ; RUN: FileCheck %s --input-file %t
 
-; CHECK-UNCOMPRESS:[main:1 @ foo:3 @ fa]:14:0
-; CHECK-UNCOMPRESS: 1: 1
-; CHECK-UNCOMPRESS: 2: 13 fb:11
-; CHECK-UNCOMPRESS:[main:1 @ foo:3 @ fa:2 @ fb]:12:0
+; CHECK-UNCOMPRESS:[main:1 @ foo:3 @ fa:2 @ fb]:48:0
 ; CHECK-UNCOMPRESS: 1: 11
 ; CHECK-UNCOMPRESS: 2: 1 fa:1
-; CHECK-UNCOMPRESS:[main:1 @ foo]:3:0
+; CHECK-UNCOMPRESS:[main:1 @ foo:3 @ fa]:24:0
+; CHECK-UNCOMPRESS: 1: 1
+; CHECK-UNCOMPRESS: 2: 13 fb:11
+; CHECK-UNCOMPRESS:[main:1 @ foo]:7:0
 ; CHECK-UNCOMPRESS: 2: 1
 ; CHECK-UNCOMPRESS: 3: 2 fa:1
-; CHECK-UNCOMPRESS:[main:1 @ foo:3 @ fa:2 @ fb:2 @ fa]:3:0
+; CHECK-UNCOMPRESS:[main:1 @ foo:3 @ fa:2 @ fb:2 @ fa]:7:0
 ; CHECK-UNCOMPRESS: 1: 1
 ; CHECK-UNCOMPRESS: 2: 2 fb:1
-; CHECK-UNCOMPRESS:[main:1 @ foo:3 @ fa:2 @ fb:2 @ fa:2 @ fb]:1:0
+; CHECK-UNCOMPRESS:[main:1 @ foo:3 @ fa:2 @ fb:2 @ fa:2 @ fb]:2:0
 ; CHECK-UNCOMPRESS: 2: 1 fa:1
-; CHECK-UNCOMPRESS:[main:1 @ foo:3 @ fa:2 @ fb:2 @ fa:2 @ fb:2 @ fa]:1:0
+; CHECK-UNCOMPRESS:[main:1 @ foo:3 @ fa:2 @ fb:2 @ fa:2 @ fb:2 @ fa]:2:0
 ; CHECK-UNCOMPRESS: 4: 1
 
 
-; CHECK: [main:1 @ foo:3 @ fa]:14:0
-; CHECK:  1: 1
-; CHECK:  2: 13 fb:11
-; CHECK: [main:1 @ foo:3 @ fa:2 @ fb]:12:0
+; CHECK: [main:1 @ foo:3 @ fa:2 @ fb]:48:0
 ; CHECK:  1: 11
 ; CHECK:  2: 1 fa:1
-; CHECK: [main:1 @ foo:3 @ fa:2 @ fb:2 @ fa]:4:0
+; CHECK: [main:1 @ foo:3 @ fa]:24:0
+; CHECK:  1: 1
+; CHECK:  2: 13 fb:11
+; CHECK: [main:1 @ foo:3 @ fa:2 @ fb:2 @ fa]:9:0
 ; CHECK:  1: 1
 ; CHECK:  2: 2 fb:1
 ; CHECK:  4: 1
-; CHECK: [main:1 @ foo]:3:0
+; CHECK: [main:1 @ foo]:7:0
 ; CHECK:  2: 1
 ; CHECK:  3: 2 fa:1
-; CHECK: [main:1 @ foo:3 @ fa:2 @ fb:2 @ fa:2 @ fb]:0:0
+; CHECK: [main:1 @ foo:3 @ fa:2 @ fb:2 @ fa:2 @ fb]:1:0
 
 
 ; original code:
diff --git a/llvm/tools/llvm-profgen/PerfReader.cpp b/llvm/tools/llvm-profgen/PerfReader.cpp
index 2e0b71f38e6d..1f842008db42 100644
--- a/llvm/tools/llvm-profgen/PerfReader.cpp
+++ b/llvm/tools/llvm-profgen/PerfReader.cpp
@@ -93,6 +93,8 @@ std::shared_ptr<StringBasedCtxKey> FrameStack::getContextKey() {
   std::shared_ptr<StringBasedCtxKey> KeyStr =
       std::make_shared<StringBasedCtxKey>();
   KeyStr->Context = Binary->getExpandedContextStr(Stack);
+  if (KeyStr->Context.empty())
+    return nullptr;
   KeyStr->genHashCode();
   return KeyStr;
 }
@@ -116,6 +118,8 @@ void VirtualUnwinder::collectSamplesFromFrame(UnwindState::ProfiledFrame *Cur,
     return;
 
   std::shared_ptr<ContextKey> Key = Stack.getContextKey();
+  if (Key == nullptr)
+    return;
   auto Ret = CtxCounterMap->emplace(Hashable<ContextKey>(Key), SampleCounter());
   SampleCounter &SCounter = Ret.first->second;
   for (auto &Item : Cur->RangeSamples) {
diff --git a/llvm/tools/llvm-profgen/ProfileGenerator.cpp b/llvm/tools/llvm-profgen/ProfileGenerator.cpp
index 553ea71ea1fc..4cfadffebb18 100644
--- a/llvm/tools/llvm-profgen/ProfileGenerator.cpp
+++ b/llvm/tools/llvm-profgen/ProfileGenerator.cpp
@@ -212,7 +212,6 @@ void CSProfileGenerator::updateBodySamplesforFunctionProfile(
     FunctionProfile.addBodySamples(LeafLoc.second.LineOffset,
                                    LeafLoc.second.Discriminator,
                                    Count - PreviousCount);
-    FunctionProfile.addTotalSamples(Count - PreviousCount);
   }
 }
 
@@ -242,9 +241,13 @@ void CSProfileGenerator::populateFunctionBodySamples(
 
     while (IP.Address <= RangeEnd) {
       uint64_t Offset = Binary->virtualAddrToOffset(IP.Address);
-      const FrameLocation &LeafLoc = Binary->getInlineLeafFrameLoc(Offset);
-      // Recording body sample for this specific context
-      updateBodySamplesforFunctionProfile(FunctionProfile, LeafLoc, Count);
+      auto LeafLoc = Binary->getInlineLeafFrameLoc(Offset);
+      if (LeafLoc.hasValue()) {
+        // Recording body sample for this specific context
+        updateBodySamplesforFunctionProfile(FunctionProfile, *LeafLoc, Count);
+      }
+      // Accumulate total sample count even it's a line with invalid debug info
+      FunctionProfile.addTotalSamples(Count);
       // Move to next IP within the range
       IP.advance();
     }
@@ -266,9 +269,11 @@ void CSProfileGenerator::populateFunctionBoundarySamples(
       continue;
 
     // Record called target sample and its count
-    const FrameLocation &LeafLoc = Binary->getInlineLeafFrameLoc(SourceOffset);
-    FunctionProfile.addCalledTargetSamples(LeafLoc.second.LineOffset,
-                                           LeafLoc.second.Discriminator,
+    auto LeafLoc = Binary->getInlineLeafFrameLoc(SourceOffset);
+    if (!LeafLoc.hasValue())
+      continue;
+    FunctionProfile.addCalledTargetSamples(LeafLoc->second.LineOffset,
+                                           LeafLoc->second.Discriminator,
                                            CalleeName, Count);
 
     // Record head sample for called target(callee)
@@ -277,7 +282,7 @@ void CSProfileGenerator::populateFunctionBoundarySamples(
       OCalleeCtxStr << ContextId.rsplit(" @ ").first.str();
       OCalleeCtxStr << " @ ";
     }
-    OCalleeCtxStr << getCallSite(LeafLoc) << " @ " << CalleeName.str();
+    OCalleeCtxStr << getCallSite(*LeafLoc) << " @ " << CalleeName.str();
 
     FunctionSamples &CalleeProfile =
         getFunctionProfileForContext(OCalleeCtxStr.str());
diff --git a/llvm/tools/llvm-profgen/ProfiledBinary.cpp b/llvm/tools/llvm-profgen/ProfiledBinary.cpp
index d7588d680cca..df6ef2a7699b 100644
--- a/llvm/tools/llvm-profgen/ProfiledBinary.cpp
+++ b/llvm/tools/llvm-profgen/ProfiledBinary.cpp
@@ -119,7 +119,8 @@ bool ProfiledBinary::inlineContextEqual(uint64_t Address1,
   const FrameLocationStack &Context2 = getFrameLocationStack(Offset2);
   if (Context1.size() != Context2.size())
     return false;
-
+  if (Context1.empty())
+    return false;
   // The leaf frame contains location within the leaf, and it
   // needs to be remove that as it's not part of the calling context
   return std::equal(Context1.begin(), Context1.begin() + Context1.size() - 1,
@@ -134,6 +135,10 @@ std::string ProfiledBinary::getExpandedContextStr(
   for (auto Address : Stack) {
     uint64_t Offset = virtualAddrToOffset(Address);
     const FrameLocationStack &ExpandedContext = getFrameLocationStack(Offset);
+    // An instruction without a valid debug line will be ignored by sample
+    // processing
+    if (ExpandedContext.empty())
+      return std::string();
     for (const auto &Loc : ExpandedContext) {
       ContextVec.push_back(getCallSite(Loc));
     }
@@ -242,9 +247,14 @@ bool ProfiledBinary::dissassembleSymbol(std::size_t SI, ArrayRef<uint8_t> Bytes,
     const MCInstrDesc &MCDesc = MII->get(Inst.getOpcode());
 
     // Populate a vector of the symbolized callsite at this location
-    InstructionPointer IP(this, Offset);
-    Offset2LocStackMap[Offset] = symbolize(IP, true);
-
+    // We don't need symbolized info for probe-based profile, just use an empty
+    // stack as an entry to indicate a valid binary offset
+    FrameLocationStack SymbolizedCallStack;
+    if (!UsePseudoProbes) {
+      InstructionPointer IP(this, Offset);
+      SymbolizedCallStack = symbolize(IP, true);
+    }
+    Offset2LocStackMap[Offset] = SymbolizedCallStack;
     // Populate address maps.
     CodeAddrs.push_back(Offset);
     if (MCDesc.isCall())
diff --git a/llvm/tools/llvm-profgen/ProfiledBinary.h b/llvm/tools/llvm-profgen/ProfiledBinary.h
index f6c7460e186d..ccb1c9d32e46 100644
--- a/llvm/tools/llvm-profgen/ProfiledBinary.h
+++ b/llvm/tools/llvm-profgen/ProfiledBinary.h
@@ -11,6 +11,7 @@
 
 #include "CallContext.h"
 #include "PseudoProbe.h"
+#include "llvm/ADT/Optional.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/DebugInfo/Symbolize/Symbolize.h"
 #include "llvm/MC/MCAsmInfo.h"
@@ -225,9 +226,11 @@ class ProfiledBinary {
     return FuncStartAddrMap[Offset];
   }
 
-  const FrameLocation &getInlineLeafFrameLoc(uint64_t Offset,
-                                             bool NameOnly = false) {
-    return getFrameLocationStack(Offset).back();
+  Optional<const FrameLocation> getInlineLeafFrameLoc(uint64_t Offset) {
+    const auto &Stack = getFrameLocationStack(Offset);
+    if (Stack.empty())
+      return {};
+    return Stack.back();
   }
 
   // Compare two addresses' inline context

From b5b31112bf63debaa905e42785317a947e696252 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Fri, 19 Feb 2021 21:48:46 -0800
Subject: [PATCH 150/318] [clang] Add -ffinite-loops & -fno-finite-loops
 options.

This cherry-picks the following patches on the release branch:

6280bb4cd80e [clang] Remove redundant condition (NFC).
51bf4c0e6d4c [clang] Add -ffinite-loops & -fno-finite-loops options.
fb4d8fe80701 [clang] Update mustprogress tests

This patch adds 2 new options to control when Clang adds `mustprogress`:

  1. -ffinite-loops: assume all loops are finite; mustprogress is added
     to all loops, regardless of the selected language standard.
  2. -fno-finite-loops: assume no loop is finite; mustprogress is not
     added to any loop or function. We could add mustprogress to
     functions without loops, but we would have to detect that in Clang,
     which is probably not worth it.

Differential Revision: https://reviews.llvm.org/D96850
---
 clang/include/clang/Basic/CodeGenOptions.def |   3 +
 clang/include/clang/Basic/CodeGenOptions.h   |   6 +
 clang/include/clang/Driver/Options.td        |   5 +
 clang/lib/CodeGen/CodeGenFunction.h          |  11 +
 clang/lib/Driver/ToolChains/Clang.cpp        |   3 +
 clang/lib/Frontend/CompilerInvocation.cpp    |   5 +-
 clang/test/CodeGen/attr-mustprogress-0.c     | 184 -----------
 clang/test/CodeGen/attr-mustprogress-0.cpp   | 183 ----------
 clang/test/CodeGen/attr-mustprogress-1.c     | 197 -----------
 clang/test/CodeGen/attr-mustprogress-1.cpp   | 271 ---------------
 clang/test/CodeGen/attr-mustprogress.c       | 221 +++++++++++++
 clang/test/CodeGenCXX/attr-mustprogress.cpp  | 330 +++++++++++++++++++
 12 files changed, 583 insertions(+), 836 deletions(-)
 delete mode 100644 clang/test/CodeGen/attr-mustprogress-0.c
 delete mode 100644 clang/test/CodeGen/attr-mustprogress-0.cpp
 delete mode 100644 clang/test/CodeGen/attr-mustprogress-1.c
 delete mode 100644 clang/test/CodeGen/attr-mustprogress-1.cpp
 create mode 100644 clang/test/CodeGen/attr-mustprogress.c
 create mode 100644 clang/test/CodeGenCXX/attr-mustprogress.cpp

diff --git a/clang/include/clang/Basic/CodeGenOptions.def b/clang/include/clang/Basic/CodeGenOptions.def
index 5c8af65326ed..9d53b5b923bb 100644
--- a/clang/include/clang/Basic/CodeGenOptions.def
+++ b/clang/include/clang/Basic/CodeGenOptions.def
@@ -266,6 +266,9 @@ CODEGENOPT(VectorizeLoop     , 1, 0) ///< Run loop vectorizer.
 CODEGENOPT(VectorizeSLP      , 1, 0) ///< Run SLP vectorizer.
 CODEGENOPT(ProfileSampleAccurate, 1, 0) ///< Sample profile is accurate.
 
+/// Treat loops as finite: language, always, never.
+ENUM_CODEGENOPT(FiniteLoops, FiniteLoopsKind, 2, FiniteLoopsKind::Language)
+
   /// Attempt to use register sized accesses to bit-fields in structures, when
   /// possible.
 CODEGENOPT(UseRegisterSizedBitfieldAccess , 1, 0)
diff --git a/clang/include/clang/Basic/CodeGenOptions.h b/clang/include/clang/Basic/CodeGenOptions.h
index 73d41e3293c6..c550817f0f69 100644
--- a/clang/include/clang/Basic/CodeGenOptions.h
+++ b/clang/include/clang/Basic/CodeGenOptions.h
@@ -140,6 +140,12 @@ class CodeGenOptions : public CodeGenOptionsBase {
     All,         // Keep all frame pointers.
   };
 
+  enum FiniteLoopsKind {
+    Language, // Not specified, use language standard.
+    Always,   // All loops are assumed to be finite.
+    Never,    // No loop is assumed to be finite.
+  };
+
   /// The code model to use (-mcmodel).
   std::string CodeModel;
 
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 1f6c13d5cc96..817798926650 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -2410,6 +2410,11 @@ def fno_unroll_loops : Flag<["-"], "fno-unroll-loops">, Group<f_Group>,
 defm reroll_loops : BoolFOption<"reroll-loops",
   CodeGenOpts<"RerollLoops">, DefaultFalse,
   PosFlag<SetTrue, [CC1Option], "Turn on loop reroller">, NegFlag<SetFalse>>;
+def ffinite_loops: Flag<["-"],  "ffinite-loops">, Group<f_Group>,
+  HelpText<"Assume all loops are finite.">, Flags<[CC1Option]>;
+def fno_finite_loops: Flag<["-"], "fno-finite-loops">, Group<f_Group>,
+  HelpText<"Do not assume that any loop is finite.">, Flags<[CC1Option]>;
+
 def ftrigraphs : Flag<["-"], "ftrigraphs">, Group<f_Group>,
   HelpText<"Process trigraph sequences">, Flags<[CC1Option]>;
 def fno_trigraphs : Flag<["-"], "fno-trigraphs">, Group<f_Group>,
diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h
index 8eb7adbc8fcb..95c0b7b4d7c0 100644
--- a/clang/lib/CodeGen/CodeGenFunction.h
+++ b/clang/lib/CodeGen/CodeGenFunction.h
@@ -507,12 +507,23 @@ class CodeGenFunction : public CodeGenTypeCache {
 
   /// True if the C++ Standard Requires Progress.
   bool CPlusPlusWithProgress() {
+    if (CGM.getCodeGenOpts().getFiniteLoops() ==
+        CodeGenOptions::FiniteLoopsKind::Never)
+      return false;
+
     return getLangOpts().CPlusPlus11 || getLangOpts().CPlusPlus14 ||
            getLangOpts().CPlusPlus17 || getLangOpts().CPlusPlus20;
   }
 
   /// True if the C Standard Requires Progress.
   bool CWithProgress() {
+    if (CGM.getCodeGenOpts().getFiniteLoops() ==
+        CodeGenOptions::FiniteLoopsKind::Always)
+      return true;
+    if (CGM.getCodeGenOpts().getFiniteLoops() ==
+        CodeGenOptions::FiniteLoopsKind::Never)
+      return false;
+
     return getLangOpts().C11 || getLangOpts().C17 || getLangOpts().C2x;
   }
 
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index f8e637974662..1976b48e0f6a 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -5620,6 +5620,9 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
     if (A->getOption().matches(options::OPT_freroll_loops))
       CmdArgs.push_back("-freroll-loops");
 
+  Args.AddLastArg(CmdArgs, options::OPT_ffinite_loops,
+                  options::OPT_fno_finite_loops);
+
   Args.AddLastArg(CmdArgs, options::OPT_fwritable_strings);
   Args.AddLastArg(CmdArgs, options::OPT_funroll_loops,
                   options::OPT_fno_unroll_loops);
diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp
index 036388ebd355..5c5cf46150e2 100644
--- a/clang/lib/Frontend/CompilerInvocation.cpp
+++ b/clang/lib/Frontend/CompilerInvocation.cpp
@@ -1037,7 +1037,6 @@ bool CompilerInvocation::ParseCodeGenArgs(CodeGenOptions &Opts, ArgList &Args,
   Opts.UnrollLoops =
       Args.hasFlag(OPT_funroll_loops, OPT_fno_unroll_loops,
                    (Opts.OptimizationLevel > 1));
-
   Opts.BinutilsVersion =
       std::string(Args.getLastArgValue(OPT_fbinutils_version_EQ));
 
@@ -1324,6 +1323,10 @@ bool CompilerInvocation::ParseCodeGenArgs(CodeGenOptions &Opts, ArgList &Args,
 
   Opts.EmitVersionIdentMetadata = Args.hasFlag(OPT_Qy, OPT_Qn, true);
 
+  if (Args.hasArg(options::OPT_ffinite_loops))
+    Opts.FiniteLoops = CodeGenOptions::FiniteLoopsKind::Always;
+  else if (Args.hasArg(options::OPT_fno_finite_loops))
+    Opts.FiniteLoops = CodeGenOptions::FiniteLoopsKind::Never;
   return Success;
 }
 
diff --git a/clang/test/CodeGen/attr-mustprogress-0.c b/clang/test/CodeGen/attr-mustprogress-0.c
deleted file mode 100644
index 2af24e88ceef..000000000000
--- a/clang/test/CodeGen/attr-mustprogress-0.c
+++ /dev/null
@@ -1,184 +0,0 @@
-// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-attributes
-// RUN: %clang_cc1 -std=c89 -triple=x86_64-unknown-linux-gnu -S -emit-llvm %s -o - | FileCheck %s
-// RUN: %clang_cc1 -std=c99 -triple=x86_64-unknown-linux-gnu -S -emit-llvm %s -o - | FileCheck %s
-
-int a = 0;
-int b = 0;
-
-// CHECK: Function Attrs: noinline nounwind optnone
-// CHECK-LABEL: @f1(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    br label [[FOR_COND:%.*]]
-// CHECK:       for.cond:
-// CHECK-NEXT:    br i1 true, label [[FOR_BODY:%.*]], label [[FOR_END:%.*]]
-// CHECK:       for.body:
-// CHECK-NEXT:    br label [[FOR_COND]]
-// CHECK:       for.end:
-// CHECK-NEXT:    ret void
-//
-void f1() {
-  for (; 1;) {
-  }
-}
-
-// CHECK: Function Attrs: noinline nounwind optnone
-// CHECK-LABEL: @f2(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    br label [[FOR_COND:%.*]]
-// CHECK:       for.cond:
-// CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* @a, align 4
-// CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* @b, align 4
-// CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP0]], [[TMP1]]
-// CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]]
-// CHECK:       for.body:
-// CHECK-NEXT:    br label [[FOR_COND]]
-// CHECK:       for.end:
-// CHECK-NEXT:    ret void
-//
-void f2() {
-  for (; a == b;) {
-  }
-}
-
-// CHECK: Function Attrs: noinline nounwind optnone
-// CHECK-LABEL: @F(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    br label [[FOR_COND:%.*]]
-// CHECK:       for.cond:
-// CHECK-NEXT:    br i1 true, label [[FOR_BODY:%.*]], label [[FOR_END:%.*]]
-// CHECK:       for.body:
-// CHECK-NEXT:    br label [[FOR_COND]]
-// CHECK:       for.end:
-// CHECK-NEXT:    br label [[FOR_COND1:%.*]]
-// CHECK:       for.cond1:
-// CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* @a, align 4
-// CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* @b, align 4
-// CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP0]], [[TMP1]]
-// CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY2:%.*]], label [[FOR_END3:%.*]]
-// CHECK:       for.body2:
-// CHECK-NEXT:    br label [[FOR_COND1]]
-// CHECK:       for.end3:
-// CHECK-NEXT:    ret void
-//
-void F() {
-  for (; 1;) {
-  }
-  for (; a == b;) {
-  }
-}
-
-// CHECK: Function Attrs: noinline nounwind optnone
-// CHECK-LABEL: @w1(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    br label [[WHILE_BODY:%.*]]
-// CHECK:       while.body:
-// CHECK-NEXT:    br label [[WHILE_BODY]]
-//
-void w1() {
-  while (1) {
-  }
-}
-
-// CHECK: Function Attrs: noinline nounwind optnone
-// CHECK-LABEL: @w2(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    br label [[WHILE_COND:%.*]]
-// CHECK:       while.cond:
-// CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* @a, align 4
-// CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* @b, align 4
-// CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP0]], [[TMP1]]
-// CHECK-NEXT:    br i1 [[CMP]], label [[WHILE_BODY:%.*]], label [[WHILE_END:%.*]]
-// CHECK:       while.body:
-// CHECK-NEXT:    br label [[WHILE_COND]]
-// CHECK:       while.end:
-// CHECK-NEXT:    ret void
-//
-void w2() {
-  while (a == b) {
-  }
-}
-
-// CHECK: Function Attrs: noinline nounwind optnone
-// CHECK-LABEL: @W(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    br label [[WHILE_COND:%.*]]
-// CHECK:       while.cond:
-// CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* @a, align 4
-// CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* @b, align 4
-// CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP0]], [[TMP1]]
-// CHECK-NEXT:    br i1 [[CMP]], label [[WHILE_BODY:%.*]], label [[WHILE_END:%.*]]
-// CHECK:       while.body:
-// CHECK-NEXT:    br label [[WHILE_COND]]
-// CHECK:       while.end:
-// CHECK-NEXT:    br label [[WHILE_BODY2:%.*]]
-// CHECK:       while.body2:
-// CHECK-NEXT:    br label [[WHILE_BODY2]]
-//
-void W() {
-  while (a == b) {
-  }
-  while (1) {
-  }
-}
-
-// CHECK: Function Attrs: noinline nounwind optnone
-// CHECK-LABEL: @d1(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    br label [[DO_BODY:%.*]]
-// CHECK:       do.body:
-// CHECK-NEXT:    br label [[DO_COND:%.*]]
-// CHECK:       do.cond:
-// CHECK-NEXT:    br i1 true, label [[DO_BODY]], label [[DO_END:%.*]]
-// CHECK:       do.end:
-// CHECK-NEXT:    ret void
-//
-void d1() {
-  do {
-  } while (1);
-}
-
-// CHECK: Function Attrs: noinline nounwind optnone
-// CHECK-LABEL: @d2(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    br label [[DO_BODY:%.*]]
-// CHECK:       do.body:
-// CHECK-NEXT:    br label [[DO_COND:%.*]]
-// CHECK:       do.cond:
-// CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* @a, align 4
-// CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* @b, align 4
-// CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP0]], [[TMP1]]
-// CHECK-NEXT:    br i1 [[CMP]], label [[DO_BODY]], label [[DO_END:%.*]]
-// CHECK:       do.end:
-// CHECK-NEXT:    ret void
-//
-void d2() {
-  do {
-  } while (a == b);
-}
-
-// CHECK: Function Attrs: noinline nounwind optnone
-// CHECK-LABEL: @D(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    br label [[DO_BODY:%.*]]
-// CHECK:       do.body:
-// CHECK-NEXT:    br label [[DO_COND:%.*]]
-// CHECK:       do.cond:
-// CHECK-NEXT:    br i1 true, label [[DO_BODY]], label [[DO_END:%.*]]
-// CHECK:       do.end:
-// CHECK-NEXT:    br label [[DO_BODY1:%.*]]
-// CHECK:       do.body1:
-// CHECK-NEXT:    br label [[DO_COND2:%.*]]
-// CHECK:       do.cond2:
-// CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* @a, align 4
-// CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* @b, align 4
-// CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP0]], [[TMP1]]
-// CHECK-NEXT:    br i1 [[CMP]], label [[DO_BODY1]], label [[DO_END3:%.*]]
-// CHECK:       do.end3:
-// CHECK-NEXT:    ret void
-//
-void D() {
-  do {
-  } while (1);
-  do {
-  } while (a == b);
-}
diff --git a/clang/test/CodeGen/attr-mustprogress-0.cpp b/clang/test/CodeGen/attr-mustprogress-0.cpp
deleted file mode 100644
index 3a180cc6b5ad..000000000000
--- a/clang/test/CodeGen/attr-mustprogress-0.cpp
+++ /dev/null
@@ -1,183 +0,0 @@
-// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-attributes
-// RUN: %clang_cc1 -std=c++98 -triple=x86_64-unknown-linux-gnu -S -emit-llvm %s -o - | FileCheck %s
-
-int a = 0;
-int b = 0;
-
-// CHECK: Function Attrs: noinline nounwind optnone
-// CHECK-LABEL: @_Z2f1v(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    br label [[FOR_COND:%.*]]
-// CHECK:       for.cond:
-// CHECK-NEXT:    br i1 true, label [[FOR_BODY:%.*]], label [[FOR_END:%.*]]
-// CHECK:       for.body:
-// CHECK-NEXT:    br label [[FOR_COND]]
-// CHECK:       for.end:
-// CHECK-NEXT:    ret void
-//
-void f1() {
-  for (; 1;) {
-  }
-}
-
-// CHECK: Function Attrs: noinline nounwind optnone
-// CHECK-LABEL: @_Z2f2v(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    br label [[FOR_COND:%.*]]
-// CHECK:       for.cond:
-// CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* @a, align 4
-// CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* @b, align 4
-// CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP0]], [[TMP1]]
-// CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]]
-// CHECK:       for.body:
-// CHECK-NEXT:    br label [[FOR_COND]]
-// CHECK:       for.end:
-// CHECK-NEXT:    ret void
-//
-void f2() {
-  for (; a == b;) {
-  }
-}
-
-// CHECK: Function Attrs: noinline nounwind optnone
-// CHECK-LABEL: @_Z1Fv(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    br label [[FOR_COND:%.*]]
-// CHECK:       for.cond:
-// CHECK-NEXT:    br i1 true, label [[FOR_BODY:%.*]], label [[FOR_END:%.*]]
-// CHECK:       for.body:
-// CHECK-NEXT:    br label [[FOR_COND]]
-// CHECK:       for.end:
-// CHECK-NEXT:    br label [[FOR_COND1:%.*]]
-// CHECK:       for.cond1:
-// CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* @a, align 4
-// CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* @b, align 4
-// CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP0]], [[TMP1]]
-// CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY2:%.*]], label [[FOR_END3:%.*]]
-// CHECK:       for.body2:
-// CHECK-NEXT:    br label [[FOR_COND1]]
-// CHECK:       for.end3:
-// CHECK-NEXT:    ret void
-//
-void F() {
-  for (; 1;) {
-  }
-  for (; a == b;) {
-  }
-}
-
-// CHECK: Function Attrs: noinline nounwind optnone
-// CHECK-LABEL: @_Z2w1v(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    br label [[WHILE_BODY:%.*]]
-// CHECK:       while.body:
-// CHECK-NEXT:    br label [[WHILE_BODY]]
-//
-void w1() {
-  while (1) {
-  }
-}
-
-// CHECK: Function Attrs: noinline nounwind optnone
-// CHECK-LABEL: @_Z2w2v(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    br label [[WHILE_COND:%.*]]
-// CHECK:       while.cond:
-// CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* @a, align 4
-// CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* @b, align 4
-// CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP0]], [[TMP1]]
-// CHECK-NEXT:    br i1 [[CMP]], label [[WHILE_BODY:%.*]], label [[WHILE_END:%.*]]
-// CHECK:       while.body:
-// CHECK-NEXT:    br label [[WHILE_COND]]
-// CHECK:       while.end:
-// CHECK-NEXT:    ret void
-//
-void w2() {
-  while (a == b) {
-  }
-}
-
-// CHECK: Function Attrs: noinline nounwind optnone
-// CHECK-LABEL: @_Z1Wv(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    br label [[WHILE_COND:%.*]]
-// CHECK:       while.cond:
-// CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* @a, align 4
-// CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* @b, align 4
-// CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP0]], [[TMP1]]
-// CHECK-NEXT:    br i1 [[CMP]], label [[WHILE_BODY:%.*]], label [[WHILE_END:%.*]]
-// CHECK:       while.body:
-// CHECK-NEXT:    br label [[WHILE_COND]]
-// CHECK:       while.end:
-// CHECK-NEXT:    br label [[WHILE_BODY2:%.*]]
-// CHECK:       while.body2:
-// CHECK-NEXT:    br label [[WHILE_BODY2]]
-//
-void W() {
-  while (a == b) {
-  }
-  while (1) {
-  }
-}
-
-// CHECK: Function Attrs: noinline nounwind optnone
-// CHECK-LABEL: @_Z2d1v(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    br label [[DO_BODY:%.*]]
-// CHECK:       do.body:
-// CHECK-NEXT:    br label [[DO_COND:%.*]]
-// CHECK:       do.cond:
-// CHECK-NEXT:    br i1 true, label [[DO_BODY]], label [[DO_END:%.*]]
-// CHECK:       do.end:
-// CHECK-NEXT:    ret void
-//
-void d1() {
-  do {
-  } while (1);
-}
-
-// CHECK: Function Attrs: noinline nounwind optnone
-// CHECK-LABEL: @_Z2d2v(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    br label [[DO_BODY:%.*]]
-// CHECK:       do.body:
-// CHECK-NEXT:    br label [[DO_COND:%.*]]
-// CHECK:       do.cond:
-// CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* @a, align 4
-// CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* @b, align 4
-// CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP0]], [[TMP1]]
-// CHECK-NEXT:    br i1 [[CMP]], label [[DO_BODY]], label [[DO_END:%.*]]
-// CHECK:       do.end:
-// CHECK-NEXT:    ret void
-//
-void d2() {
-  do {
-  } while (a == b);
-}
-
-// CHECK: Function Attrs: noinline nounwind optnone
-// CHECK-LABEL: @_Z1Dv(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    br label [[DO_BODY:%.*]]
-// CHECK:       do.body:
-// CHECK-NEXT:    br label [[DO_COND:%.*]]
-// CHECK:       do.cond:
-// CHECK-NEXT:    br i1 true, label [[DO_BODY]], label [[DO_END:%.*]]
-// CHECK:       do.end:
-// CHECK-NEXT:    br label [[DO_BODY1:%.*]]
-// CHECK:       do.body1:
-// CHECK-NEXT:    br label [[DO_COND2:%.*]]
-// CHECK:       do.cond2:
-// CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* @a, align 4
-// CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* @b, align 4
-// CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP0]], [[TMP1]]
-// CHECK-NEXT:    br i1 [[CMP]], label [[DO_BODY1]], label [[DO_END3:%.*]]
-// CHECK:       do.end3:
-// CHECK-NEXT:    ret void
-//
-void D() {
-  do {
-  } while (1);
-  do {
-  } while (a == b);
-}
diff --git a/clang/test/CodeGen/attr-mustprogress-1.c b/clang/test/CodeGen/attr-mustprogress-1.c
deleted file mode 100644
index 2ff068b8b90a..000000000000
--- a/clang/test/CodeGen/attr-mustprogress-1.c
+++ /dev/null
@@ -1,197 +0,0 @@
-// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-attributes
-// RUN: %clang_cc1 -triple=x86_64-unknown-linux-gnu -S -emit-llvm %s -o - | FileCheck %s
-// RUN: %clang_cc1 -std=c11 -triple=x86_64-unknown-linux-gnu -S -emit-llvm %s -o - | FileCheck %s
-// RUN: %clang_cc1 -std=c18 -triple=x86_64-unknown-linux-gnu -S -emit-llvm %s -o - | FileCheck %s
-// RUN: %clang_cc1 -std=c2x -triple=x86_64-unknown-linux-gnu -S -emit-llvm %s -o - | FileCheck %s
-
-int a = 0;
-int b = 0;
-
-// CHECK: Function Attrs: noinline nounwind optnone
-// CHECK-LABEL: @f0(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    br label [[FOR_COND:%.*]]
-// CHECK:       for.cond:
-// CHECK-NOT:    br label [[FOR_COND]], !llvm.loop !{{.*}}
-//
-void f0() {
-  for (; ;) ;
-}
-
-// CHECK: Function Attrs: noinline nounwind optnone
-// CHECK-LABEL: @f1(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    br label [[FOR_COND:%.*]]
-// CHECK:       for.cond:
-// CHECK-NEXT:    br i1 true, label [[FOR_BODY:%.*]], label [[FOR_END:%.*]]
-// CHECK:       for.body:
-// CHECK-NEXT:    br label [[FOR_COND]]
-// CHECK:       for.end:
-// CHECK-NEXT:    ret void
-//
-void f1() {
-  for (; 1;) {
-  }
-}
-
-// CHECK: Function Attrs: noinline nounwind optnone
-// CHECK-LABEL: @f2(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    br label [[FOR_COND:%.*]]
-// CHECK:       for.cond:
-// CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* @a, align 4
-// CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* @b, align 4
-// CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP0]], [[TMP1]]
-// CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]]
-// CHECK:       for.body:
-// CHECK-NEXT:    br label [[FOR_COND]], [[LOOP2:!llvm.loop !.*]]
-// CHECK:       for.end:
-// CHECK-NEXT:    ret void
-//
-void f2() {
-  for (; a == b;) {
-  }
-}
-
-// CHECK: Function Attrs: noinline nounwind optnone
-// CHECK-LABEL: @F(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    br label [[FOR_COND:%.*]]
-// CHECK:       for.cond:
-// CHECK-NEXT:    br i1 true, label [[FOR_BODY:%.*]], label [[FOR_END:%.*]]
-// CHECK:       for.body:
-// CHECK-NEXT:    br label [[FOR_COND]]
-// CHECK:       for.end:
-// CHECK-NEXT:    br label [[FOR_COND1:%.*]]
-// CHECK:       for.cond1:
-// CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* @a, align 4
-// CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* @b, align 4
-// CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP0]], [[TMP1]]
-// CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY2:%.*]], label [[FOR_END3:%.*]]
-// CHECK:       for.body2:
-// CHECK-NEXT:    br label [[FOR_COND1]], [[LOOP4:!llvm.loop !.*]]
-// CHECK:       for.end3:
-// CHECK-NEXT:    ret void
-//
-void F() {
-  for (; 1;) {
-  }
-  for (; a == b;) {
-  }
-}
-
-// CHECK: Function Attrs: noinline nounwind optnone
-// CHECK-LABEL: @w1(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    br label [[WHILE_BODY:%.*]]
-// CHECK:       while.body:
-// CHECK-NEXT:    br label [[WHILE_BODY]]
-//
-void w1() {
-  while (1) {
-  }
-}
-
-// CHECK: Function Attrs: noinline nounwind optnone
-// CHECK-LABEL: @w2(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    br label [[WHILE_COND:%.*]]
-// CHECK:       while.cond:
-// CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* @a, align 4
-// CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* @b, align 4
-// CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP0]], [[TMP1]]
-// CHECK-NEXT:    br i1 [[CMP]], label [[WHILE_BODY:%.*]], label [[WHILE_END:%.*]]
-// CHECK:       while.body:
-// CHECK-NEXT:    br label [[WHILE_COND]], [[LOOP5:!llvm.loop !.*]]
-// CHECK:       while.end:
-// CHECK-NEXT:    ret void
-//
-void w2() {
-  while (a == b) {
-  }
-}
-
-// CHECK: Function Attrs: noinline nounwind optnone
-// CHECK-LABEL: @W(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    br label [[WHILE_COND:%.*]]
-// CHECK:       while.cond:
-// CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* @a, align 4
-// CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* @b, align 4
-// CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP0]], [[TMP1]]
-// CHECK-NEXT:    br i1 [[CMP]], label [[WHILE_BODY:%.*]], label [[WHILE_END:%.*]]
-// CHECK:       while.body:
-// CHECK-NEXT:    br label [[WHILE_COND]], [[LOOP6:!llvm.loop !.*]]
-// CHECK:       while.end:
-// CHECK-NEXT:    br label [[WHILE_BODY2:%.*]]
-// CHECK:       while.body2:
-// CHECK-NEXT:    br label [[WHILE_BODY2]]
-//
-void W() {
-  while (a == b) {
-  }
-  while (1) {
-  }
-}
-
-// CHECK: Function Attrs: noinline nounwind optnone
-// CHECK-LABEL: @d1(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    br label [[DO_BODY:%.*]]
-// CHECK:       do.body:
-// CHECK-NEXT:    br label [[DO_COND:%.*]]
-// CHECK:       do.cond:
-// CHECK-NEXT:    br i1 true, label [[DO_BODY]], label [[DO_END:%.*]]
-// CHECK:       do.end:
-// CHECK-NEXT:    ret void
-//
-void d1() {
-  do {
-  } while (1);
-}
-
-// CHECK: Function Attrs: noinline nounwind optnone
-// CHECK-LABEL: @d2(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    br label [[DO_BODY:%.*]]
-// CHECK:       do.body:
-// CHECK-NEXT:    br label [[DO_COND:%.*]]
-// CHECK:       do.cond:
-// CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* @a, align 4
-// CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* @b, align 4
-// CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP0]], [[TMP1]]
-// CHECK-NEXT:    br i1 [[CMP]], label [[DO_BODY]], label [[DO_END:%.*]], [[LOOP7:!llvm.loop !.*]]
-// CHECK:       do.end:
-// CHECK-NEXT:    ret void
-//
-void d2() {
-  do {
-  } while (a == b);
-}
-
-// CHECK: Function Attrs: noinline nounwind optnone
-// CHECK-LABEL: @D(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    br label [[DO_BODY:%.*]]
-// CHECK:       do.body:
-// CHECK-NEXT:    br label [[DO_COND:%.*]]
-// CHECK:       do.cond:
-// CHECK-NEXT:    br i1 true, label [[DO_BODY]], label [[DO_END:%.*]]
-// CHECK:       do.end:
-// CHECK-NEXT:    br label [[DO_BODY1:%.*]]
-// CHECK:       do.body1:
-// CHECK-NEXT:    br label [[DO_COND2:%.*]]
-// CHECK:       do.cond2:
-// CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* @a, align 4
-// CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* @b, align 4
-// CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP0]], [[TMP1]]
-// CHECK-NEXT:    br i1 [[CMP]], label [[DO_BODY1]], label [[DO_END3:%.*]], [[LOOP8:!llvm.loop !.*]]
-// CHECK:       do.end3:
-// CHECK-NEXT:    ret void
-//
-void D() {
-  do {
-  } while (1);
-  do {
-  } while (a == b);
-}
diff --git a/clang/test/CodeGen/attr-mustprogress-1.cpp b/clang/test/CodeGen/attr-mustprogress-1.cpp
deleted file mode 100644
index 945d74663c6d..000000000000
--- a/clang/test/CodeGen/attr-mustprogress-1.cpp
+++ /dev/null
@@ -1,271 +0,0 @@
-// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-attributes
-// RUN: %clang_cc1 -std=c++11 -triple=x86_64-unknown-linux-gnu -S -emit-llvm %s -o - | FileCheck %s
-// RUN: %clang_cc1 -std=c++14 -triple=x86_64-unknown-linux-gnu -S -emit-llvm %s -o - | FileCheck %s
-// RUN: %clang_cc1 -std=c++17 -triple=x86_64-unknown-linux-gnu -S -emit-llvm %s -o - | FileCheck %s
-// RUN: %clang_cc1 -std=c++20 -triple=x86_64-unknown-linux-gnu -S -emit-llvm %s -o - | FileCheck %s
-
-int a = 0;
-int b = 0;
-
-// CHECK: Function Attrs: noinline nounwind optnone
-// CHECK-LABEL: @_Z2f0v(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    br label [[FOR_COND:%.*]]
-// CHECK:       for.cond:
-// CHECK-NOT:    br label [[FOR_COND]], !llvm.loop !{{.*}}
-void f0() {
-  for (; ;) ;
-}
-
-// CHECK: Function Attrs: noinline nounwind optnone
-// CHECK-LABEL: @_Z2f1v(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    br label [[FOR_COND:%.*]]
-// CHECK:       for.cond:
-// CHECK-NEXT:    br i1 true, label [[FOR_BODY:%.*]], label [[FOR_END:%.*]]
-// CHECK:       for.body:
-// CHECK-NEXT:    br label [[FOR_COND]]
-// CHECK:       for.end:
-// CHECK-NEXT:    ret void
-//
-void f1() {
-  for (; 1;)
-    ;
-}
-
-// CHECK: Function Attrs: noinline nounwind optnone mustprogress
-// CHECK-LABEL: @_Z2f2v(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    br label [[FOR_COND:%.*]]
-// CHECK:       for.cond:
-// CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* @a, align 4
-// CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* @b, align 4
-// CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP0]], [[TMP1]]
-// CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]]
-// CHECK:       for.body:
-// CHECK-NEXT:    br label [[FOR_COND]], [[LOOP2:!llvm.loop !.*]]
-// CHECK:       for.end:
-// CHECK-NEXT:    ret void
-//
-void f2() {
-  for (; a == b;)
-    ;
-}
-
-// CHECK: Function Attrs: noinline nounwind optnone
-// CHECK-LABEL: @_Z1Fv(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    br label [[FOR_COND:%.*]]
-// CHECK:       for.cond:
-// CHECK-NEXT:    br i1 true, label [[FOR_BODY:%.*]], label [[FOR_END:%.*]]
-// CHECK:       for.body:
-// CHECK-NEXT:    br label [[FOR_COND]]
-// CHECK:       for.end:
-// CHECK-NEXT:    br label [[FOR_COND1:%.*]]
-// CHECK:       for.cond1:
-// CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* @a, align 4
-// CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* @b, align 4
-// CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP0]], [[TMP1]]
-// CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY2:%.*]], label [[FOR_END3:%.*]]
-// CHECK:       for.body2:
-// CHECK-NEXT:    br label [[FOR_COND1]], [[LOOP4:!llvm.loop !.*]]
-// CHECK:       for.end3:
-// CHECK-NEXT:    ret void
-//
-void F() {
-  for (; 1;)
-    ;
-  for (; a == b;)
-    ;
-}
-
-// CHECK: Function Attrs: noinline nounwind optnone
-// CHECK-LABEL: @_Z2F2v(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    br label [[FOR_COND:%.*]]
-// CHECK:       for.cond:
-// CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* @a, align 4
-// CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* @b, align 4
-// CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP0]], [[TMP1]]
-// CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]]
-// CHECK:       for.body:
-// CHECK-NEXT:    br label [[FOR_COND]], [[LOOP5:!llvm.loop !.*]]
-// CHECK:       for.end:
-// CHECK-NEXT:    br label [[FOR_COND1:%.*]]
-// CHECK:       for.cond1:
-// CHECK-NEXT:    br i1 true, label [[FOR_BODY2:%.*]], label [[FOR_END3:%.*]]
-// CHECK:       for.body2:
-// CHECK-NEXT:    br label [[FOR_COND1]]
-// CHECK:       for.end3:
-// CHECK-NEXT:    ret void
-//
-void F2() {
-  for (; a == b;)
-    ;
-  for (; 1;)
-    ;
-}
-
-// CHECK: Function Attrs: noinline nounwind optnone
-// CHECK-LABEL: @_Z2w1v(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    br label [[WHILE_BODY:%.*]]
-// CHECK:       while.body:
-// CHECK-NEXT:    br label [[WHILE_BODY]]
-//
-void w1() {
-  while (1)
-    ;
-}
-
-// CHECK: Function Attrs: noinline nounwind optnone mustprogress
-// CHECK-LABEL: @_Z2w2v(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    br label [[WHILE_COND:%.*]]
-// CHECK:       while.cond:
-// CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* @a, align 4
-// CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* @b, align 4
-// CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP0]], [[TMP1]]
-// CHECK-NEXT:    br i1 [[CMP]], label [[WHILE_BODY:%.*]], label [[WHILE_END:%.*]]
-// CHECK:       while.body:
-// CHECK-NEXT:    br label [[WHILE_COND]], [[LOOP6:!llvm.loop !.*]]
-// CHECK:       while.end:
-// CHECK-NEXT:    ret void
-//
-void w2() {
-  while (a == b)
-    ;
-}
-
-// CHECK: Function Attrs: noinline nounwind optnone
-// CHECK-LABEL: @_Z1Wv(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    br label [[WHILE_COND:%.*]]
-// CHECK:       while.cond:
-// CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* @a, align 4
-// CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* @b, align 4
-// CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP0]], [[TMP1]]
-// CHECK-NEXT:    br i1 [[CMP]], label [[WHILE_BODY:%.*]], label [[WHILE_END:%.*]]
-// CHECK:       while.body:
-// CHECK-NEXT:    br label [[WHILE_COND]], [[LOOP7:!llvm.loop !.*]]
-// CHECK:       while.end:
-// CHECK-NEXT:    br label [[WHILE_BODY2:%.*]]
-// CHECK:       while.body2:
-// CHECK-NEXT:    br label [[WHILE_BODY2]]
-//
-void W() {
-  while (a == b)
-    ;
-  while (1)
-    ;
-}
-
-// CHECK: Function Attrs: noinline nounwind optnone
-// CHECK-LABEL: @_Z2W2v(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    br label [[WHILE_BODY:%.*]]
-// CHECK:       while.body:
-// CHECK-NEXT:    br label [[WHILE_BODY]]
-//
-void W2() {
-  while (1)
-    ;
-  while (a == b)
-    ;
-}
-
-// CHECK: Function Attrs: noinline nounwind optnone
-// CHECK-LABEL: @_Z2d1v(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    br label [[DO_BODY:%.*]]
-// CHECK:       do.body:
-// CHECK-NEXT:    br label [[DO_COND:%.*]]
-// CHECK:       do.cond:
-// CHECK-NEXT:    br i1 true, label [[DO_BODY]], label [[DO_END:%.*]]
-// CHECK:       do.end:
-// CHECK-NEXT:    ret void
-//
-void d1() {
-  do
-    ;
-  while (1);
-}
-
-// CHECK: Function Attrs: noinline nounwind optnone mustprogress
-// CHECK-LABEL: @_Z2d2v(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    br label [[DO_BODY:%.*]]
-// CHECK:       do.body:
-// CHECK-NEXT:    br label [[DO_COND:%.*]]
-// CHECK:       do.cond:
-// CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* @a, align 4
-// CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* @b, align 4
-// CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP0]], [[TMP1]]
-// CHECK-NEXT:    br i1 [[CMP]], label [[DO_BODY]], label [[DO_END:%.*]], [[LOOP8:!llvm.loop !.*]]
-// CHECK:       do.end:
-// CHECK-NEXT:    ret void
-//
-void d2() {
-  do
-    ;
-  while (a == b);
-}
-
-// CHECK: Function Attrs: noinline nounwind optnone
-// CHECK-LABEL: @_Z1Dv(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    br label [[DO_BODY:%.*]]
-// CHECK:       do.body:
-// CHECK-NEXT:    br label [[DO_COND:%.*]]
-// CHECK:       do.cond:
-// CHECK-NEXT:    br i1 true, label [[DO_BODY]], label [[DO_END:%.*]]
-// CHECK:       do.end:
-// CHECK-NEXT:    br label [[DO_BODY1:%.*]]
-// CHECK:       do.body1:
-// CHECK-NEXT:    br label [[DO_COND2:%.*]]
-// CHECK:       do.cond2:
-// CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* @a, align 4
-// CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* @b, align 4
-// CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP0]], [[TMP1]]
-// CHECK-NEXT:    br i1 [[CMP]], label [[DO_BODY1]], label [[DO_END3:%.*]], [[LOOP9:!llvm.loop !.*]]
-// CHECK:       do.end3:
-// CHECK-NEXT:    ret void
-//
-void D() {
-  do
-    ;
-  while (1);
-  do
-    ;
-  while (a == b);
-}
-
-// CHECK: Function Attrs: noinline nounwind optnone
-// CHECK-LABEL: @_Z2D2v(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    br label [[DO_BODY:%.*]]
-// CHECK:       do.body:
-// CHECK-NEXT:    br label [[DO_COND:%.*]]
-// CHECK:       do.cond:
-// CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* @a, align 4
-// CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* @b, align 4
-// CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP0]], [[TMP1]]
-// CHECK-NEXT:    br i1 [[CMP]], label [[DO_BODY]], label [[DO_END:%.*]], [[LOOP10:!llvm.loop !.*]]
-// CHECK:       do.end:
-// CHECK-NEXT:    br label [[DO_BODY1:%.*]]
-// CHECK:       do.body1:
-// CHECK-NEXT:    br label [[DO_COND2:%.*]]
-// CHECK:       do.cond2:
-// CHECK-NEXT:    br i1 true, label [[DO_BODY1]], label [[DO_END3:%.*]]
-// CHECK:       do.end3:
-// CHECK-NEXT:    ret void
-//
-void D2() {
-  do
-    ;
-  while (a == b);
-  do
-    ;
-  while (1);
-}
-
diff --git a/clang/test/CodeGen/attr-mustprogress.c b/clang/test/CodeGen/attr-mustprogress.c
new file mode 100644
index 000000000000..1f83cd44b308
--- /dev/null
+++ b/clang/test/CodeGen/attr-mustprogress.c
@@ -0,0 +1,221 @@
+// RUN: %clang_cc1 -std=c99 -triple=x86_64-unknown-linux-gnu -S -emit-llvm %s -o - | FileCheck --check-prefix=CHECK --check-prefix=C99 %s
+// RUN: %clang_cc1 -std=c11 -triple=x86_64-unknown-linux-gnu -S -emit-llvm %s -o - | FileCheck --check-prefix=CHECK --check-prefix=C11 %s
+// RUN: %clang_cc1 -std=c18 -triple=x86_64-unknown-linux-gnu -S -emit-llvm %s -o - | FileCheck --check-prefix=CHECK --check-prefix=C11 %s
+// RUN: %clang_cc1 -std=c2x -triple=x86_64-unknown-linux-gnu -S -emit-llvm %s -o - | FileCheck --check-prefix=CHECK --check-prefix=C11 %s
+//
+// RUN: %clang_cc1 -std=c11 -ffinite-loops -triple=x86_64-unknown-linux-gnu -S -emit-llvm %s -o - | FileCheck --check-prefix=CHECK --check-prefix=FINITE %s
+// RUN: %clang_cc1 -std=c11 -fno-finite-loops -triple=x86_64-unknown-linux-gnu -S -emit-llvm %s -o - | FileCheck --check-prefix=CHECK --check-prefix=C99 %s
+
+int a = 0;
+int b = 0;
+
+// CHECK: datalayout
+//
+// CHECK-NOT: mustprogress
+// CHECK-LABEL: @f0(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    br label %for.cond
+// CHECK:       for.cond:
+// CHECK-NOT:     br {{.*}}!llvm.loop
+//
+void f0() {
+  for (; ;) ;
+}
+
+// CHECK-NOT: mustprogress
+// CHECK-LABEL: @f1(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    br label %for.cond
+// CHECK:       for.cond:
+// CHECK-NEXT:    br i1 true, label %for.body, label %for.end
+// CHECK:       for.body:
+// CHECK-NOT:     br {{.*}}, !llvm.loop
+// CHECK:       for.end:
+// CHECK-NEXT:    ret void
+//
+void f1() {
+  for (; 1;) {
+  }
+}
+
+// CHECK-NOT: mustprogress
+// CHECK-LABEL: @f2(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    br label %for.cond
+// CHECK:       for.cond:
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* @a, align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* @b, align 4
+// CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP0]], [[TMP1]]
+// CHECK-NEXT:    br i1 [[CMP]], label %for.body, label %for.end
+// CHECK:       for.body:
+// C99-NOT:       br {{.*}} !llvm.loop
+// C11:           br label %for.cond, !llvm.loop [[LOOP1:!.*]]
+// FINITE:        br label %for.cond, !llvm.loop [[LOOP1:!.*]]
+// CHECK:       for.end:
+// CHECK-NEXT:    ret void
+//
+void f2() {
+  for (; a == b;) {
+  }
+}
+
+// CHECK-NOT: mustprogress
+// CHECK-LABEL: @F(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    br label %for.cond
+// CHECK:       for.cond:
+// CHECK-NEXT:    br i1 true, label %for.body, label %for.end
+// CHECK:       for.body:
+// CHECK-NOT:     br {{.*}}, !llvm.loop
+// CHECK:       for.end:
+// CHECK-NEXT:    br label %for.cond1
+// CHECK:       for.cond1:
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* @a, align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* @b, align 4
+// CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP0]], [[TMP1]]
+// CHECK-NEXT:    br i1 [[CMP]], label %for.body2, label %for.end3
+// CHECK:       for.body2:
+// C99-NOT:       br {{.*}}, !llvm.loop
+// C11:           br label %for.cond1, !llvm.loop [[LOOP2:!.*]]
+// FINITE:        br label %for.cond1, !llvm.loop [[LOOP2:!.*]]
+// CHECK:       for.end3:
+// CHECK-NEXT:    ret void
+//
+void F() {
+  for (; 1;) {
+  }
+  for (; a == b;) {
+  }
+}
+
+// CHECK-NOT: mustprogress
+// CHECK-LABEL: @w1(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    br label %while.body
+// CHECK:       while.body:
+// CHECK-NOT:     br {{.*}}, !llvm.loop
+//
+void w1() {
+  while (1) {
+  }
+}
+
+// CHECK-NOT: mustprogress
+// CHECK-LABEL: @w2(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    br label %while.cond
+// CHECK:       while.cond:
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* @a, align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* @b, align 4
+// CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP0]], [[TMP1]]
+// CHECK-NEXT:    br i1 [[CMP]], label %while.body, label %while.end
+// CHECK:       while.body:
+// C99-NOT:       br {{.*}}, !llvm.loop
+// C11:           br label %while.cond, !llvm.loop [[LOOP3:!.*]]
+// FINITE:        br label %while.cond, !llvm.loop [[LOOP3:!.*]]
+// CHECK:       while.end:
+// CHECK-NEXT:    ret void
+//
+void w2() {
+  while (a == b) {
+  }
+}
+
+// CHECK-NOT: mustprogress
+// CHECK-LABEL: @W(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    br label [[WHILE_COND:%.*]]
+// CHECK:       while.cond:
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* @a, align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* @b, align 4
+// CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP0]], [[TMP1]]
+// CHECK-NEXT:    br i1 [[CMP]], label %while.body, label %while.end
+// CHECK:       while.body:
+// C99-NOT:       br {{.*}} !llvm.loop
+// C11:           br label %while.cond, !llvm.loop [[LOOP4:!.*]]
+// FINITE:        br label %while.cond, !llvm.loop [[LOOP4:!.*]]
+// CHECK:       while.end:
+// CHECK-NEXT:    br label %while.body2
+// CHECK:       while.body2:
+// CHECK-NOT:     br {{.*}} !llvm.loop
+//
+void W() {
+  while (a == b) {
+  }
+  while (1) {
+  }
+}
+
+// CHECK-NOT: mustprogress
+// CHECK-LABEL: @d1(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    br label %do.body
+// CHECK:       do.body:
+// CHECK-NEXT:    br label %do.cond
+// CHECK:       do.cond:
+// CHECK-NOT:     br {{.*}}, !llvm.loop
+// CHECK:       do.end:
+// CHECK-NEXT:    ret void
+//
+void d1() {
+  do {
+  } while (1);
+}
+
+// CHECK-NOT: mustprogress
+// CHECK-LABEL: @d2(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    br label %do.body
+// CHECK:       do.body:
+// CHECK-NEXT:    br label %do.cond
+// CHECK:       do.cond:
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* @a, align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* @b, align 4
+// CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP0]], [[TMP1]]
+// C99-NOT:       br {{.*}}, !llvm.loop
+// C11:           br i1 [[CMP]], label %do.body, label %do.end, !llvm.loop [[LOOP5:!.*]]
+// FINITE:        br i1 [[CMP]], label %do.body, label %do.end, !llvm.loop [[LOOP5:!.*]]
+// CHECK:       do.end:
+// CHECK-NEXT:    ret void
+//
+void d2() {
+  do {
+  } while (a == b);
+}
+
+// CHECK-NOT: mustprogress
+// CHECK-LABEL: @D(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    br label %do.body
+// CHECK:       do.body:
+// CHECK-NEXT:    br label %do.cond
+// CHECK:       do.cond:
+// CHECK-NOT:     br label {{.*}}, !llvm.loop
+// CHECK:       do.end:
+// CHECK-NEXT:    br label %do.body1
+// CHECK:       do.body1:
+// CHECK-NEXT:    br label %do.cond2
+// CHECK:       do.cond2:
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* @a, align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* @b, align 4
+// CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP0]], [[TMP1]]
+// C99-NOT:       br {{.*}}, !llvm.loop
+// C11:           br i1 [[CMP]], label %do.body1, label %do.end3, !llvm.loop [[LOOP6:!.*]]
+// FINITE:        br i1 [[CMP]], label %do.body1, label %do.end3, !llvm.loop [[LOOP6:!.*]]
+// CHECK:       do.end3:
+// CHECK-NEXT:    ret void
+//
+void D() {
+  do {
+  } while (1);
+  do {
+  } while (a == b);
+}
+
+// C11: [[LOOP1]] = distinct !{[[LOOP1]], [[MP:!.*]]}
+// C11: [[MP]] = !{!"llvm.loop.mustprogress"}
+// C11: [[LOOP2]] = distinct !{[[LOOP2]], [[MP]]}
+// C11: [[LOOP3]] = distinct !{[[LOOP3]], [[MP]]}
+// C11: [[LOOP4]] = distinct !{[[LOOP4]], [[MP]]}
+// C11: [[LOOP5]] = distinct !{[[LOOP5]], [[MP]]}
+// C11: [[LOOP6]] = distinct !{[[LOOP6]], [[MP]]}
diff --git a/clang/test/CodeGenCXX/attr-mustprogress.cpp b/clang/test/CodeGenCXX/attr-mustprogress.cpp
new file mode 100644
index 000000000000..48ac7ad938ba
--- /dev/null
+++ b/clang/test/CodeGenCXX/attr-mustprogress.cpp
@@ -0,0 +1,330 @@
+// RUN: %clang_cc1 -std=c++98 -triple=x86_64-unknown-linux-gnu -S -emit-llvm %s -o - | FileCheck --check-prefix=CHECK --check-prefix=CXX98 %s
+// RUN: %clang_cc1 -std=c++11 -triple=x86_64-unknown-linux-gnu -S -emit-llvm %s -o - | FileCheck --check-prefix=CHECK --check-prefix=CXX11 %s
+// RUN: %clang_cc1 -std=c++14 -triple=x86_64-unknown-linux-gnu -S -emit-llvm %s -o - | FileCheck --check-prefix=CHECK --check-prefix=CXX11 %s
+// RUN: %clang_cc1 -std=c++17 -triple=x86_64-unknown-linux-gnu -S -emit-llvm %s -o - | FileCheck --check-prefix=CHECK --check-prefix=CXX11 %s
+// RUN: %clang_cc1 -std=c++20 -triple=x86_64-unknown-linux-gnu -S -emit-llvm %s -o - | FileCheck --check-prefix=CHECK --check-prefix=CXX11 %s
+
+// Make sure -ffinite-loops overrides -std=c++98 for loops.
+// RUN: %clang_cc1 -std=c++98 -ffinite-loops -triple=x86_64-unknown-linux-gnu -S -emit-llvm %s -o - | FileCheck --check-prefix=CHECK --check-prefix=FINITE %s
+
+// Make sure -fno-finite-loops overrides -std=c++11
+// RUN: %clang_cc1 -std=c++11 -fno-finite-loops -triple=x86_64-unknown-linux-gnu -S -emit-llvm %s -o - | FileCheck --check-prefix=CHECK --check-prefix=CXX98 %s
+
+int a = 0;
+int b = 0;
+
+// CHECK: datalayout
+
+// CXX98-NOT: mustprogress
+// CXX11-NOT: mustprogress
+// FINITE-NOT:     mustprogress
+// CHECK-LABEL: @_Z2f0v(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    br label %for.cond
+// CHECK:       for.cond:
+// CHECK-NOT:    br {{.*}} llvm.loop
+void f0() {
+  for (; ;) ;
+}
+
+// CXX98-NOT: mustprogress
+// CXX11-NOT: mustprogress
+// FINITE-NOT: mustprogress
+// CHECK-LABEL: @_Z2f1v(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    br label %for.cond
+// CHECK:       for.cond:
+// CHECK-NEXT:    br i1 true, label %for.body, label %for.end
+// CHECK:       for.body:
+// CHECK-NOT:    br {{.*}}, !llvm.loop
+// CHECK:       for.end:
+// CHECK-NEXT:    ret void
+//
+void f1() {
+  for (; 1;)
+    ;
+}
+
+// CXX98-NOT: mustprogress
+// CXX11:     mustprogress
+// FINITE-NOT: mustprogress
+// CHECK-LABEL: @_Z2f2v(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    br label %for.cond
+// CHECK:       for.cond:
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* @a, align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* @b, align 4
+// CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP0]], [[TMP1]]
+// CHECK-NEXT:    br i1 [[CMP]], label %for.body, label %for.end
+// CHECK:       for.body:
+// CXX98-NOT:    br {{.*}}, !llvm.loop
+// CXX11:        br label %for.cond, !llvm.loop [[LOOP1:!.*]]
+// FINITE-NEXT:   br label %for.cond, !llvm.loop [[LOOP1:!.*]]
+// CHECK:       for.end:
+// CHECK-NEXT:    ret void
+//
+void f2() {
+  for (; a == b;)
+    ;
+}
+
+// CXX98-NOT: mustprogress
+// CXX11-NOT: mustprogress
+// FINITE-NOT: mustprogress
+// CHECK-LABEL: @_Z1Fv(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    br label %for.cond
+// CHECK:       for.cond:
+// CHECK-NEXT:    br i1 true, label %for.body, label %for.end
+// CHECK:       for.body:
+// CHECK-NOT:     br {{.*}}, !llvm.loop
+// CHECK:       for.end:
+// CHECK-NEXT:    br label %for.cond1
+// CHECK:       for.cond1:
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* @a, align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* @b, align 4
+// CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP0]], [[TMP1]]
+// CHECK-NEXT:    br i1 [[CMP]], label %for.body2, label %for.end3
+// CHECK:       for.body2:
+// CXX98-NOT:     br {{.*}}, !llvm.loop
+// CXX11-NEXT:    br label %for.cond1, !llvm.loop [[LOOP2:!.*]]
+// FINITE-NEXT:   br label %for.cond1, !llvm.loop [[LOOP2:!.*]]
+// CHECK:       for.end3:
+// CHECK-NEXT:    ret void
+//
+void F() {
+  for (; 1;)
+    ;
+  for (; a == b;)
+    ;
+}
+
+// CXX98-NOT: mustprogress
+// CXX11-NOT: mustprogress
+// FINITE-NOT: mustprogress
+// CHECK-LABEL: @_Z2F2v(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    br label %for.cond
+// CHECK:       for.cond:
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* @a, align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* @b, align 4
+// CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP0]], [[TMP1]]
+// CHECK-NEXT:    br i1 [[CMP]], label %for.body, label %for.end
+// CHECK:       for.body:
+// CXX98_NOT:     br {{.*}} !llvm.loop
+// CXX11-NEXT:    br label %for.cond, !llvm.loop [[LOOP3:!.*]]
+// FINITE-NEXT:    br label %for.cond, !llvm.loop [[LOOP3:!.*]]
+// CHECK:       for.end:
+// CHECK-NEXT:    br label %for.cond1
+// CHECK:       for.cond1:
+// CHECK-NEXT:    br i1 true, label %for.body2, label %for.end3
+// CHECK:       for.body2:
+// CHECK-NOT:     br {{.*}}, !llvm.loop
+// CHECK:       for.end3:
+// CHECK-NEXT:    ret void
+//
+void F2() {
+  for (; a == b;)
+    ;
+  for (; 1;)
+    ;
+}
+
+// CXX98-NOT: mustprogress
+// CXX11-NOT: mustprogress
+// FINITE-NOT:     mustprogress
+// CHECK-LABEL: @_Z2w1v(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    br label %while.body
+// CHECK:       while.body:
+// CHECK-NOT:     br {{.*}}, !llvm.loop
+//
+void w1() {
+  while (1)
+    ;
+}
+
+// CXX98-NOT: mustprogress
+// CXX11:     mustprogress
+// FINITE-NOT: mustprogress
+// CHECK-LABEL: @_Z2w2v(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    br label %while.cond
+// CHECK:       while.cond:
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* @a, align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* @b, align 4
+// CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP0]], [[TMP1]]
+// CHECK-NEXT:    br i1 [[CMP]], label %while.body, label %while.end
+// CHECK:       while.body:
+// CXX98-NOT:     br {{.*}}, !llvm.loop
+// CXX11-NEXT:    br label %while.cond, !llvm.loop [[LOOP4:!.*]]
+// FINITE-NEXT:   br label %while.cond, !llvm.loop [[LOOP4:!.*]]
+// CHECK:       while.end:
+// CHECK-NEXT:    ret void
+//
+void w2() {
+  while (a == b)
+    ;
+}
+
+// CXX98-NOT: mustprogress
+// CXX11-NOT: mustprogress
+// FINITE-NOT: mustprogress
+// CHECK-LABEL: @_Z1Wv(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    br label %while.cond
+// CHECK:       while.cond:
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* @a, align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* @b, align 4
+// CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP0]], [[TMP1]]
+// CHECK-NEXT:    br i1 [[CMP]], label %while.body, label %while.end
+// CHECK:       while.body:
+// CXX98-NOT:     br {{.*}}, !llvm.loop
+// CXX11-NEXT:    br label %while.cond, !llvm.loop [[LOOP5:!.*]]
+// FINITE-NEXT:   br label %while.cond, !llvm.loop [[LOOP5:!.*]]
+// CHECK:       while.end:
+// CHECK-NEXT:    br label %while.body2
+// CHECK:       while.body2:
+// CHECK-NOT:     br {{.*}}, !llvm.loop
+//
+void W() {
+  while (a == b)
+    ;
+  while (1)
+    ;
+}
+
+// CXX98-NOT: mustprogress
+// CXX11-NOT: mustprogress
+// FINITE-NOT: mustprogress
+// CHECK-LABEL: @_Z2W2v(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    br label %while.body
+// CHECK:       while.body:
+// CHECK-NOT:     br {{.*}}, !llvm.loop
+//
+void W2() {
+  while (1)
+    ;
+  while (a == b)
+    ;
+}
+
+// CXX98-NOT: mustprogress
+// CXX11-NOT: mustprogress
+// FINITE-NOT: mustprogress
+// CHECK-LABEL: @_Z2d1v(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    br label %do.body
+// CHECK:       do.body:
+// CHECK-NEXT:    br label %do.cond
+// CHECK:       do.cond:
+// CHECK-NOT:     br {{.*}}, !llvm.loop
+// CHECK:       do.end:
+// CHECK-NEXT:    ret void
+//
+void d1() {
+  do
+    ;
+  while (1);
+}
+
+// CXX98-NOT: mustprogress
+// CXX11:     mustprogress
+// FINITE-NOT:  mustprogress
+// CHECK-LABEL: @_Z2d2v(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    br label %do.body
+// CHECK:       do.body:
+// CHECK-NEXT:    br label %do.cond
+// CHECK:       do.cond:
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* @a, align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* @b, align 4
+// CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP0]], [[TMP1]]
+// CXX98-NOT:     br {{.*}}, !llvm.loop
+// CXX11-NEXT:    br i1 [[CMP]], label %do.body, label %do.end, !llvm.loop [[LOOP6:!.*]]
+// FINITE-NEXT:   br i1 [[CMP]], label %do.body, label %do.end, !llvm.loop [[LOOP6:!.*]]
+// CHECK:       do.end:
+// CHECK-NEXT:    ret void
+//
+void d2() {
+  do
+    ;
+  while (a == b);
+}
+
+// CXX98-NOT: mustprogress
+// CXX11-NOT: mustprogress
+// FINITE-NOT:     mustprogress
+// CHECK-LABEL: @_Z1Dv(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    br label %do.body
+// CHECK:       do.body:
+// CHECK-NEXT:    br label %do.cond
+// CHECK:       do.cond:
+// CHECK-NOT:     br {{.*}}, !llvm.loop
+// CHECK:       do.end:
+// CHECK-NEXT:    br label %do.body1
+// CHECK:       do.body1:
+// CHECK-NEXT:    br label %do.cond2
+// CHECK:       do.cond2:
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* @a, align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* @b, align 4
+// CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP0]], [[TMP1]]
+// CXX98-NOT:     br {{.*}}, !llvm.loop
+// CXX11-NEXT:    br i1 [[CMP]], label %do.body1, label %do.end3, !llvm.loop [[LOOP7:!.*]]
+// FINITE-NEXT:   br i1 [[CMP]], label %do.body1, label %do.end3, !llvm.loop [[LOOP7:!.*]]
+// CHECK:       do.end3:
+// CHECK-NEXT:    ret void
+//
+void D() {
+  do
+    ;
+  while (1);
+  do
+    ;
+  while (a == b);
+}
+
+// CXX98-NOT: mustprogress
+// CXX11-NOT: mustprogress
+// FINITE-NOT:     mustprogress
+// CHECK-LABEL: @_Z2D2v(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    br label %do.body
+// CHECK:       do.body:
+// CHECK-NEXT:    br label %do.cond
+// CHECK:       do.cond:
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* @a, align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* @b, align 4
+// CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP0]], [[TMP1]]
+// CXX98-NOT:     br {{.*}}, !llvm.loop
+// CXX11-NEXT:    br i1 [[CMP]], label %do.body, label %do.end, !llvm.loop [[LOOP8:!.*]]
+// FINITE-NEXT:   br i1 [[CMP]], label %do.body, label %do.end, !llvm.loop [[LOOP8:!.*]]
+// CHECK:       do.end:
+// CHECK-NEXT:    br label %do.body1
+// CHECK:       do.body1:
+// CHECK-NEXT:    br label %do.cond2
+// CHECK:       do.cond2:
+// CHECK-NOT:     br {{.*}}, !llvm.loop
+// CHECK:       do.end3:
+// CHECK-NEXT:    ret void
+//
+void D2() {
+  do
+    ;
+  while (a == b);
+  do
+    ;
+  while (1);
+}
+
+// CXX11: [[LOOP1]] = distinct !{[[LOOP1]], [[MP:!.*]]}
+// CXX11: [[MP]] = !{!"llvm.loop.mustprogress"}
+// CXX11: [[LOOP2]] = distinct !{[[LOOP2]], [[MP]]}
+// CXX11: [[LOOP3]] = distinct !{[[LOOP3]], [[MP]]}
+// CXX11: [[LOOP4]] = distinct !{[[LOOP4]], [[MP]]}
+// CXX11: [[LOOP5]] = distinct !{[[LOOP5]], [[MP]]}
+// CXX11: [[LOOP6]] = distinct !{[[LOOP6]], [[MP]]}
+// CXX11: [[LOOP7]] = distinct !{[[LOOP7]], [[MP]]}
+// CXX11: [[LOOP8]] = distinct !{[[LOOP8]], [[MP]]}

From bdafd284b291436d3fa4644f585efe3b06363554 Mon Sep 17 00:00:00 2001
From: "William S. Moses" <gh@wsmoses.com>
Date: Wed, 17 Feb 2021 13:54:42 -0500
Subject: [PATCH 151/318] [SROA] Amend failing test from D95826

(cherry picked from commit 892d2822b62ebcaa7aa0b006b5ea4f26593c1618)
---
 llvm/test/Transforms/SROA/tbaa-struct2.ll | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/llvm/test/Transforms/SROA/tbaa-struct2.ll b/llvm/test/Transforms/SROA/tbaa-struct2.ll
index 75f72f4e9963..13075dd84326 100644
--- a/llvm/test/Transforms/SROA/tbaa-struct2.ll
+++ b/llvm/test/Transforms/SROA/tbaa-struct2.ll
@@ -35,8 +35,8 @@ define double @bar(%struct.Wishart* %wishart) {
 ; CHECK-NEXT:   %tmp.sroa.2.0.copyload = load i32, i32* %tmp.sroa.2.0.waddr.sroa_idx1, align 8, !tbaa.struct !7
 ; CHECK-NEXT:   %tmp.sroa.3.0.waddr.sroa_raw_cast = bitcast %struct.Wishart* %wishart to i8*
 ; CHECK-NEXT:   %tmp.sroa.3.0.waddr.sroa_raw_idx = getelementptr inbounds i8, i8* %tmp.sroa.3.0.waddr.sroa_raw_cast, i64 12
-; CHECK-NEXT:   %tmp.sroa.3.0.tmpaddr.sroa_idx = getelementptr inbounds [4 x i8], [4 x i8]* %tmp.sroa.3, i64 0, i64 0
-; CHECK-NEXT:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %tmp.sroa.3.0.tmpaddr.sroa_idx, i8* align 4 %tmp.sroa.3.0.waddr.sroa_raw_idx, i64 4, i1 false), !tbaa.struct !8
+; CHECK-NEXT:   %[[sroa_idx:.+]] = getelementptr inbounds [4 x i8], [4 x i8]* %tmp.sroa.3, i64 0, i64 0
+; CHECK-NEXT:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %[[sroa_idx]], i8* align 4 %tmp.sroa.3.0.waddr.sroa_raw_idx, i64 4, i1 false), !tbaa.struct !8
 ; CHECK-NEXT:   %call = call double @subcall(double %tmp.sroa.0.0.copyload, i32 %tmp.sroa.2.0.copyload)
 ; CHECK-NEXT:   ret double %call
 ; CHECK-NEXT: }
@@ -48,4 +48,4 @@ define double @bar(%struct.Wishart* %wishart) {
 ; CHECK: !5 = !{!6, !6, i64 0}
 ; CHECK: !6 = !{!"int", !{{[0-9]+}}, i64 0}
 ; CHECK: !7 = !{i64 0, i64 4, !5}
-; CHECK: !8 = !{}
\ No newline at end of file
+; CHECK: !8 = !{}

From ee7eaf860cde24a91a5b17390b8ad5eddd05f7f9 Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Thu, 4 Feb 2021 09:07:44 -0800
Subject: [PATCH 152/318] [llvm-objdump] --source: drop the warning when there
 is no debug info

Warnings have been added for three cases (PR41905): (1) missing debug info, (2)
the source file cannot be found, (3) the debug info points at a line beyond the
end of the file.

(1) is probably less useful. This was brought up once on
http://lists.llvm.org/pipermail/llvm-dev/2020-April/141264.html and two
internal users mentioned it to me that it was annoying. (I personally
find the warning confusing, too.)

Users specify --source to get additional information if sources happen to be
available.  If sources are not available, it should be obvious as the output
will have no interleaved source lines. The warning can be especially annoying
when using llvm-objdump -S on a bunch of files.

This patch drops the warning when there is no debug info.
(If LLVMSymbolizer::symbolizeCode returns an `Error`, there will still be
an error. There is currently no test for an `Error` return value.
The only code path is probably a broken symbol table, but we probably already emit a warning
in that case)

`source-interleave-prefix.test` has an inappropriate "malformed" test - the test simply has no
.debug_* because new llc does not produce debug info when the filename is empty (invalid).
I have tried tampering the header of .debug_info/.debug_line but llvm-symbolizer does not warn.
This patch does not intend to add the missing test coverage.

Differential Revision: https://reviews.llvm.org/D88715

(cherry picked from commit eecbb1c77655d38c06e47cf32e2dcc72da45c517)
---
 .../X86/source-interleave-no-debug-info.test  |  6 ++--
 .../X86/source-interleave-prefix.test         |  9 ------
 llvm/tools/llvm-objdump/llvm-objdump.cpp      | 30 ++++++++-----------
 3 files changed, 14 insertions(+), 31 deletions(-)

diff --git a/llvm/test/tools/llvm-objdump/X86/source-interleave-no-debug-info.test b/llvm/test/tools/llvm-objdump/X86/source-interleave-no-debug-info.test
index 25deaa00243c..89b03d429bfa 100644
--- a/llvm/test/tools/llvm-objdump/X86/source-interleave-no-debug-info.test
+++ b/llvm/test/tools/llvm-objdump/X86/source-interleave-no-debug-info.test
@@ -1,15 +1,13 @@
 ## Test that if an object has no debug information, only the disassembly is
-## printed when --source is specified, and that we emit a warning.
+## printed when --source is specified, and that we do not emit a warning.
 
 # RUN: sed -e "s,SRC_COMPDIR,%/p/Inputs,g" %p/Inputs/source-interleave.ll > %t.ll
 # RUN: llc -o %t.o -filetype=obj -mtriple=x86_64-pc-linux %t.ll
 # RUN: llvm-objcopy --strip-debug %t.o %t2.o
 
 # RUN: llvm-objdump --source %t.o | FileCheck %s --check-prefixes=CHECK,SOURCE
-# RUN: llvm-objdump --source %t2.o 2> %t2.e | FileCheck %s --check-prefixes=CHECK --implicit-check-not='main()'
-# RUN: FileCheck %s --input-file %t2.e --check-prefixes=WARN
+# RUN: llvm-objdump --source %t2.o 2>&1 | FileCheck %s --check-prefixes=CHECK --implicit-check-not='main()' --implicit-check-not=warning:
 
-# WARN:        warning: '{{.*}}2.o': failed to parse debug information
 # CHECK:       0000000000000010 <main>:
 # SOURCE-NEXT: ; int main() {
 # CHECK-NEXT:   10:   55                      pushq   %rbp
diff --git a/llvm/test/tools/llvm-objdump/X86/source-interleave-prefix.test b/llvm/test/tools/llvm-objdump/X86/source-interleave-prefix.test
index b384c49b350e..23ce55a329ac 100644
--- a/llvm/test/tools/llvm-objdump/X86/source-interleave-prefix.test
+++ b/llvm/test/tools/llvm-objdump/X86/source-interleave-prefix.test
@@ -24,15 +24,6 @@
 ; RUN: llvm-objdump --prefix myprefix --source %t-correct-prefix.o 2>&1 | \
 ; RUN:   FileCheck %s --check-prefix=CHECK-BROKEN-PREFIX -DFILE=%t-correct-prefix.o -DPREFIX=myprefix%/p
 
-;; Test malformed input.
-
-; RUN: sed -e "s,SRC_COMPDIR,,g" -e "s,filename: \"source-interleave-x86_64.c\",filename: \"\",g" \
-; RUN:   %p/Inputs/source-interleave.ll > %t-malformed.ll
-; RUN: llc -o %t-malformed.o -filetype=obj -mtriple=x86_64-pc-linux %t-malformed.ll
-; RUN: llvm-objdump --prefix myprefix --source %t-malformed.o 2>&1 | \
-; RUN:   FileCheck %s --check-prefix=CHECK-MALFORMED -DFILE=%t-malformed.o
-; CHECK-MALFORMED: warning: '[[FILE]]': failed to parse debug information for [[FILE]]
-
 ;; Using only a prefix separator is the same as not using the `--prefix` option.
 
 ; RUN: llvm-objdump --prefix / --source %t-missing-prefix.o 2>&1 | \
diff --git a/llvm/tools/llvm-objdump/llvm-objdump.cpp b/llvm/tools/llvm-objdump/llvm-objdump.cpp
index 3134f989603a..17128e95727f 100644
--- a/llvm/tools/llvm-objdump/llvm-objdump.cpp
+++ b/llvm/tools/llvm-objdump/llvm-objdump.cpp
@@ -947,8 +947,8 @@ class SourcePrinter {
   std::unordered_map<std::string, std::vector<StringRef>> LineCache;
   // Keep track of missing sources.
   StringSet<> MissingSources;
-  // Only emit 'no debug info' warning once.
-  bool WarnedNoDebugInfo;
+  // Only emit 'invalid debug info' warning once.
+  bool WarnedInvalidDebugInfo = false;
 
 private:
   bool cacheSource(const DILineInfo& LineInfoFile);
@@ -962,8 +962,7 @@ class SourcePrinter {
 
 public:
   SourcePrinter() = default;
-  SourcePrinter(const ObjectFile *Obj, StringRef DefaultArch)
-      : Obj(Obj), WarnedNoDebugInfo(false) {
+  SourcePrinter(const ObjectFile *Obj, StringRef DefaultArch) : Obj(Obj) {
     symbolize::LLVMSymbolizer::Options SymbolizerOpts;
     SymbolizerOpts.PrintFunctions =
         DILineInfoSpecifier::FunctionNameKind::LinkageName;
@@ -1018,22 +1017,17 @@ void SourcePrinter::printSourceLine(formatted_raw_ostream &OS,
     return;
 
   DILineInfo LineInfo = DILineInfo();
-  auto ExpectedLineInfo = Symbolizer->symbolizeCode(*Obj, Address);
+  Expected<DILineInfo> ExpectedLineInfo =
+      Symbolizer->symbolizeCode(*Obj, Address);
   std::string ErrorMessage;
-  if (!ExpectedLineInfo)
-    ErrorMessage = toString(ExpectedLineInfo.takeError());
-  else
+  if (ExpectedLineInfo) {
     LineInfo = *ExpectedLineInfo;
-
-  if (LineInfo.FileName == DILineInfo::BadString) {
-    if (!WarnedNoDebugInfo) {
-      std::string Warning =
-          "failed to parse debug information for " + ObjectFilename.str();
-      if (!ErrorMessage.empty())
-        Warning += ": " + ErrorMessage;
-      reportWarning(Warning, ObjectFilename);
-      WarnedNoDebugInfo = true;
-    }
+  } else if (!WarnedInvalidDebugInfo) {
+    WarnedInvalidDebugInfo = true;
+    // TODO Untested.
+    reportWarning("failed to parse debug information: " +
+                      toString(ExpectedLineInfo.takeError()),
+                  ObjectFilename);
   }
 
   if (!Prefix.empty() && sys::path::is_absolute_gnu(LineInfo.FileName)) {

From 76d5d54f62599d249e0bf2d1b0998451a584c3f3 Mon Sep 17 00:00:00 2001
From: Johannes Doerfert <johannes@jdoerfert.de>
Date: Sun, 14 Feb 2021 12:25:56 -0600
Subject: [PATCH 153/318] Avoid use of stack allocations in asynchronous calls

NOTE: This is an adaption of the original patch to be applicable to the
      LLVM 12 release branch. Logic is the same though.

As reported by Guilherme Valarini [0], we used to pass stack allocations
to calls that can nowadays be asynchronous. This is arguably a problem
and it will inevitably result in UB. To remedy the situation we allocate
the locations as part of the AsyncInfoTy object. The lifetime of that
object matches what we need for now. If the synchronization is not tied
to the AsyncInfoTy object anymore we might need to have a different
buffer construct in global space.

This should be back-ported to LLVM 12 but needs slight modifications as
it is based on refactoring patches we do not need to backport.

[0] https://lists.llvm.org/pipermail/openmp-dev/2021-February/003867.html

Differential Revision: https://reviews.llvm.org/D96667
---
 openmp/libomptarget/include/omptarget.h | 10 ++++++++++
 openmp/libomptarget/src/omptarget.cpp   | 15 ++++++++++++---
 2 files changed, 22 insertions(+), 3 deletions(-)

diff --git a/openmp/libomptarget/include/omptarget.h b/openmp/libomptarget/include/omptarget.h
index 9c533944d135..46bb8206efa1 100644
--- a/openmp/libomptarget/include/omptarget.h
+++ b/openmp/libomptarget/include/omptarget.h
@@ -14,6 +14,8 @@
 #ifndef _OMPTARGET_H_
 #define _OMPTARGET_H_
 
+#include <deque>
+#include <stddef.h>
 #include <stdint.h>
 #include <stddef.h>
 
@@ -119,10 +121,18 @@ struct __tgt_target_table {
 /// This struct contains information exchanged between different asynchronous
 /// operations for device-dependent optimization and potential synchronization
 struct __tgt_async_info {
+  /// Locations we used in (potentially) asynchronous calls which should live
+  /// as long as this AsyncInfoTy object.
+  std::deque<void *> BufferLocations;
+
   // A pointer to a queue-like structure where offloading operations are issued.
   // We assume to use this structure to do synchronization. In CUDA backend, it
   // is CUstream.
   void *Queue = nullptr;
+
+  /// Return a void* reference with a lifetime that is at least as long as this
+  /// AsyncInfoTy object. The location can be used as intermediate buffer.
+  void *&getVoidPtrLocation();
 };
 
 /// This struct is a record of non-contiguous information
diff --git a/openmp/libomptarget/src/omptarget.cpp b/openmp/libomptarget/src/omptarget.cpp
index e4b7b18bc70b..37150aae2fe6 100644
--- a/openmp/libomptarget/src/omptarget.cpp
+++ b/openmp/libomptarget/src/omptarget.cpp
@@ -18,6 +18,13 @@
 #include <cassert>
 #include <vector>
 
+/// Return a void* reference with a lifetime that is at least as long as this
+/// AsyncInfoTy object. The location can be used as intermediate buffer.
+void *&__tgt_async_info::getVoidPtrLocation() {
+  BufferLocations.push_back(nullptr);
+  return BufferLocations.back();
+}
+
 /* All begin addresses for partially mapped structs must be 8-aligned in order
  * to ensure proper alignment of members. E.g.
  *
@@ -415,7 +422,8 @@ int targetDataBegin(ident_t *loc, DeviceTy &Device, int32_t arg_num,
       DP("Update pointer (" DPxMOD ") -> [" DPxMOD "]\n",
          DPxPTR(PointerTgtPtrBegin), DPxPTR(TgtPtrBegin));
       uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase;
-      void *TgtPtrBase = (void *)((uint64_t)TgtPtrBegin - Delta);
+      void *&TgtPtrBase = async_info_ptr->getVoidPtrLocation();
+      TgtPtrBase = (void *)((uint64_t)TgtPtrBegin - Delta);
       int rt = Device.submitData(PointerTgtPtrBegin, &TgtPtrBase,
                                  sizeof(void *), async_info_ptr);
       if (rt != OFFLOAD_SUCCESS) {
@@ -1122,8 +1130,9 @@ static int processDataBefore(ident_t *loc, int64_t DeviceId, void *HostPtr,
         DP("Parent lambda base " DPxMOD "\n", DPxPTR(TgtPtrBase));
         uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase;
         void *TgtPtrBegin = (void *)((uintptr_t)TgtPtrBase + Delta);
-        void *PointerTgtPtrBegin = Device.getTgtPtrBegin(
-            HstPtrVal, ArgSizes[I], IsLast, false, IsHostPtr);
+        void *&PointerTgtPtrBegin = AsyncInfo->getVoidPtrLocation();
+        PointerTgtPtrBegin = Device.getTgtPtrBegin(HstPtrVal, ArgSizes[I],
+                                                   IsLast, false, IsHostPtr);
         if (!PointerTgtPtrBegin) {
           DP("No lambda captured variable mapped (" DPxMOD ") - ignored\n",
              DPxPTR(HstPtrVal));

From a3545a0b0777da773c5e2370622579c44a8f0f63 Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Fri, 19 Feb 2021 09:06:05 -0500
Subject: [PATCH 154/318] [Analysis][LoopVectorize] do not form reductions of
 pointers

This is a fix for https://llvm.org/PR49215 either before/after
we make a verifier enhancement for vector reductions with D96904.

I'm not sure what the current thinking is for pointer math/logic
in IR. We allow icmp on pointer values. Therefore, we match min/max
patterns, so without this patch, the vectorizer could form a vector
reduction from that sequence.

But the LangRef definitions for min/max and vector reduction
intrinsics do not allow pointer types:
https://llvm.org/docs/LangRef.html#llvm-smax-intrinsic
https://llvm.org/docs/LangRef.html#llvm-vector-reduce-umax-intrinsic

So we would crash/assert at some point - either in IR verification,
in the cost model, or in codegen. If we do want to allow this kind
of transform, we will need to update the LangRef and all of those
parts of the compiler.

Differential Revision: https://reviews.llvm.org/D97047

(cherry picked from commit 5b250a27ec7822aa0a32abb696cb16c2cc60149c)
---
 llvm/lib/Analysis/IVDescriptors.cpp           |  5 ++-
 .../Transforms/LoopVectorize/reduction-ptr.ll | 40 +++++++++++++++++++
 2 files changed, 44 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/Transforms/LoopVectorize/reduction-ptr.ll

diff --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp
index 7f311d8f9a2b..94a24ccf2155 100644
--- a/llvm/lib/Analysis/IVDescriptors.cpp
+++ b/llvm/lib/Analysis/IVDescriptors.cpp
@@ -243,11 +243,14 @@ bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurKind Kind,
   if (RecurrenceType->isFloatingPointTy()) {
     if (!isFloatingPointRecurrenceKind(Kind))
       return false;
-  } else {
+  } else if (RecurrenceType->isIntegerTy()) {
     if (!isIntegerRecurrenceKind(Kind))
       return false;
     if (isArithmeticRecurrenceKind(Kind))
       Start = lookThroughAnd(Phi, RecurrenceType, VisitedInsts, CastInsts);
+  } else {
+    // Pointer min/max may exist, but it is not supported as a reduction op.
+    return false;
   }
 
   Worklist.push_back(Start);
diff --git a/llvm/test/Transforms/LoopVectorize/reduction-ptr.ll b/llvm/test/Transforms/LoopVectorize/reduction-ptr.ll
new file mode 100644
index 000000000000..5cae61638f31
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/reduction-ptr.ll
@@ -0,0 +1,40 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s  -loop-vectorize -force-vector-width=4 -S | FileCheck %s
+
+target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+
+; Reductions of pointer types are not supported.
+
+define void @PR49215(i32* %p, i32* %q) {
+; CHECK-LABEL: @PR49215(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[G:%.*]] = phi i32* [ [[P:%.*]], [[ENTRY]] ], [ [[UMIN:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ult i32* [[Q:%.*]], [[G]]
+; CHECK-NEXT:    [[UMIN]] = select i1 [[CMP2]], i32* [[Q]], i32* [[G]]
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], undef
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LOOPEXIT:%.*]], label [[FOR_BODY]]
+; CHECK:       loopexit:
+; CHECK-NEXT:    [[UMIN_LCSSA:%.*]] = phi i32* [ [[UMIN]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[PHI_CAST:%.*]] = ptrtoint i32* [[UMIN_LCSSA]] to i64
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %g = phi i32* [ %p, %entry ], [ %umin, %for.body ]
+  %cmp2 = icmp ult i32* %q, %g
+  %umin = select i1 %cmp2, i32* %q, i32* %g
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, undef
+  br i1 %exitcond, label %loopexit, label %for.body
+
+loopexit:
+  %phi.cast = ptrtoint i32* %umin to i64
+  ret void
+}

From 3444f052006ca2b19052a4599dd9001b01088c25 Mon Sep 17 00:00:00 2001
From: Brad Smith <brad@comstyle.com>
Date: Sat, 20 Feb 2021 20:43:16 -0500
Subject: [PATCH 155/318] [clang][Driver][OpenBSD] libcxx also requires pthread

(cherry picked from commit b42d57a100c5df6ace68f686f5adaabeafe8a0f6)
---
 clang/lib/Driver/ToolChains/OpenBSD.cpp | 1 +
 clang/test/Driver/openbsd.cpp           | 4 ++--
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/clang/lib/Driver/ToolChains/OpenBSD.cpp b/clang/lib/Driver/ToolChains/OpenBSD.cpp
index f155d74632f9..e162165b2561 100644
--- a/clang/lib/Driver/ToolChains/OpenBSD.cpp
+++ b/clang/lib/Driver/ToolChains/OpenBSD.cpp
@@ -296,6 +296,7 @@ void OpenBSD::AddCXXStdlibLibArgs(const ArgList &Args,
 
   CmdArgs.push_back(Profiling ? "-lc++_p" : "-lc++");
   CmdArgs.push_back(Profiling ? "-lc++abi_p" : "-lc++abi");
+  CmdArgs.push_back(Profiling ? "-lpthread_p" : "-lpthread");
 }
 
 std::string OpenBSD::getCompilerRT(const ArgList &Args,
diff --git a/clang/test/Driver/openbsd.cpp b/clang/test/Driver/openbsd.cpp
index 9293148680c8..23c365d28e7e 100644
--- a/clang/test/Driver/openbsd.cpp
+++ b/clang/test/Driver/openbsd.cpp
@@ -6,7 +6,7 @@
 // RUN:   | FileCheck --check-prefix=CHECK-CXX %s
 // RUN: %clangxx %s -### -o %t.o -target arm-unknown-openbsd 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-CXX %s
-// CHECK-CXX: "-lc++" "-lc++abi" "-lm"
+// CHECK-CXX: "-lc++" "-lc++abi" "-lpthread" "-lm"
 
 // RUN: %clangxx %s -### -pg -o %t.o -target amd64-pc-openbsd 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-PG-CXX %s
@@ -16,4 +16,4 @@
 // RUN:   | FileCheck --check-prefix=CHECK-PG-CXX %s
 // RUN: %clangxx %s -### -pg -o %t.o -target arm-unknown-openbsd 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-PG-CXX %s
-// CHECK-PG-CXX: "-lc++_p" "-lc++abi_p" "-lm_p"
+// CHECK-PG-CXX: "-lc++_p" "-lc++abi_p" "-lpthread_p" "-lm_p"

From 76e4c93ea42b3d23907611d14e347bfeae8d4b0a Mon Sep 17 00:00:00 2001
From: Conrad Poelman <cpgithub@stellarscience.com>
Date: Tue, 2 Feb 2021 05:59:38 +0100
Subject: [PATCH 156/318] clang-extra: fix incorrect use of std::lock_guard by
 adding variable name (identified by MSVC [[nodiscard]] error)

`std::lock_guard` is an RAII class that needs a variable name whose scope determines the guard's lifetime. This particular usage lacked a variable name, meaning the guard could be destroyed before the line that it was indented to protect.

This line was identified by building clang with the latest MSVC preview release, which declares the std::lock_guard constructor to be `[[nodiscard]]` to draw attention to such issues.

Reviewed By: kadircet

Differential Revision: https://reviews.llvm.org/D95725

(cherry picked from commit 0b70c86e2007d3f32968f0a7d9efe8eab3bf0f0a)
---
 clang-tools-extra/clangd/support/Function.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clang-tools-extra/clangd/support/Function.h b/clang-tools-extra/clangd/support/Function.h
index 2cac1b1e7f67..936800d56985 100644
--- a/clang-tools-extra/clangd/support/Function.h
+++ b/clang-tools-extra/clangd/support/Function.h
@@ -51,7 +51,7 @@ template <typename T> class Event {
     Subscription &operator=(Subscription &&Other) {
       // If *this is active, unsubscribe.
       if (Parent) {
-        std::lock_guard<std::recursive_mutex>(Parent->ListenersMu);
+        std::lock_guard<std::recursive_mutex> Lock(Parent->ListenersMu);
         llvm::erase_if(Parent->Listeners,
                        [&](const std::pair<Listener, unsigned> &P) {
                          return P.second == ListenerID;

From 8eeb3d99933a3246f2d850b807cf54f11a3a8dce Mon Sep 17 00:00:00 2001
From: Sam McCall <sam.mccall@gmail.com>
Date: Sun, 31 Jan 2021 13:53:22 +0100
Subject: [PATCH 157/318] [clangd] Rename: merge index/AST refs
 path-insensitively where needed

If you have c:\foo open, and C:\foo indexed (case difference) then these
need to be considered the same file. Otherwise we emit edits to both,
and editors do... something that isn't pretty.

Maybe more centralized normalization is called for, but it's not trivial
to do this while also being case-preserving. see
https://github.com/clangd/clangd/issues/108

Fixes https://github.com/clangd/clangd/issues/665

Differential Revision: https://reviews.llvm.org/D95759

(cherry picked from commit b63cd4db915c08e0cb4cf668a18de24b67f2c44c)
---
 .../clangd/GlobalCompilationDatabase.cpp      | 14 ------
 clang-tools-extra/clangd/refactor/Rename.cpp  |  4 +-
 .../clangd/support/CMakeLists.txt             |  1 +
 clang-tools-extra/clangd/support/Path.cpp     | 30 ++++++++++++
 clang-tools-extra/clangd/support/Path.h       |  6 +++
 .../clangd/unittests/RenameTests.cpp          | 46 +++++++++++++++++++
 6 files changed, 85 insertions(+), 16 deletions(-)
 create mode 100644 clang-tools-extra/clangd/support/Path.cpp

diff --git a/clang-tools-extra/clangd/GlobalCompilationDatabase.cpp b/clang-tools-extra/clangd/GlobalCompilationDatabase.cpp
index 542d0c3e4dbc..a38c8a57d161 100644
--- a/clang-tools-extra/clangd/GlobalCompilationDatabase.cpp
+++ b/clang-tools-extra/clangd/GlobalCompilationDatabase.cpp
@@ -396,20 +396,6 @@ DirectoryBasedGlobalCompilationDatabase::getCompileCommand(PathRef File) const {
   return None;
 }
 
-// For platforms where paths are case-insensitive (but case-preserving),
-// we need to do case-insensitive comparisons and use lowercase keys.
-// FIXME: Make Path a real class with desired semantics instead.
-//        This class is not the only place this problem exists.
-// FIXME: Mac filesystems default to case-insensitive, but may be sensitive.
-
-static std::string maybeCaseFoldPath(PathRef Path) {
-#if defined(_WIN32) || defined(__APPLE__)
-  return Path.lower();
-#else
-  return std::string(Path);
-#endif
-}
-
 std::vector<DirectoryBasedGlobalCompilationDatabase::DirectoryCache *>
 DirectoryBasedGlobalCompilationDatabase::getDirectoryCaches(
     llvm::ArrayRef<llvm::StringRef> Dirs) const {
diff --git a/clang-tools-extra/clangd/refactor/Rename.cpp b/clang-tools-extra/clangd/refactor/Rename.cpp
index d3c7da96a441..a857b3479871 100644
--- a/clang-tools-extra/clangd/refactor/Rename.cpp
+++ b/clang-tools-extra/clangd/refactor/Rename.cpp
@@ -68,7 +68,7 @@ llvm::Optional<std::string> getOtherRefFile(const Decl &D, StringRef MainFile,
     if (OtherFile)
       return;
     if (auto RefFilePath = filePath(R.Location, /*HintFilePath=*/MainFile)) {
-      if (*RefFilePath != MainFile)
+      if (!pathEqual(*RefFilePath, MainFile))
         OtherFile = *RefFilePath;
     }
   });
@@ -474,7 +474,7 @@ findOccurrencesOutsideFile(const NamedDecl &RenameDecl,
     if ((R.Kind & RefKind::Spelled) == RefKind::Unknown)
       return;
     if (auto RefFilePath = filePath(R.Location, /*HintFilePath=*/MainFile)) {
-      if (*RefFilePath != MainFile)
+      if (!pathEqual(*RefFilePath, MainFile))
         AffectedFiles[*RefFilePath].push_back(toRange(R.Location));
     }
   });
diff --git a/clang-tools-extra/clangd/support/CMakeLists.txt b/clang-tools-extra/clangd/support/CMakeLists.txt
index f0fe073eb136..fc7d7a28117b 100644
--- a/clang-tools-extra/clangd/support/CMakeLists.txt
+++ b/clang-tools-extra/clangd/support/CMakeLists.txt
@@ -23,6 +23,7 @@ add_clang_library(clangdSupport
   Logger.cpp
   Markup.cpp
   MemoryTree.cpp
+  Path.cpp
   Shutdown.cpp
   Threading.cpp
   ThreadsafeFS.cpp
diff --git a/clang-tools-extra/clangd/support/Path.cpp b/clang-tools-extra/clangd/support/Path.cpp
new file mode 100644
index 000000000000..f72d00070f34
--- /dev/null
+++ b/clang-tools-extra/clangd/support/Path.cpp
@@ -0,0 +1,30 @@
+//===--- Path.cpp -------------------------------------------*- C++-*------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "support/Path.h"
+namespace clang {
+namespace clangd {
+
+std::string maybeCaseFoldPath(PathRef Path) {
+#if defined(_WIN32) || defined(__APPLE__)
+  return Path.lower();
+#else
+  return std::string(Path);
+#endif
+}
+
+bool pathEqual(PathRef A, PathRef B) {
+#if defined(_WIN32) || defined(__APPLE__)
+  return A.equals_lower(B);
+#else
+  return A == B;
+#endif
+}
+
+} // namespace clangd
+} // namespace clang
diff --git a/clang-tools-extra/clangd/support/Path.h b/clang-tools-extra/clangd/support/Path.h
index 4d4ad7f49047..402903130f01 100644
--- a/clang-tools-extra/clangd/support/Path.h
+++ b/clang-tools-extra/clangd/support/Path.h
@@ -22,6 +22,12 @@ using Path = std::string;
 /// signatures.
 using PathRef = llvm::StringRef;
 
+// For platforms where paths are case-insensitive (but case-preserving),
+// we need to do case-insensitive comparisons and use lowercase keys.
+// FIXME: Make Path a real class with desired semantics instead.
+std::string maybeCaseFoldPath(PathRef Path);
+bool pathEqual(PathRef, PathRef);
+
 } // namespace clangd
 } // namespace clang
 
diff --git a/clang-tools-extra/clangd/unittests/RenameTests.cpp b/clang-tools-extra/clangd/unittests/RenameTests.cpp
index 4bc03796bb2b..e25850a68fe9 100644
--- a/clang-tools-extra/clangd/unittests/RenameTests.cpp
+++ b/clang-tools-extra/clangd/unittests/RenameTests.cpp
@@ -1067,6 +1067,52 @@ TEST(RenameTest, Renameable) {
   }
 }
 
+MATCHER_P(newText, T, "") { return arg.newText == T; }
+
+TEST(RenameTest, IndexMergeMainFile) {
+  Annotations Code("int ^x();");
+  TestTU TU = TestTU::withCode(Code.code());
+  TU.Filename = "main.cc";
+  auto AST = TU.build();
+
+  auto Main = testPath("main.cc");
+
+  auto Rename = [&](const SymbolIndex *Idx) {
+    auto GetDirtyBuffer = [&](PathRef Path) -> llvm::Optional<std::string> {
+      return Code.code().str(); // Every file has the same content.
+    };
+    RenameOptions Opts;
+    Opts.AllowCrossFile = true;
+    RenameInputs Inputs{Code.point(), "xPrime", AST,           Main,
+                        Idx,          Opts,     GetDirtyBuffer};
+    auto Results = rename(Inputs);
+    EXPECT_TRUE(bool(Results)) << llvm::toString(Results.takeError());
+    return std::move(*Results);
+  };
+
+  // We do not expect to see duplicated edits from AST vs index.
+  auto Results = Rename(TU.index().get());
+  EXPECT_THAT(Results.GlobalChanges.keys(), ElementsAre(Main));
+  EXPECT_THAT(Results.GlobalChanges[Main].asTextEdits(),
+              ElementsAre(newText("xPrime")));
+
+  // Sanity check: we do expect to see index results!
+  TU.Filename = "other.cc";
+  Results = Rename(TU.index().get());
+  EXPECT_THAT(Results.GlobalChanges.keys(),
+              UnorderedElementsAre(Main, testPath("other.cc")));
+
+#if defined(_WIN32) || defined(__APPLE__)
+  // On case-insensitive systems, no duplicates if AST vs index case differs.
+  // https://github.com/clangd/clangd/issues/665
+  TU.Filename = "MAIN.CC";
+  Results = Rename(TU.index().get());
+  EXPECT_THAT(Results.GlobalChanges.keys(), ElementsAre(Main));
+  EXPECT_THAT(Results.GlobalChanges[Main].asTextEdits(),
+              ElementsAre(newText("xPrime")));
+#endif
+}
+
 TEST(RenameTest, MainFileReferencesOnly) {
   // filter out references not from main file.
   llvm::StringRef Test =

From d8404633401509936600b60274b72fc03f11f040 Mon Sep 17 00:00:00 2001
From: Kadir Cetinkaya <kadircet@google.com>
Date: Mon, 15 Feb 2021 09:00:49 +0100
Subject: [PATCH 158/318] [clangd] Treat paths case-insensitively depending on
 the platform

Path{Match,Exclude} and MountPoint were checking paths case-sensitively
on all platforms, as with other features, this was causing problems on
windows. Since users can have capital drive letters on config files, but
editors might lower-case them.

This patch addresses that issue by:
- Creating regexes with case-insensitive matching on those platforms.
- Introducing a new pathIsAncestor helper, which performs checks in a
  case-correct manner where needed.

Differential Revision: https://reviews.llvm.org/D96690

(cherry picked from commit ecea7218fb9b994b26471e9877851cdb51a5f1d4)
---
 clang-tools-extra/clangd/ConfigCompile.cpp    | 20 +++++++---
 clang-tools-extra/clangd/support/Path.cpp     | 37 ++++++++++++-------
 clang-tools-extra/clangd/support/Path.h       | 12 ++++++
 .../clangd/unittests/CMakeLists.txt           |  1 +
 .../clangd/unittests/ConfigCompileTests.cpp   | 36 ++++++++++++++++++
 .../clangd/unittests/RenameTests.cpp          |  2 +-
 .../clangd/unittests/support/PathTests.cpp    | 36 ++++++++++++++++++
 7 files changed, 124 insertions(+), 20 deletions(-)
 create mode 100644 clang-tools-extra/clangd/unittests/support/PathTests.cpp

diff --git a/clang-tools-extra/clangd/ConfigCompile.cpp b/clang-tools-extra/clangd/ConfigCompile.cpp
index 8682cae36f26..dadc578c3a81 100644
--- a/clang-tools-extra/clangd/ConfigCompile.cpp
+++ b/clang-tools-extra/clangd/ConfigCompile.cpp
@@ -31,6 +31,7 @@
 #include "Features.inc"
 #include "TidyProvider.h"
 #include "support/Logger.h"
+#include "support/Path.h"
 #include "support/Trace.h"
 #include "llvm/ADT/None.h"
 #include "llvm/ADT/Optional.h"
@@ -101,9 +102,11 @@ struct FragmentCompiler {
   // Normalized Fragment::SourceInfo::Directory.
   std::string FragmentDirectory;
 
-  llvm::Optional<llvm::Regex> compileRegex(const Located<std::string> &Text) {
+  llvm::Optional<llvm::Regex>
+  compileRegex(const Located<std::string> &Text,
+               llvm::Regex::RegexFlags Flags = llvm::Regex::NoFlags) {
     std::string Anchored = "^(" + *Text + ")$";
-    llvm::Regex Result(Anchored);
+    llvm::Regex Result(Anchored, Flags);
     std::string RegexError;
     if (!Result.isValid(RegexError)) {
       diag(Error, "Invalid regex " + Anchored + ": " + RegexError, Text.Range);
@@ -195,9 +198,15 @@ struct FragmentCompiler {
     if (F.HasUnrecognizedCondition)
       Out.Conditions.push_back([&](const Params &) { return false; });
 
+#ifdef CLANGD_PATH_CASE_INSENSITIVE
+    llvm::Regex::RegexFlags Flags = llvm::Regex::IgnoreCase;
+#else
+    llvm::Regex::RegexFlags Flags = llvm::Regex::NoFlags;
+#endif
+
     auto PathMatch = std::make_unique<std::vector<llvm::Regex>>();
     for (auto &Entry : F.PathMatch) {
-      if (auto RE = compileRegex(Entry))
+      if (auto RE = compileRegex(Entry, Flags))
         PathMatch->push_back(std::move(*RE));
     }
     if (!PathMatch->empty()) {
@@ -218,7 +227,7 @@ struct FragmentCompiler {
 
     auto PathExclude = std::make_unique<std::vector<llvm::Regex>>();
     for (auto &Entry : F.PathExclude) {
-      if (auto RE = compileRegex(Entry))
+      if (auto RE = compileRegex(Entry, Flags))
         PathExclude->push_back(std::move(*RE));
     }
     if (!PathExclude->empty()) {
@@ -349,7 +358,8 @@ struct FragmentCompiler {
       return;
     Spec.MountPoint = std::move(*AbsPath);
     Out.Apply.push_back([Spec(std::move(Spec))](const Params &P, Config &C) {
-      if (!P.Path.startswith(Spec.MountPoint))
+      if (P.Path.empty() || !pathStartsWith(Spec.MountPoint, P.Path,
+                                            llvm::sys::path::Style::posix))
         return;
       C.Index.External = Spec;
       // Disable background indexing for the files under the mountpoint.
diff --git a/clang-tools-extra/clangd/support/Path.cpp b/clang-tools-extra/clangd/support/Path.cpp
index f72d00070f34..6fc74b92fc7a 100644
--- a/clang-tools-extra/clangd/support/Path.cpp
+++ b/clang-tools-extra/clangd/support/Path.cpp
@@ -7,24 +7,33 @@
 //===----------------------------------------------------------------------===//
 
 #include "support/Path.h"
+#include "llvm/Support/Path.h"
 namespace clang {
 namespace clangd {
 
-std::string maybeCaseFoldPath(PathRef Path) {
-#if defined(_WIN32) || defined(__APPLE__)
-  return Path.lower();
-#else
-  return std::string(Path);
-#endif
-}
+#ifdef CLANGD_PATH_CASE_INSENSITIVE
+std::string maybeCaseFoldPath(PathRef Path) { return Path.lower(); }
+bool pathEqual(PathRef A, PathRef B) { return A.equals_lower(B); }
+#else  // NOT CLANGD_PATH_CASE_INSENSITIVE
+std::string maybeCaseFoldPath(PathRef Path) { return Path.str(); }
+bool pathEqual(PathRef A, PathRef B) { return A == B; }
+#endif // CLANGD_PATH_CASE_INSENSITIVE
 
-bool pathEqual(PathRef A, PathRef B) {
-#if defined(_WIN32) || defined(__APPLE__)
-  return A.equals_lower(B);
-#else
-  return A == B;
-#endif
+bool pathStartsWith(PathRef Ancestor, PathRef Path,
+                    llvm::sys::path::Style Style) {
+  assert(llvm::sys::path::is_absolute(Ancestor, Style) &&
+         llvm::sys::path::is_absolute(Path, Style));
+  // If ancestor ends with a separator drop that, so that we can match /foo/ as
+  // a parent of /foo.
+  if (llvm::sys::path::is_separator(Ancestor.back(), Style))
+    Ancestor = Ancestor.drop_back();
+  // Ensure Path starts with Ancestor.
+  if (!pathEqual(Ancestor, Path.take_front(Ancestor.size())))
+    return false;
+  Path = Path.drop_front(Ancestor.size());
+  // Then make sure either two paths are equal or Path has a separator
+  // afterwards.
+  return Path.empty() || llvm::sys::path::is_separator(Path.front(), Style);
 }
-
 } // namespace clangd
 } // namespace clang
diff --git a/clang-tools-extra/clangd/support/Path.h b/clang-tools-extra/clangd/support/Path.h
index 402903130f01..938d7d7e99c9 100644
--- a/clang-tools-extra/clangd/support/Path.h
+++ b/clang-tools-extra/clangd/support/Path.h
@@ -10,8 +10,14 @@
 #define LLVM_CLANG_TOOLS_EXTRA_CLANGD_SUPPORT_PATH_H
 
 #include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Path.h"
 #include <string>
 
+/// Whether current platform treats paths case insensitively.
+#if defined(_WIN32) || defined(__APPLE__)
+#define CLANGD_PATH_CASE_INSENSITIVE
+#endif
+
 namespace clang {
 namespace clangd {
 
@@ -28,6 +34,12 @@ using PathRef = llvm::StringRef;
 std::string maybeCaseFoldPath(PathRef Path);
 bool pathEqual(PathRef, PathRef);
 
+/// Checks if \p Ancestor is a proper ancestor of \p Path. This is just a
+/// smarter lexical prefix match, e.g: foo/bar/baz doesn't start with foo/./bar.
+/// Both \p Ancestor and \p Path must be absolute.
+bool pathStartsWith(
+    PathRef Ancestor, PathRef Path,
+    llvm::sys::path::Style Style = llvm::sys::path::Style::native);
 } // namespace clangd
 } // namespace clang
 
diff --git a/clang-tools-extra/clangd/unittests/CMakeLists.txt b/clang-tools-extra/clangd/unittests/CMakeLists.txt
index f4d364720eaf..c396c6f5873b 100644
--- a/clang-tools-extra/clangd/unittests/CMakeLists.txt
+++ b/clang-tools-extra/clangd/unittests/CMakeLists.txt
@@ -104,6 +104,7 @@ add_unittest(ClangdUnitTests ClangdTests
   support/FunctionTests.cpp
   support/MarkupTests.cpp
   support/MemoryTreeTests.cpp
+  support/PathTests.cpp
   support/ThreadingTests.cpp
   support/TestTracer.cpp
   support/TraceTests.cpp
diff --git a/clang-tools-extra/clangd/unittests/ConfigCompileTests.cpp b/clang-tools-extra/clangd/unittests/ConfigCompileTests.cpp
index 4b1da2035727..d9aa171f3102 100644
--- a/clang-tools-extra/clangd/unittests/ConfigCompileTests.cpp
+++ b/clang-tools-extra/clangd/unittests/ConfigCompileTests.cpp
@@ -99,6 +99,25 @@ TEST_F(ConfigCompileTests, Condition) {
   Frag.If.PathMatch.emplace_back("ba*r");
   EXPECT_FALSE(compileAndApply());
   EXPECT_THAT(Diags.Diagnostics, IsEmpty());
+
+  // Only matches case-insensitively.
+  Frag = {};
+  Frag.If.PathMatch.emplace_back("B.*R");
+  EXPECT_THAT(Diags.Diagnostics, IsEmpty());
+#ifdef CLANGD_PATH_CASE_INSENSITIVE
+  EXPECT_TRUE(compileAndApply());
+#else
+  EXPECT_FALSE(compileAndApply());
+#endif
+
+  Frag = {};
+  Frag.If.PathExclude.emplace_back("B.*R");
+  EXPECT_THAT(Diags.Diagnostics, IsEmpty());
+#ifdef CLANGD_PATH_CASE_INSENSITIVE
+  EXPECT_FALSE(compileAndApply());
+#else
+  EXPECT_TRUE(compileAndApply());
+#endif
 }
 
 TEST_F(ConfigCompileTests, CompileCommands) {
@@ -406,6 +425,23 @@ TEST_F(ConfigCompileTests, ExternalBlockMountPoint) {
   ASSERT_THAT(Diags.Diagnostics, IsEmpty());
   ASSERT_TRUE(Conf.Index.External);
   EXPECT_THAT(Conf.Index.External->MountPoint, FooPath);
+
+  // Only matches case-insensitively.
+  BazPath = testPath("fOo/baz.h", llvm::sys::path::Style::posix);
+  BazPath = llvm::sys::path::convert_to_slash(BazPath);
+  Parm.Path = BazPath;
+
+  FooPath = testPath("FOO/", llvm::sys::path::Style::posix);
+  FooPath = llvm::sys::path::convert_to_slash(FooPath);
+  Frag = GetFrag("", FooPath.c_str());
+  compileAndApply();
+  ASSERT_THAT(Diags.Diagnostics, IsEmpty());
+#ifdef CLANGD_PATH_CASE_INSENSITIVE
+  ASSERT_TRUE(Conf.Index.External);
+  EXPECT_THAT(Conf.Index.External->MountPoint, FooPath);
+#else
+  ASSERT_FALSE(Conf.Index.External);
+#endif
 }
 } // namespace
 } // namespace config
diff --git a/clang-tools-extra/clangd/unittests/RenameTests.cpp b/clang-tools-extra/clangd/unittests/RenameTests.cpp
index e25850a68fe9..b2c83a1a4303 100644
--- a/clang-tools-extra/clangd/unittests/RenameTests.cpp
+++ b/clang-tools-extra/clangd/unittests/RenameTests.cpp
@@ -1102,7 +1102,7 @@ TEST(RenameTest, IndexMergeMainFile) {
   EXPECT_THAT(Results.GlobalChanges.keys(),
               UnorderedElementsAre(Main, testPath("other.cc")));
 
-#if defined(_WIN32) || defined(__APPLE__)
+#ifdef CLANGD_PATH_CASE_INSENSITIVE
   // On case-insensitive systems, no duplicates if AST vs index case differs.
   // https://github.com/clangd/clangd/issues/665
   TU.Filename = "MAIN.CC";
diff --git a/clang-tools-extra/clangd/unittests/support/PathTests.cpp b/clang-tools-extra/clangd/unittests/support/PathTests.cpp
new file mode 100644
index 000000000000..26b999d103a0
--- /dev/null
+++ b/clang-tools-extra/clangd/unittests/support/PathTests.cpp
@@ -0,0 +1,36 @@
+//===-- PathTests.cpp -------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "support/Path.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+namespace clang {
+namespace clangd {
+namespace {
+TEST(PathTests, IsAncestor) {
+  EXPECT_TRUE(pathStartsWith("/foo", "/foo"));
+  EXPECT_TRUE(pathStartsWith("/foo/", "/foo"));
+
+  EXPECT_FALSE(pathStartsWith("/foo", "/fooz"));
+  EXPECT_FALSE(pathStartsWith("/foo/", "/fooz"));
+
+  EXPECT_TRUE(pathStartsWith("/foo", "/foo/bar"));
+  EXPECT_TRUE(pathStartsWith("/foo/", "/foo/bar"));
+
+#ifdef CLANGD_PATH_CASE_INSENSITIVE
+  EXPECT_TRUE(pathStartsWith("/fOo", "/foo/bar"));
+  EXPECT_TRUE(pathStartsWith("/foo", "/fOo/bar"));
+#else
+  EXPECT_FALSE(pathStartsWith("/fOo", "/foo/bar"));
+  EXPECT_FALSE(pathStartsWith("/foo", "/fOo/bar"));
+#endif
+}
+} // namespace
+} // namespace clangd
+} // namespace clang

From b60110090a942078bbacf71db166c2353c340413 Mon Sep 17 00:00:00 2001
From: Kadir Cetinkaya <kadircet@google.com>
Date: Tue, 16 Feb 2021 20:57:00 +0100
Subject: [PATCH 159/318] [clangd] Fix windows buildbots after
 ecea7218fb9b994b26471e9877851cdb51a5f1d4

(cherry picked from commit cdef5a7161767c2c4b3b7cb2542cf1d29b6d4a09)
---
 clang-tools-extra/clangd/support/Path.cpp     |  4 ++--
 .../clangd/unittests/support/PathTests.cpp    | 21 ++++++++++---------
 2 files changed, 13 insertions(+), 12 deletions(-)

diff --git a/clang-tools-extra/clangd/support/Path.cpp b/clang-tools-extra/clangd/support/Path.cpp
index 6fc74b92fc7a..a7907cffe60c 100644
--- a/clang-tools-extra/clangd/support/Path.cpp
+++ b/clang-tools-extra/clangd/support/Path.cpp
@@ -21,8 +21,8 @@ bool pathEqual(PathRef A, PathRef B) { return A == B; }
 
 bool pathStartsWith(PathRef Ancestor, PathRef Path,
                     llvm::sys::path::Style Style) {
-  assert(llvm::sys::path::is_absolute(Ancestor, Style) &&
-         llvm::sys::path::is_absolute(Path, Style));
+  assert(llvm::sys::path::is_absolute(Ancestor) &&
+         llvm::sys::path::is_absolute(Path));
   // If ancestor ends with a separator drop that, so that we can match /foo/ as
   // a parent of /foo.
   if (llvm::sys::path::is_separator(Ancestor.back(), Style))
diff --git a/clang-tools-extra/clangd/unittests/support/PathTests.cpp b/clang-tools-extra/clangd/unittests/support/PathTests.cpp
index 26b999d103a0..599c76926d30 100644
--- a/clang-tools-extra/clangd/unittests/support/PathTests.cpp
+++ b/clang-tools-extra/clangd/unittests/support/PathTests.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "TestFS.h"
 #include "support/Path.h"
 #include "gmock/gmock.h"
 #include "gtest/gtest.h"
@@ -14,21 +15,21 @@ namespace clang {
 namespace clangd {
 namespace {
 TEST(PathTests, IsAncestor) {
-  EXPECT_TRUE(pathStartsWith("/foo", "/foo"));
-  EXPECT_TRUE(pathStartsWith("/foo/", "/foo"));
+  EXPECT_TRUE(pathStartsWith(testPath("foo"), testPath("foo")));
+  EXPECT_TRUE(pathStartsWith(testPath("foo/"), testPath("foo")));
 
-  EXPECT_FALSE(pathStartsWith("/foo", "/fooz"));
-  EXPECT_FALSE(pathStartsWith("/foo/", "/fooz"));
+  EXPECT_FALSE(pathStartsWith(testPath("foo"), testPath("fooz")));
+  EXPECT_FALSE(pathStartsWith(testPath("foo/"), testPath("fooz")));
 
-  EXPECT_TRUE(pathStartsWith("/foo", "/foo/bar"));
-  EXPECT_TRUE(pathStartsWith("/foo/", "/foo/bar"));
+  EXPECT_TRUE(pathStartsWith(testPath("foo"), testPath("foo/bar")));
+  EXPECT_TRUE(pathStartsWith(testPath("foo/"), testPath("foo/bar")));
 
 #ifdef CLANGD_PATH_CASE_INSENSITIVE
-  EXPECT_TRUE(pathStartsWith("/fOo", "/foo/bar"));
-  EXPECT_TRUE(pathStartsWith("/foo", "/fOo/bar"));
+  EXPECT_TRUE(pathStartsWith(testPath("fOo"), testPath("foo/bar")));
+  EXPECT_TRUE(pathStartsWith(testPath("foo"), testPath("fOo/bar")));
 #else
-  EXPECT_FALSE(pathStartsWith("/fOo", "/foo/bar"));
-  EXPECT_FALSE(pathStartsWith("/foo", "/fOo/bar"));
+  EXPECT_FALSE(pathStartsWith(testPath("fOo"), testPath("foo/bar")));
+  EXPECT_FALSE(pathStartsWith(testPath("foo"), testPath("fOo/bar")));
 #endif
 }
 } // namespace

From 67d6fbe0f157ba78e8131964d60155dc1090f409 Mon Sep 17 00:00:00 2001
From: Sam McCall <sam.mccall@gmail.com>
Date: Mon, 22 Feb 2021 22:05:26 +0100
Subject: [PATCH 160/318] [clangd] Release notes for 12.x

---
 clang-tools-extra/docs/ReleaseNotes.rst | 169 ++++++++++++++++++++++++
 1 file changed, 169 insertions(+)

diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst
index 2960aad5a556..64b3d224ff6f 100644
--- a/clang-tools-extra/docs/ReleaseNotes.rst
+++ b/clang-tools-extra/docs/ReleaseNotes.rst
@@ -47,6 +47,9 @@ Major New Features
 Improvements to clangd
 ----------------------
 
+Performance
+^^^^^^^^^^^
+
 - clangd's memory usage is significantly reduced on most Linux systems.
   In particular, memory usage should not increase dramatically over time.
 
@@ -59,6 +62,172 @@ Improvements to clangd
   systems can disable this using ``--malloc_trim=0`` or the CMake flag
   ``-DCLANGD_MALLOC_TRIM=0``.
 
+- Added the `$/memoryUsage request
+  <https://clangd.llvm.org/extensions.html#memory-usage>`_: an LSP extension.
+  This provides a breakdown of the memory clangd thinks it is using (excluding
+  malloc overhead etc). The clangd VSCode extension supports showing the memory
+  usage tree.
+
+Parsing and selection
+^^^^^^^^^^^^^^^^^^^^^
+
+- Improved navigation of broken code in C using Recovery AST. (This has been
+  enabled for C++ since clangd 11).
+
+- Types are understood more often in broken code. (This is the first release
+  where Recovery AST preserves speculated types).
+
+- Heuristic resolution for dependent names in templates.
+
+Code completion
+^^^^^^^^^^^^^^^
+
+- Higher priority for symbols that were already used in this file, and symbols
+  from namespaces mentioned in this file. (Estimated 3% accuracy improvement)
+
+- Introduced a ranking algorithm trained on snippets from a large C++ codebase.
+  Use the flag ``--ranking-model=decision_forest`` to try this (Estimated 6%
+  accuracy improvement). This mode is likely to become the default in future.
+
+  Note: this is a generic model, not specialized for your code. clangd does not
+  collect any data from your code to train code completion.
+
+- Signature help works with functions with template-dependent parameter types.
+
+Go to definition
+^^^^^^^^^^^^^^^^
+
+- Selecting an ``auto`` or ``decltype`` keyword will attempt to navigate to
+  a definition of the deduced type.
+
+- Improved handling of aliases: navigate to the underlying entity more often.
+
+- Better understanding of declaration vs definition for Objective-C classes and
+  protocols.
+
+- Selecting a pure-virtual method shows its overrides.
+
+Find references
+^^^^^^^^^^^^^^^
+
+- Indexes are smarter about not returning stale references when code is deleted.
+
+- References in implementation files are always indexed, so results should be
+  more complete.
+
+- Find-references on a virtual method shows references to overridden methods.
+
+New navigation features
+^^^^^^^^^^^^^^^^^^^^^^^
+
+- Call hierarchy (``textDocument/callHierarchy``) is supported.
+  Only incoming calls are available.
+
+- Go to implementation (``textDocument/implementation``) is supported on
+  abstract classes, and on virtual methods.
+
+- Symbol search (``workspace/symbol``) queries may be partially qualified.
+  That is, typing ``b::Foo`` will match the symbol ``a::b::c::Foo``.
+
+Refactoring
+^^^^^^^^^^^
+
+- New refactoring: populate ``switch`` statement with cases.
+  (This acts as a fix for the ``-Wswitch-enum`` warning).
+
+- Renaming templates is supported, and many other complex cases were fixed.
+
+- Attempting to rename to an invalid or conflicting name can produce an error
+  message rather than broken code. (Not all cases are detected!)
+
+- The accuracy of many code actions has been improved.
+
+Hover
+^^^^^
+
+- Hovers for ``auto`` and ``decltype`` show the type in the same style as other
+  hovers. ``this`` is also now supported.
+
+- Displayed type names are more consistent and idiomatic.
+
+Semantic highlighting
+^^^^^^^^^^^^^^^^^^^^^
+
+- Inactive preprocessor regions (``#ifdef``) are highlighted as comments.
+
+- clangd 12 is the last release with support for the non-standard
+  ``textDocument/semanticHighlights`` notification. Clients sholud migrate to
+  the ``textDocument/semanticTokens`` request added in LSP 3.16.
+
+Remote index (alpha)
+^^^^^^^^^^^^^^^^^^^^
+
+- clangd can now connect to a remote index server instead of building a project
+  index locally. This saves resources in large codebases that are slow to index.
+
+- The server program is ``clangd-index-server``, and it consumes index files
+  produced by ``clangd-indexer``.
+
+- This feature requires clangd to be built with the CMake flag
+  ``-DCLANGD_ENABLE_REMOTE=On``, which requires GRPC libraries and is not
+  enabled by default. Unofficial releases of the remote-index-enabled client
+  and server tools are at https://github.com/clangd/clangd/releases
+
+- Large projects can deploy a shared server, and check in a ``.clangd`` file
+  to enable it (in the ``Index.External`` section). We hope to provide such a
+  server for ``llvm-project`` itself in the near future.
+
+Configuration
+^^^^^^^^^^^^^
+
+- Static and remote indexes can be configured in the ``Index.External`` section.
+  Different static indexes can now be used for different files.
+  (Obsoletes the flag ``--index-file``).
+
+- Diagnostics can be filtered or suppressed in the ``Diagnostics`` section.
+
+- Clang-tidy checks can be enabled/disabled in the ``Diagnostics.ClangTidy``
+  section. (Obsoletes the flag ``--clang-tidy-checks``).
+
+- The compilation database directory can be configured in the ``CompileFlags``
+  section. Different compilation databases can now be specified for different
+  files. (Obsoletes the flag ``--compile-commands-dir``).
+
+- Errors in loaded configuration files are published as LSP diagnostics, and so
+  should be shown in your editor.
+
+`Full reference of configuration options <https://clangd.llvm.org/config.html>`_
+
+System integration
+^^^^^^^^^^^^^^^^^^
+
+- Changes to ``compile_commands.json`` and ``compile_flags.txt`` will take
+  effect the next time a file is parsed, without restarting clangd.
+
+- ``clangd --check=<filename>`` can be run on the command-line to simulate
+  opening a file without actually using an editor. This can be useful to
+  reproduce crashes or aother problems.
+
+- Various fixes to handle filenames correctly (and case-insensitively) on
+  windows.
+
+- If incoming LSP messages are malformed, the logs now contain details.
+
+Miscellaneous
+^^^^^^^^^^^^^
+
+- "Show AST" request
+  (`textDocument/ast <https://clangd.llvm.org/extensions.html#ast>`_)
+  added as an LSP extension. This displays a simplified view of the clang AST
+  for selected code. The clangd VSCode extension supports this.
+
+- clangd should no longer crash while loading old or corrupt index files.
+
+- The flags ``--index``, ``--recovery-ast`` and ``-suggest-missing-includes``
+  have been retired. These features are now always enabled.
+
+- Too many stability and correctness fixes to mention.
+
 Improvements to clang-doc
 -------------------------
 

From a750a2329c433e598f7fc9655d625c5ebb6bc400 Mon Sep 17 00:00:00 2001
From: Tom Stellard <tstellar@redhat.com>
Date: Mon, 22 Feb 2021 16:27:19 -0800
Subject: [PATCH 161/318] clang-tidy: Disable
 cppcoreguidlines-prefer-member-initializer check

Fixes https://llvm.org/PR49318
---
 .../cppcoreguidelines/CMakeLists.txt          |   1 -
 .../CppCoreGuidelinesTidyModule.cpp           |   3 -
 .../PreferMemberInitializerCheck.cpp          | 246 ---------
 .../PreferMemberInitializerCheck.h            |  41 --
 clang-tools-extra/docs/ReleaseNotes.rst       |   6 -
 ...reguidelines-prefer-member-initializer.rst | 103 ----
 ...ize-use-default-member-init-assignment.cpp |  31 --
 ...izer-modernize-use-default-member-init.cpp |  30 --
 ...reguidelines-prefer-member-initializer.cpp | 490 ------------------
 9 files changed, 951 deletions(-)
 delete mode 100644 clang-tools-extra/clang-tidy/cppcoreguidelines/PreferMemberInitializerCheck.cpp
 delete mode 100644 clang-tools-extra/clang-tidy/cppcoreguidelines/PreferMemberInitializerCheck.h
 delete mode 100644 clang-tools-extra/docs/clang-tidy/checks/cppcoreguidelines-prefer-member-initializer.rst
 delete mode 100644 clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines-prefer-member-initializer-modernize-use-default-member-init-assignment.cpp
 delete mode 100644 clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines-prefer-member-initializer-modernize-use-default-member-init.cpp
 delete mode 100644 clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines-prefer-member-initializer.cpp

diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/CMakeLists.txt b/clang-tools-extra/clang-tidy/cppcoreguidelines/CMakeLists.txt
index a9f5b3e0c15b..39c2c552eb73 100644
--- a/clang-tools-extra/clang-tidy/cppcoreguidelines/CMakeLists.txt
+++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/CMakeLists.txt
@@ -13,7 +13,6 @@ add_clang_library(clangTidyCppCoreGuidelinesModule
   NarrowingConversionsCheck.cpp
   NoMallocCheck.cpp
   OwningMemoryCheck.cpp
-  PreferMemberInitializerCheck.cpp
   ProBoundsArrayToPointerDecayCheck.cpp
   ProBoundsConstantArrayIndexCheck.cpp
   ProBoundsPointerArithmeticCheck.cpp
diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/CppCoreGuidelinesTidyModule.cpp b/clang-tools-extra/clang-tidy/cppcoreguidelines/CppCoreGuidelinesTidyModule.cpp
index bf613109f0eb..4cb5022888d3 100644
--- a/clang-tools-extra/clang-tidy/cppcoreguidelines/CppCoreGuidelinesTidyModule.cpp
+++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/CppCoreGuidelinesTidyModule.cpp
@@ -22,7 +22,6 @@
 #include "NarrowingConversionsCheck.h"
 #include "NoMallocCheck.h"
 #include "OwningMemoryCheck.h"
-#include "PreferMemberInitializerCheck.h"
 #include "ProBoundsArrayToPointerDecayCheck.h"
 #include "ProBoundsConstantArrayIndexCheck.h"
 #include "ProBoundsPointerArithmeticCheck.h"
@@ -67,8 +66,6 @@ class CppCoreGuidelinesModule : public ClangTidyModule {
         "cppcoreguidelines-non-private-member-variables-in-classes");
     CheckFactories.registerCheck<OwningMemoryCheck>(
         "cppcoreguidelines-owning-memory");
-    CheckFactories.registerCheck<PreferMemberInitializerCheck>(
-        "cppcoreguidelines-prefer-member-initializer");
     CheckFactories.registerCheck<ProBoundsArrayToPointerDecayCheck>(
         "cppcoreguidelines-pro-bounds-array-to-pointer-decay");
     CheckFactories.registerCheck<ProBoundsConstantArrayIndexCheck>(
diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/PreferMemberInitializerCheck.cpp b/clang-tools-extra/clang-tidy/cppcoreguidelines/PreferMemberInitializerCheck.cpp
deleted file mode 100644
index 2d7500943860..000000000000
--- a/clang-tools-extra/clang-tidy/cppcoreguidelines/PreferMemberInitializerCheck.cpp
+++ /dev/null
@@ -1,246 +0,0 @@
-//===--- PreferMemberInitializerCheck.cpp - clang-tidy -------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "PreferMemberInitializerCheck.h"
-#include "clang/AST/ASTContext.h"
-#include "clang/ASTMatchers/ASTMatchFinder.h"
-#include "clang/Lex/Lexer.h"
-
-using namespace clang::ast_matchers;
-
-namespace clang {
-namespace tidy {
-namespace cppcoreguidelines {
-
-static bool isControlStatement(const Stmt *S) {
-  return isa<IfStmt, SwitchStmt, ForStmt, WhileStmt, DoStmt, ReturnStmt,
-             GotoStmt, CXXTryStmt, CXXThrowExpr>(S);
-}
-
-static bool isNoReturnCallStatement(const Stmt *S) {
-  const auto *Call = dyn_cast<CallExpr>(S);
-  if (!Call)
-    return false;
-
-  const FunctionDecl *Func = Call->getDirectCallee();
-  if (!Func)
-    return false;
-
-  return Func->isNoReturn();
-}
-
-static bool isLiteral(const Expr *E) {
-  return isa<StringLiteral, CharacterLiteral, IntegerLiteral, FloatingLiteral,
-             CXXBoolLiteralExpr, CXXNullPtrLiteralExpr>(E);
-}
-
-static bool isUnaryExprOfLiteral(const Expr *E) {
-  if (const auto *UnOp = dyn_cast<UnaryOperator>(E))
-    return isLiteral(UnOp->getSubExpr());
-  return false;
-}
-
-static bool shouldBeDefaultMemberInitializer(const Expr *Value) {
-  if (isLiteral(Value) || isUnaryExprOfLiteral(Value))
-    return true;
-
-  if (const auto *DRE = dyn_cast<DeclRefExpr>(Value))
-    return isa<EnumConstantDecl>(DRE->getDecl());
-
-  return false;
-}
-
-static const std::pair<const FieldDecl *, const Expr *>
-isAssignmentToMemberOf(const RecordDecl *Rec, const Stmt *S) {
-  if (const auto *BO = dyn_cast<BinaryOperator>(S)) {
-    if (BO->getOpcode() != BO_Assign)
-      return std::make_pair(nullptr, nullptr);
-
-    const auto *ME = dyn_cast<MemberExpr>(BO->getLHS()->IgnoreParenImpCasts());
-    if (!ME)
-      return std::make_pair(nullptr, nullptr);
-
-    const auto *Field = dyn_cast<FieldDecl>(ME->getMemberDecl());
-    if (!Field)
-      return std::make_pair(nullptr, nullptr);
-
-    if (isa<CXXThisExpr>(ME->getBase()))
-      return std::make_pair(Field, BO->getRHS()->IgnoreParenImpCasts());
-  } else if (const auto *COCE = dyn_cast<CXXOperatorCallExpr>(S)) {
-    if (COCE->getOperator() != OO_Equal)
-      return std::make_pair(nullptr, nullptr);
-
-    const auto *ME =
-        dyn_cast<MemberExpr>(COCE->getArg(0)->IgnoreParenImpCasts());
-    if (!ME)
-      return std::make_pair(nullptr, nullptr);
-
-    const auto *Field = dyn_cast<FieldDecl>(ME->getMemberDecl());
-    if (!Field)
-      return std::make_pair(nullptr, nullptr);
-
-    if (isa<CXXThisExpr>(ME->getBase()))
-      return std::make_pair(Field, COCE->getArg(1)->IgnoreParenImpCasts());
-  }
-
-  return std::make_pair(nullptr, nullptr);
-}
-
-PreferMemberInitializerCheck::PreferMemberInitializerCheck(
-    StringRef Name, ClangTidyContext *Context)
-    : ClangTidyCheck(Name, Context),
-      IsUseDefaultMemberInitEnabled(
-          Context->isCheckEnabled("modernize-use-default-member-init")),
-      UseAssignment(OptionsView("modernize-use-default-member-init",
-                                Context->getOptions().CheckOptions, Context)
-                        .get("UseAssignment", false)) {}
-
-void PreferMemberInitializerCheck::storeOptions(
-    ClangTidyOptions::OptionMap &Opts) {
-  Options.store(Opts, "UseAssignment", UseAssignment);
-}
-
-void PreferMemberInitializerCheck::registerMatchers(MatchFinder *Finder) {
-  Finder->addMatcher(
-      cxxConstructorDecl(hasBody(compoundStmt()), unless(isInstantiated()))
-          .bind("ctor"),
-      this);
-}
-
-void PreferMemberInitializerCheck::check(
-    const MatchFinder::MatchResult &Result) {
-  const auto *Ctor = Result.Nodes.getNodeAs<CXXConstructorDecl>("ctor");
-  const auto *Body = cast<CompoundStmt>(Ctor->getBody());
-
-  const CXXRecordDecl *Class = Ctor->getParent();
-  SourceLocation InsertPos;
-  bool FirstToCtorInits = true;
-
-  for (const Stmt *S : Body->body()) {
-    if (S->getBeginLoc().isMacroID()) {
-      StringRef MacroName =
-        Lexer::getImmediateMacroName(S->getBeginLoc(), *Result.SourceManager,
-                                     getLangOpts());
-      if (MacroName.contains_lower("assert"))
-        return;
-    }
-    if (isControlStatement(S))
-      return;
-
-    if (isNoReturnCallStatement(S))
-      return;
-
-    if (const auto *CondOp = dyn_cast<ConditionalOperator>(S)) {
-      if (isNoReturnCallStatement(CondOp->getLHS()) ||
-          isNoReturnCallStatement(CondOp->getRHS()))
-        return;
-    }
-
-    const FieldDecl *Field;
-    const Expr *InitValue;
-    std::tie(Field, InitValue) = isAssignmentToMemberOf(Class, S);
-    if (Field) {
-      if (IsUseDefaultMemberInitEnabled && getLangOpts().CPlusPlus11 &&
-          Ctor->isDefaultConstructor() &&
-          (getLangOpts().CPlusPlus20 || !Field->isBitField()) &&
-          (!isa<RecordDecl>(Class->getDeclContext()) ||
-           !cast<RecordDecl>(Class->getDeclContext())->isUnion()) &&
-          shouldBeDefaultMemberInitializer(InitValue)) {
-        auto Diag =
-            diag(S->getBeginLoc(), "%0 should be initialized in an in-class"
-                                   " default member initializer")
-            << Field;
-
-        SourceLocation FieldEnd =
-            Lexer::getLocForEndOfToken(Field->getSourceRange().getEnd(), 0,
-                                       *Result.SourceManager, getLangOpts());
-        Diag << FixItHint::CreateInsertion(FieldEnd,
-                                           UseAssignment ? " = " : "{")
-             << FixItHint::CreateInsertionFromRange(
-                    FieldEnd,
-                    CharSourceRange(InitValue->getSourceRange(), true))
-             << FixItHint::CreateInsertion(FieldEnd, UseAssignment ? "" : "}");
-
-        SourceLocation SemiColonEnd =
-            Lexer::findNextToken(S->getEndLoc(), *Result.SourceManager,
-                                 getLangOpts())
-                ->getEndLoc();
-        CharSourceRange StmtRange =
-            CharSourceRange::getCharRange(S->getBeginLoc(), SemiColonEnd);
-
-        Diag << FixItHint::CreateRemoval(StmtRange);
-      } else {
-        auto Diag =
-            diag(S->getBeginLoc(), "%0 should be initialized in a member"
-                                   " initializer of the constructor")
-            << Field;
-
-        bool AddComma = false;
-        if (!Ctor->getNumCtorInitializers() && FirstToCtorInits) {
-          SourceLocation BodyPos = Ctor->getBody()->getBeginLoc();
-          SourceLocation NextPos = Ctor->getBeginLoc();
-          do {
-            InsertPos = NextPos;
-            NextPos = Lexer::findNextToken(NextPos, *Result.SourceManager,
-                                           getLangOpts())
-                          ->getLocation();
-          } while (NextPos != BodyPos);
-          InsertPos = Lexer::getLocForEndOfToken(
-              InsertPos, 0, *Result.SourceManager, getLangOpts());
-
-          Diag << FixItHint::CreateInsertion(InsertPos, " : ");
-        } else {
-          bool Found = false;
-          for (const auto *Init : Ctor->inits()) {
-            if (Init->isMemberInitializer()) {
-              if (Result.SourceManager->isBeforeInTranslationUnit(
-                      Field->getLocation(), Init->getMember()->getLocation())) {
-                InsertPos = Init->getSourceLocation();
-                Found = true;
-                break;
-              }
-            }
-          }
-
-          if (!Found) {
-            if (Ctor->getNumCtorInitializers()) {
-              InsertPos = Lexer::getLocForEndOfToken(
-                  (*Ctor->init_rbegin())->getSourceRange().getEnd(), 0,
-                  *Result.SourceManager, getLangOpts());
-            }
-            Diag << FixItHint::CreateInsertion(InsertPos, ", ");
-          } else {
-            AddComma = true;
-          }
-        }
-        Diag << FixItHint::CreateInsertion(InsertPos, Field->getName())
-             << FixItHint::CreateInsertion(InsertPos, "(")
-             << FixItHint::CreateInsertionFromRange(
-                    InsertPos,
-                    CharSourceRange(InitValue->getSourceRange(), true))
-             << FixItHint::CreateInsertion(InsertPos, ")");
-        if (AddComma)
-          Diag << FixItHint::CreateInsertion(InsertPos, ", ");
-
-        SourceLocation SemiColonEnd =
-            Lexer::findNextToken(S->getEndLoc(), *Result.SourceManager,
-                                 getLangOpts())
-                ->getEndLoc();
-        CharSourceRange StmtRange =
-            CharSourceRange::getCharRange(S->getBeginLoc(), SemiColonEnd);
-
-        Diag << FixItHint::CreateRemoval(StmtRange);
-        FirstToCtorInits = false;
-      }
-    }
-  }
-}
-
-} // namespace cppcoreguidelines
-} // namespace tidy
-} // namespace clang
diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/PreferMemberInitializerCheck.h b/clang-tools-extra/clang-tidy/cppcoreguidelines/PreferMemberInitializerCheck.h
deleted file mode 100644
index dbef7c98d8e3..000000000000
--- a/clang-tools-extra/clang-tidy/cppcoreguidelines/PreferMemberInitializerCheck.h
+++ /dev/null
@@ -1,41 +0,0 @@
-//===--- PreferMemberInitializerCheck.h - clang-tidy ------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PREFERMEMBERINITIALIZERCHECK_H
-#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PREFERMEMBERINITIALIZERCHECK_H
-
-#include "../ClangTidyCheck.h"
-
-namespace clang {
-namespace tidy {
-namespace cppcoreguidelines {
-
-/// Finds member initializations in the constructor body which can be placed
-/// into the initialization list instead.
-///
-/// For the user-facing documentation see:
-/// http://clang.llvm.org/extra/clang-tidy/checks/cppcoreguidelines-prefer-member-initializer.html
-class PreferMemberInitializerCheck : public ClangTidyCheck {
-public:
-  PreferMemberInitializerCheck(StringRef Name, ClangTidyContext *Context);
-  bool isLanguageVersionSupported(const LangOptions &LangOpts) const override {
-    return LangOpts.CPlusPlus;
-  }
-  void storeOptions(ClangTidyOptions::OptionMap &Opts) override;
-  void registerMatchers(ast_matchers::MatchFinder *Finder) override;
-  void check(const ast_matchers::MatchFinder::MatchResult &Result) override;
-
-  const bool IsUseDefaultMemberInitEnabled;
-  const bool UseAssignment;
-};
-
-} // namespace cppcoreguidelines
-} // namespace tidy
-} // namespace clang
-
-#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PREFERMEMBERINITIALIZERCHECK_H
diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst
index 64b3d224ff6f..b3c9c829198b 100644
--- a/clang-tools-extra/docs/ReleaseNotes.rst
+++ b/clang-tools-extra/docs/ReleaseNotes.rst
@@ -290,12 +290,6 @@ New checks
   Finds structs that are inefficiently packed or aligned, and recommends
   packing and/or aligning of said structs as needed.
 
-- New :doc:`cppcoreguidelines-prefer-member-initializer
-  <clang-tidy/checks/cppcoreguidelines-prefer-member-initializer>` check.
-
-  Finds member initializations in the constructor body which can be placed into
-  the initialization list instead.
-
 - New :doc:`bugprone-misplaced-pointer-arithmetic-in-alloc
   <clang-tidy/checks/bugprone-misplaced-pointer-arithmetic-in-alloc>` check.
 
diff --git a/clang-tools-extra/docs/clang-tidy/checks/cppcoreguidelines-prefer-member-initializer.rst b/clang-tools-extra/docs/clang-tidy/checks/cppcoreguidelines-prefer-member-initializer.rst
deleted file mode 100644
index 5a5ee3e57a8c..000000000000
--- a/clang-tools-extra/docs/clang-tidy/checks/cppcoreguidelines-prefer-member-initializer.rst
+++ /dev/null
@@ -1,103 +0,0 @@
-.. title:: clang-tidy - cppcoreguidelines-prefer-member-initializer
-
-cppcoreguidelines-prefer-member-initializer
-===========================================
-
-Finds member initializations in the constructor body which can be  converted
-into member initializers of the constructor instead. This not only improves
-the readability of the code but also positively affects its performance.
-Class-member assignments inside a control statement or following the first
-control statement are ignored.
-
-This check implements `C.49 <https://github.com/isocpp/CppCoreGuidelines/blob/master/CppCoreGuidelines.md#c49-prefer-initialization-to-assignment-in-constructors>`_ from the CppCoreGuidelines.
-
-If the language version is `C++ 11` or above, the constructor is the default
-constructor of the class, the field is not a bitfield (only in case of earlier
-language version than `C++ 20`), furthermore the assigned value is a literal,
-negated literal or ``enum`` constant then the preferred place of the
-initialization is at the class member declaration.
-
-This latter rule is `C.48 <https://github.com/isocpp/CppCoreGuidelines/blob/master/CppCoreGuidelines.md#c48-prefer-in-class-initializers-to-member-initializers-in-constructors-for-constant-initializers>`_ from CppCoreGuidelines.
-
-Please note, that this check does not enforce this latter rule for
-initializations already implemented as member initializers. For that purpose
-see check `modernize-use-default-member-init <modernize-use-default-member-init.html>`_.
-
-Example 1
----------
-
-.. code-block:: c++
-
-  class C {
-    int n;
-    int m;
-  public:
-    C() {
-      n = 1; // Literal in default constructor
-      if (dice())
-        return;
-      m = 1;
-    }
-  };
-
-Here ``n`` can be initialized using a default member initializer, unlike
-``m``, as ``m``'s initialization follows a control statement (``if``):
-
-.. code-block:: c++
-
-  class C {
-    int n{1};
-    int m;
-  public:
-    C() {
-      if (dice())
-        return;
-      m = 1;
-    }
-
-Example 2
----------
-
-.. code-block:: c++
-
-  class C {
-    int n;
-    int m;
-  public:
-    C(int nn, int mm) {
-      n = nn; // Neither default constructor nor literal
-      if (dice())
-        return;
-      m = mm;
-    }
-  };
-
-Here ``n`` can be initialized in the constructor initialization list, unlike
-``m``, as ``m``'s initialization follows a control statement (``if``):
-
-.. code-block:: c++
-
-  C(int nn, int mm) : n(nn) {
-    if (dice())
-      return;
-    m = mm;
-  }
-
-.. option:: UseAssignment
-
-   If this option is set to `true` (default is `false`), the check will initialize
-   members with an assignment. In this case the fix of the first example looks
-   like this:
-
-.. code-block:: c++
-
-  class C {
-    int n = 1;
-    int m;
-  public:
-    C() {
-      if (dice())
-        return;
-      m = 1;
-    }
-  };
diff --git a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines-prefer-member-initializer-modernize-use-default-member-init-assignment.cpp b/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines-prefer-member-initializer-modernize-use-default-member-init-assignment.cpp
deleted file mode 100644
index dc6cb7606a0d..000000000000
--- a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines-prefer-member-initializer-modernize-use-default-member-init-assignment.cpp
+++ /dev/null
@@ -1,31 +0,0 @@
-// RUN: %check_clang_tidy %s cppcoreguidelines-prefer-member-initializer,modernize-use-default-member-init %t -- \
-// RUN: -config="{CheckOptions: [{key: modernize-use-default-member-init.UseAssignment, value: 1}]}"
-
-class Simple1 {
-  int n;
-  // CHECK-FIXES: int n = 0;
-  double x;
-  // CHECK-FIXES: double x = 0.0;
-
-public:
-  Simple1() {
-    n = 0;
-    // CHECK-MESSAGES: :[[@LINE-1]]:5: warning: 'n' should be initialized in an in-class default member initializer [cppcoreguidelines-prefer-member-initializer]
-    // CHECK-FIXES: {{^\ *$}}
-    x = 0.0;
-    // CHECK-MESSAGES: :[[@LINE-1]]:5: warning: 'x' should be initialized in an in-class default member initializer [cppcoreguidelines-prefer-member-initializer]
-    // CHECK-FIXES: {{^\ *$}}
-  }
-
-  Simple1(int nn, double xx) {
-    // CHECK-FIXES: Simple1(int nn, double xx) : n(nn), x(xx) {
-    n = nn;
-    // CHECK-MESSAGES: :[[@LINE-1]]:5: warning: 'n' should be initialized in a member initializer of the constructor [cppcoreguidelines-prefer-member-initializer]
-    // CHECK-FIXES: {{^\ *$}}
-    x = xx;
-    // CHECK-MESSAGES: :[[@LINE-1]]:5: warning: 'x' should be initialized in a member initializer of the constructor [cppcoreguidelines-prefer-member-initializer]
-    // CHECK-FIXES: {{^\ *$}}
-  }
-
-  ~Simple1() = default;
-};
diff --git a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines-prefer-member-initializer-modernize-use-default-member-init.cpp b/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines-prefer-member-initializer-modernize-use-default-member-init.cpp
deleted file mode 100644
index fe5bb7c3bb98..000000000000
--- a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines-prefer-member-initializer-modernize-use-default-member-init.cpp
+++ /dev/null
@@ -1,30 +0,0 @@
-// RUN: %check_clang_tidy %s cppcoreguidelines-prefer-member-initializer,modernize-use-default-member-init %t
-
-class Simple1 {
-  int n;
-  // CHECK-FIXES: int n{0};
-  double x;
-  // CHECK-FIXES: double x{0.0};
-
-public:
-  Simple1() {
-    n = 0;
-    // CHECK-MESSAGES: :[[@LINE-1]]:5: warning: 'n' should be initialized in an in-class default member initializer [cppcoreguidelines-prefer-member-initializer]
-    // CHECK-FIXES: {{^\ *$}}
-    x = 0.0;
-    // CHECK-MESSAGES: :[[@LINE-1]]:5: warning: 'x' should be initialized in an in-class default member initializer [cppcoreguidelines-prefer-member-initializer]
-    // CHECK-FIXES: {{^\ *$}}
-  }
-
-  Simple1(int nn, double xx) {
-    // CHECK-FIXES: Simple1(int nn, double xx) : n(nn), x(xx) {
-    n = nn;
-    // CHECK-MESSAGES: :[[@LINE-1]]:5: warning: 'n' should be initialized in a member initializer of the constructor [cppcoreguidelines-prefer-member-initializer]
-    // CHECK-FIXES: {{^\ *$}}
-    x = xx;
-    // CHECK-MESSAGES: :[[@LINE-1]]:5: warning: 'x' should be initialized in a member initializer of the constructor [cppcoreguidelines-prefer-member-initializer]
-    // CHECK-FIXES: {{^\ *$}}
-  }
-
-  ~Simple1() = default;
-};
diff --git a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines-prefer-member-initializer.cpp b/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines-prefer-member-initializer.cpp
deleted file mode 100644
index b5c04c32c9fa..000000000000
--- a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines-prefer-member-initializer.cpp
+++ /dev/null
@@ -1,490 +0,0 @@
-// RUN: %check_clang_tidy %s cppcoreguidelines-prefer-member-initializer %t -- -- -fcxx-exceptions
-
-extern void __assert_fail (__const char *__assertion, __const char *__file,
-    unsigned int __line, __const char *__function)
-     __attribute__ ((__noreturn__));
-#define assert(expr) \
-  ((expr)  ? (void)(0)  : __assert_fail (#expr, __FILE__, __LINE__, __func__))
-
-class Simple1 {
-  int n;
-  double x;
-
-public:
-  Simple1() {
-    // CHECK-FIXES: Simple1() : n(0), x(0.0) {
-    n = 0;
-    // CHECK-MESSAGES: :[[@LINE-1]]:5: warning: 'n' should be initialized in a member initializer of the constructor [cppcoreguidelines-prefer-member-initializer]
-    // CHECK-FIXES: {{^\ *$}}
-    x = 0.0;
-    // CHECK-MESSAGES: :[[@LINE-1]]:5: warning: 'x' should be initialized in a member initializer of the constructor [cppcoreguidelines-prefer-member-initializer]
-    // CHECK-FIXES: {{^\ *$}}
-  }
-
-  Simple1(int nn, double xx) {
-    // CHECK-FIXES: Simple1(int nn, double xx) : n(nn), x(xx) {
-    n = nn;
-    // CHECK-MESSAGES: :[[@LINE-1]]:5: warning: 'n' should be initialized in a member initializer of the constructor [cppcoreguidelines-prefer-member-initializer]
-    // CHECK-FIXES: {{^\ *$}}
-    x = xx;
-    // CHECK-MESSAGES: :[[@LINE-1]]:5: warning: 'x' should be initialized in a member initializer of the constructor [cppcoreguidelines-prefer-member-initializer]
-    // CHECK-FIXES: {{^\ *$}}
-  }
-
-  ~Simple1() = default;
-};
-
-class Simple2 {
-  int n;
-  double x;
-
-public:
-  Simple2() : n(0) {
-    // CHECK-FIXES: Simple2() : n(0), x(0.0) {
-    x = 0.0;
-    // CHECK-MESSAGES: :[[@LINE-1]]:5: warning: 'x' should be initialized in a member initializer of the constructor [cppcoreguidelines-prefer-member-initializer]
-    // CHECK-FIXES: {{^\ *$}}
-  }
-
-  Simple2(int nn, double xx) : n(nn) {
-    // CHECK-FIXES: Simple2(int nn, double xx) : n(nn), x(xx) {
-    x = xx;
-    // CHECK-MESSAGES: :[[@LINE-1]]:5: warning: 'x' should be initialized in a member initializer of the constructor [cppcoreguidelines-prefer-member-initializer]
-    // CHECK-FIXES: {{^\ *$}}
-  }
-
-  ~Simple2() = default;
-};
-
-class Simple3 {
-  int n;
-  double x;
-
-public:
-  Simple3() : x(0.0) {
-    // CHECK-FIXES: Simple3() : n(0), x(0.0) {
-    n = 0;
-    // CHECK-MESSAGES: :[[@LINE-1]]:5: warning: 'n' should be initialized in a member initializer of the constructor [cppcoreguidelines-prefer-member-initializer]
-    // CHECK-FIXES: {{^\ *$}}
-  }
-
-  Simple3(int nn, double xx) : x(xx) {
-    // CHECK-FIXES: Simple3(int nn, double xx) : n(nn), x(xx) {
-    n = nn;
-    // CHECK-MESSAGES: :[[@LINE-1]]:5: warning: 'n' should be initialized in a member initializer of the constructor [cppcoreguidelines-prefer-member-initializer]
-    // CHECK-FIXES: {{^\ *$}}
-  }
-
-  ~Simple3() = default;
-};
-
-int something_int();
-double something_double();
-
-class Simple4 {
-  int n;
-
-public:
-  Simple4() {
-    // CHECK-FIXES: Simple4() : n(something_int()) {
-    n = something_int();
-    // CHECK-MESSAGES: :[[@LINE-1]]:5: warning: 'n' should be initialized in a member initializer of the constructor [cppcoreguidelines-prefer-member-initializer]
-    // CHECK-FIXES: {{^\ *$}}
-  }
-
-  ~Simple4() = default;
-};
-
-static bool dice();
-
-class Complex1 {
-  int n;
-  int m;
-
-public:
-  Complex1() : n(0) {
-    if (dice())
-      m = 1;
-    // NO-MESSAGES: initialization of 'm' is nested in a conditional expression
-  }
-
-  ~Complex1() = default;
-};
-
-class Complex2 {
-  int n;
-  int m;
-
-public:
-  Complex2() : n(0) {
-    if (!dice())
-      return;
-    m = 1;
-    // NO-MESSAGES: initialization of 'm' follows a conditional expression
-  }
-
-  ~Complex2() = default;
-};
-
-class Complex3 {
-  int n;
-  int m;
-
-public:
-  Complex3() : n(0) {
-    while (dice())
-      m = 1;
-    // NO-MESSAGES: initialization of 'm' is nested in a conditional loop
-  }
-
-  ~Complex3() = default;
-};
-
-class Complex4 {
-  int n;
-  int m;
-
-public:
-  Complex4() : n(0) {
-    while (!dice())
-      return;
-    m = 1;
-    // NO-MESSAGES: initialization of 'm' follows a conditional loop
-  }
-
-  ~Complex4() = default;
-};
-
-class Complex5 {
-  int n;
-  int m;
-
-public:
-  Complex5() : n(0) {
-    do {
-      m = 1;
-      // NO-MESSAGES: initialization of 'm' is nested in a conditional loop
-    } while (dice());
-  }
-
-  ~Complex5() = default;
-};
-
-class Complex6 {
-  int n;
-  int m;
-
-public:
-  Complex6() : n(0) {
-    do {
-      return;
-    } while (!dice());
-    m = 1;
-    // NO-MESSAGES: initialization of 'm' follows a conditional loop
-  }
-
-  ~Complex6() = default;
-};
-
-class Complex7 {
-  int n;
-  int m;
-
-public:
-  Complex7() : n(0) {
-    for (int i = 2; i < 1; ++i) {
-      m = 1;
-    }
-    // NO-MESSAGES: initialization of 'm' is nested into a conditional loop
-  }
-
-  ~Complex7() = default;
-};
-
-class Complex8 {
-  int n;
-  int m;
-
-public:
-  Complex8() : n(0) {
-    for (int i = 0; i < 2; ++i) {
-      return;
-    }
-    m = 1;
-    // NO-MESSAGES: initialization of 'm' follows a conditional loop
-  }
-
-  ~Complex8() = default;
-};
-
-class Complex9 {
-  int n;
-  int m;
-
-public:
-  Complex9() : n(0) {
-    switch (dice()) {
-    case 1:
-      m = 1;
-      // NO-MESSAGES: initialization of 'm' is nested in a conditional expression
-      break;
-    default:
-      break;
-    }
-  }
-
-  ~Complex9() = default;
-};
-
-class Complex10 {
-  int n;
-  int m;
-
-public:
-  Complex10() : n(0) {
-    switch (dice()) {
-    case 1:
-      return;
-      break;
-    default:
-      break;
-    }
-    m = 1;
-    // NO-MESSAGES: initialization of 'm' follows a conditional expression
-  }
-
-  ~Complex10() = default;
-};
-
-class E {};
-int risky(); // may throw
-
-class Complex11 {
-  int n;
-  int m;
-
-public:
-  Complex11() : n(0) {
-    try {
-      risky();
-      m = 1;
-      // NO-MESSAGES: initialization of 'm' follows is nested in a try-block
-    } catch (const E& e) {
-      return;
-    }
-  }
-
-  ~Complex11() = default;
-};
-
-class Complex12 {
-  int n;
-  int m;
-
-public:
-  Complex12() : n(0) {
-    try {
-      risky();
-    } catch (const E& e) {
-      return;
-    }
-    m = 1;
-    // NO-MESSAGES: initialization of 'm' follows a try-block
-  }
-
-  ~Complex12() = default;
-};
-
-class Complex13 {
-  int n;
-  int m;
-
-public:
-  Complex13() : n(0) {
-    return;
-    m = 1;
-    // NO-MESSAGES: initialization of 'm' follows a return statement
-  }
-
-  ~Complex13() = default;
-};
-
-class Complex14 {
-  int n;
-  int m;
-
-public:
-  Complex14() : n(0) {
-    goto X;
-    m = 1;
-    // NO-MESSAGES: initialization of 'm' follows a goto statement
-  X:
-    ;
-  }
-
-  ~Complex14() = default;
-};
-
-void returning();
-
-class Complex15 {
-  int n;
-  int m;
-
-public:
-  Complex15() : n(0) {
-    // CHECK-FIXES: Complex15() : n(0), m(1) {
-    returning();
-    m = 1;
-    // CHECK-MESSAGES: :[[@LINE-1]]:5: warning: 'm' should be initialized in a member initializer of the constructor [cppcoreguidelines-prefer-member-initializer]
-    // CHECK-FIXES: {{^\ *$}}
-  }
-
-  ~Complex15() = default;
-};
-
-[[noreturn]] void not_returning();
-
-class Complex16 {
-  int n;
-  int m;
-
-public:
-  Complex16() : n(0) {
-    not_returning();
-    m = 1;
-    // NO-MESSAGES: initialization of 'm' follows a non-returning function call
-  }
-
-  ~Complex16() = default;
-};
-
-class Complex17 {
-  int n;
-  int m;
-
-public:
-  Complex17() : n(0) {
-    throw 1;
-    m = 1;
-    // NO-MESSAGES: initialization of 'm' follows a 'throw' statement;
-  }
-
-  ~Complex17() = default;
-};
-
-class Complex18 {
-  int n;
-
-public:
-  Complex18() try {
-    n = risky();
-    // NO-MESSAGES: initialization of 'n' in a 'try' body;
-  } catch (const E& e) {
-    n = 0;
-  }
-
-  ~Complex18() = default;
-};
-
-class Complex19 {
-  int n;
-public:
-  Complex19() {
-    // CHECK-FIXES: Complex19() : n(0) {
-    n = 0;
-    // CHECK-MESSAGES: :[[@LINE-1]]:5: warning: 'n' should be initialized in a member initializer of the constructor [cppcoreguidelines-prefer-member-initializer]
-    // CHECK-FIXES: {{^\ *$}}
-  }
-
-  explicit Complex19(int) {
-    // CHECK-FIXES: Complex19(int) : n(12) {
-    n = 12;
-    // CHECK-MESSAGES: :[[@LINE-1]]:5: warning: 'n' should be initialized in a member initializer of the constructor [cppcoreguidelines-prefer-member-initializer]
-    // CHECK-FIXES: {{^\ *$}}
-  }
-
-  ~Complex19() = default;
-};
-
-class Complex20 {
-  int n;
-  int m;
-
-public:
-  Complex20(int k) : n(0) {
-    assert(k > 0);
-    m = 1;
-    // NO-MESSAGES: initialization of 'm' follows an assertion
-  }
-
-  ~Complex20() = default;
-};
-
-class VeryComplex1 {
-  int n1, n2, n3;
-  double x1, x2, x3;
-  int n4, n5, n6;
-  double x4, x5, x6;
-
-  VeryComplex1() : n3(something_int()), x3(something_double()),
-                   n5(something_int()), x4(something_double()),
-                   x5(something_double()) {
-    // CHECK-FIXES: VeryComplex1() : n2(something_int()), n1(something_int()), n3(something_int()), x2(something_double()), x1(something_double()), x3(something_double()),
-    // CHECK-FIXES:                  n4(something_int()), n5(something_int()), n6(something_int()), x4(something_double()),
-    // CHECK-FIXES:                  x5(something_double()), x6(something_double()) {
-
-// FIXME: Order of elements on the constructor initializer list should match
-//        the order of the declaration of the fields. Thus the correct fixes
-//        should look like these:
-//
-    // C ECK-FIXES: VeryComplex1() : n2(something_int()), n1(something_int()), n3(something_int()), x2(something_double()), x1(something_double()), x3(something_double()),
-    // C ECK-FIXES:                  n4(something_int()), n5(something_int()), n6(something_int()), x4(something_double()),
-    // C ECK-FIXES:                  x5(something_double()), x6(something_double()) {
-//
-//        However, the Diagnostics Engine processes fixes in the order of the
-//        diagnostics and insertions to the same position are handled in left to
-//        right order thus in the case two adjacent fields are initialized
-//        inside the constructor in reverse order the provided fix is a
-//        constructor initializer list that does not match the order of the
-//        declaration of the fields.
-
-    x2 = something_double();
-    // CHECK-MESSAGES: :[[@LINE-1]]:5: warning: 'x2' should be initialized in a member initializer of the constructor [cppcoreguidelines-prefer-member-initializer]
-    // CHECK-FIXES: {{^\ *$}}
-    n2 = something_int();
-    // CHECK-MESSAGES: :[[@LINE-1]]:5: warning: 'n2' should be initialized in a member initializer of the constructor [cppcoreguidelines-prefer-member-initializer]
-    // CHECK-FIXES: {{^\ *$}}
-    x6 = something_double();
-    // CHECK-MESSAGES: :[[@LINE-1]]:5: warning: 'x6' should be initialized in a member initializer of the constructor [cppcoreguidelines-prefer-member-initializer]
-    // CHECK-FIXES: {{^\ *$}}
-    x1 = something_double();
-    // CHECK-MESSAGES: :[[@LINE-1]]:5: warning: 'x1' should be initialized in a member initializer of the constructor [cppcoreguidelines-prefer-member-initializer]
-    // CHECK-FIXES: {{^\ *$}}
-    n6 = something_int();
-    // CHECK-MESSAGES: :[[@LINE-1]]:5: warning: 'n6' should be initialized in a member initializer of the constructor [cppcoreguidelines-prefer-member-initializer]
-    // CHECK-FIXES: {{^\ *$}}
-    n1 = something_int();
-    // CHECK-MESSAGES: :[[@LINE-1]]:5: warning: 'n1' should be initialized in a member initializer of the constructor [cppcoreguidelines-prefer-member-initializer]
-    // CHECK-FIXES: {{^\ *$}}
-    n4 = something_int();
-    // CHECK-MESSAGES: :[[@LINE-1]]:5: warning: 'n4' should be initialized in a member initializer of the constructor [cppcoreguidelines-prefer-member-initializer]
-    // CHECK-FIXES: {{^\ *$}}
-  }
-};
-
-struct Outside {
-  int n;
-  double x;
-  Outside();
-};
-
-Outside::Outside() {
-    // CHECK-FIXES: Outside::Outside() : n(1), x(1.0) {
-  n = 1;
-    // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: 'n' should be initialized in a member initializer of the constructor [cppcoreguidelines-prefer-member-initializer]
-    // CHECK-FIXES: {{^\ *$}}
-  x = 1.0;
-    // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: 'x' should be initialized in a member initializer of the constructor [cppcoreguidelines-prefer-member-initializer]
-    // CHECK-FIXES: {{^\ *$}}
-}

From da7fa7457800394d610e8cbd6befe7bc944ca7d0 Mon Sep 17 00:00:00 2001
From: Nikita Popov <nikita.ppv@gmail.com>
Date: Sun, 21 Feb 2021 16:24:18 +0100
Subject: [PATCH 162/318] [JumpThreading] Clone noalias.scope.decl when
 threading blocks

When cloning instructions during jump threading, also clone and
adapt any declared scopes. This is primarily important when
threading loop exits, because we'll end up with two dominating
scope declarations in that case (at least after additional loop
rotation). This addresses a loose thread from
https://reviews.llvm.org/rG2556b413a7b8#975012.

Differential Revision: https://reviews.llvm.org/D97154

(cherry picked from commit 5e7e499b912d2c9ebaa91b5783ca123dbedeabcc)
---
 llvm/include/llvm/Transforms/Utils/Cloning.h  |  7 +++
 llvm/lib/Transforms/Scalar/JumpThreading.cpp  | 10 +++
 llvm/lib/Transforms/Utils/CloneFunction.cpp   |  8 +++
 .../JumpThreading/noalias-scope-decl.ll       | 63 +++++++++++++++++++
 4 files changed, 88 insertions(+)
 create mode 100644 llvm/test/Transforms/JumpThreading/noalias-scope-decl.ll

diff --git a/llvm/include/llvm/Transforms/Utils/Cloning.h b/llvm/include/llvm/Transforms/Utils/Cloning.h
index 56aaa5d48e2a..aa960c625630 100644
--- a/llvm/include/llvm/Transforms/Utils/Cloning.h
+++ b/llvm/include/llvm/Transforms/Utils/Cloning.h
@@ -274,6 +274,13 @@ void updateProfileCallee(
 void identifyNoAliasScopesToClone(
     ArrayRef<BasicBlock *> BBs, SmallVectorImpl<MDNode *> &NoAliasDeclScopes);
 
+/// Find the 'llvm.experimental.noalias.scope.decl' intrinsics in the specified
+/// instruction range and extract their scope. These are candidates for
+/// duplication when cloning.
+void identifyNoAliasScopesToClone(
+    BasicBlock::iterator Start, BasicBlock::iterator End,
+    SmallVectorImpl<MDNode *> &NoAliasDeclScopes);
+
 /// Duplicate the specified list of noalias decl scopes.
 /// The 'Ext' string is added as an extension to the name.
 /// Afterwards, the ClonedScopes contains the mapping of the original scope
diff --git a/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/llvm/lib/Transforms/Scalar/JumpThreading.cpp
index 96aef90c1c1a..10b08b4e2224 100644
--- a/llvm/lib/Transforms/Scalar/JumpThreading.cpp
+++ b/llvm/lib/Transforms/Scalar/JumpThreading.cpp
@@ -2076,6 +2076,15 @@ JumpThreadingPass::cloneInstructions(BasicBlock::iterator BI,
     ValueMapping[PN] = NewPN;
   }
 
+  // Clone noalias scope declarations in the threaded block. When threading a
+  // loop exit, we would otherwise end up with two idential scope declarations
+  // visible at the same time.
+  SmallVector<MDNode *> NoAliasScopes;
+  DenseMap<MDNode *, MDNode *> ClonedScopes;
+  LLVMContext &Context = PredBB->getContext();
+  identifyNoAliasScopesToClone(BI, BE, NoAliasScopes);
+  cloneNoAliasScopes(NoAliasScopes, ClonedScopes, "thread", Context);
+
   // Clone the non-phi instructions of the source basic block into NewBB,
   // keeping track of the mapping and using it to remap operands in the cloned
   // instructions.
@@ -2084,6 +2093,7 @@ JumpThreadingPass::cloneInstructions(BasicBlock::iterator BI,
     New->setName(BI->getName());
     NewBB->getInstList().push_back(New);
     ValueMapping[&*BI] = New;
+    adaptNoAliasScopes(New, ClonedScopes, Context);
 
     // Remap operands to patch up intra-block references.
     for (unsigned i = 0, e = New->getNumOperands(); i != e; ++i)
diff --git a/llvm/lib/Transforms/Utils/CloneFunction.cpp b/llvm/lib/Transforms/Utils/CloneFunction.cpp
index 51a49574e55d..6ab061510a60 100644
--- a/llvm/lib/Transforms/Utils/CloneFunction.cpp
+++ b/llvm/lib/Transforms/Utils/CloneFunction.cpp
@@ -989,3 +989,11 @@ void llvm::identifyNoAliasScopesToClone(
       if (auto *Decl = dyn_cast<NoAliasScopeDeclInst>(&I))
         NoAliasDeclScopes.push_back(Decl->getScopeList());
 }
+
+void llvm::identifyNoAliasScopesToClone(
+    BasicBlock::iterator Start, BasicBlock::iterator End,
+    SmallVectorImpl<MDNode *> &NoAliasDeclScopes) {
+  for (Instruction &I : make_range(Start, End))
+    if (auto *Decl = dyn_cast<NoAliasScopeDeclInst>(&I))
+      NoAliasDeclScopes.push_back(Decl->getScopeList());
+}
diff --git a/llvm/test/Transforms/JumpThreading/noalias-scope-decl.ll b/llvm/test/Transforms/JumpThreading/noalias-scope-decl.ll
new file mode 100644
index 000000000000..b032afaaf313
--- /dev/null
+++ b/llvm/test/Transforms/JumpThreading/noalias-scope-decl.ll
@@ -0,0 +1,63 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -jump-threading < %s | FileCheck %s
+
+define void @test(i8* %ptr) {
+; CHECK-LABEL: @test(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata !0)
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[I_INC:%.*]], [[LATCH:%.*]] ]
+; CHECK-NEXT:    [[C:%.*]] = icmp eq i32 [[I]], 100
+; CHECK-NEXT:    br i1 [[C]], label [[EXIT:%.*]], label [[LATCH]]
+; CHECK:       latch:
+; CHECK-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata !3)
+; CHECK-NEXT:    store i8 0, i8* [[PTR:%.*]], align 1, !noalias !0
+; CHECK-NEXT:    store i8 1, i8* [[PTR]], align 1, !noalias !3
+; CHECK-NEXT:    [[I_INC]] = add i32 [[I]], 1
+; CHECK-NEXT:    br label [[LOOP]]
+; CHECK:       exit:
+; CHECK-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata !5)
+; CHECK-NEXT:    store i8 0, i8* [[PTR]], align 1, !noalias !0
+; CHECK-NEXT:    store i8 1, i8* [[PTR]], align 1, !noalias !5
+; CHECK-NEXT:    ret void
+;
+entry:
+  call void @llvm.experimental.noalias.scope.decl(metadata !0)
+  br label %loop
+
+loop:
+  %i = phi i32 [ 0, %entry ], [ %i.inc, %latch ]
+  %c = icmp eq i32 %i, 100
+  br i1 %c, label %if, label %latch
+
+if:
+  br label %latch
+
+latch:
+  %p = phi i1 [ true, %if ], [ false, %loop ]
+  call void @llvm.experimental.noalias.scope.decl(metadata !3)
+  store i8 0, i8* %ptr, !noalias !0
+  store i8 1, i8* %ptr, !noalias !3
+  %i.inc = add i32 %i, 1
+  br i1 %p, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+declare void @llvm.experimental.noalias.scope.decl(metadata)
+
+!0 = !{!1}
+!1 = distinct !{!1, !2, !"scope1"}
+!2 = distinct !{!2, !"domain"}
+!3 = !{!4}
+!4 = distinct !{!4, !2, !"scope2"}
+
+; CHECK: !0 = !{!1}
+; CHECK: !1 = distinct !{!1, !2, !"scope1"}
+; CHECK: !2 = distinct !{!2, !"domain"}
+; CHECK: !3 = !{!4}
+; CHECK: !4 = distinct !{!4, !2, !"scope2"}
+; CHECK: !5 = !{!6}
+; CHECK: !6 = distinct !{!6, !2, !"scope2:thread"}

From a92ceea91116e7b95d23eff634507fa2cff86ef2 Mon Sep 17 00:00:00 2001
From: Tom Stellard <tstellar@redhat.com>
Date: Mon, 22 Feb 2021 17:35:09 -0800
Subject: [PATCH 163/318] Revert "[llvm-cov] reset executation count to 0 after
 wrapped segment"

This reverts commit e3df9471750935876bd2bf7da93ccf0eacca8592.

This commit caused regressions in coverage generation for both Rust and
Swift.  We're reverting this in the release/12.x branch until we have
a proper fix in trunk.

http://llvm.org/PR49297
---
 llvm/lib/ProfileData/Coverage/CoverageMapping.cpp   | 1 -
 llvm/test/tools/llvm-cov/Inputs/instrprof-comdat.h  | 2 +-
 llvm/test/tools/llvm-cov/ignore-filename-regex.test | 4 ++--
 llvm/unittests/ProfileData/CoverageMappingTest.cpp  | 2 +-
 4 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp b/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp
index a8cc308b4e3a..cdbcde50d33a 100644
--- a/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp
+++ b/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp
@@ -794,7 +794,6 @@ LineCoverageStats::LineCoverageStats(
     ExecutionCount = WrappedSegment->Count;
   if (!MinRegionCount)
     return;
-  ExecutionCount = 0;
   for (const auto *LS : LineSegments)
     if (isStartOfRegion(LS))
       ExecutionCount = std::max(ExecutionCount, LS->Count);
diff --git a/llvm/test/tools/llvm-cov/Inputs/instrprof-comdat.h b/llvm/test/tools/llvm-cov/Inputs/instrprof-comdat.h
index d224fd0d00ea..07941f9bb497 100644
--- a/llvm/test/tools/llvm-cov/Inputs/instrprof-comdat.h
+++ b/llvm/test/tools/llvm-cov/Inputs/instrprof-comdat.h
@@ -12,7 +12,7 @@ template <class T> T FOO<T>::DoIt(T ti) { // HEADER: [[@LINE]]|  2|template
   for (T I = 0; I < ti; I++) {            // HEADER: [[@LINE]]| 22|  for (T
     t += I;                               // HEADER: [[@LINE]]| 20|   t += I;
     if (I > ti / 2)                       // HEADER: [[@LINE]]| 20|   if (I > ti
-      t -= 1;                             // HEADER: [[@LINE]]|  8|     t -= 1;
+      t -= 1;                             // HEADER: [[@LINE]]| 20|     t -= 1;
   }                                       // HEADER: [[@LINE]]| 20| }
                                           // HEADER: [[@LINE]]|  2|
   return t;                               // HEADER: [[@LINE]]|  2|  return t;
diff --git a/llvm/test/tools/llvm-cov/ignore-filename-regex.test b/llvm/test/tools/llvm-cov/ignore-filename-regex.test
index efc4cda4abc0..aea9e4646776 100644
--- a/llvm/test/tools/llvm-cov/ignore-filename-regex.test
+++ b/llvm/test/tools/llvm-cov/ignore-filename-regex.test
@@ -22,7 +22,7 @@ REPORT_IGNORE_DIR-NOT: {{.*}}extra{{[/\\]}}dec.h{{.*}}
 REPORT_IGNORE_DIR-NOT: {{.*}}extra{{[/\\]}}inc.h{{.*}}
 REPORT_IGNORE_DIR: {{.*}}abs.h{{.*}}
 REPORT_IGNORE_DIR: {{.*}}main.cc{{.*}}
-REPORT_IGNORE_DIR: {{^}}TOTAL 5{{.*}}90.00%{{$}}
+REPORT_IGNORE_DIR: {{^}}TOTAL 5{{.*}}100.00%{{$}}
 
 # Ignore all files from "extra" directory even when SOURCES specified.
 RUN: llvm-cov report -instr-profile %S/Inputs/sources_specified/main.profdata \
@@ -35,7 +35,7 @@ REPORT_IGNORE_DIR_WITH_SOURCES-NOT: {{.*}}extra{{[/\\]}}dec.h{{.*}}
 REPORT_IGNORE_DIR_WITH_SOURCES-NOT: {{.*}}extra{{[/\\]}}inc.h{{.*}}
 REPORT_IGNORE_DIR_WITH_SOURCES-NOT: {{.*}}main.cc{{.*}}
 REPORT_IGNORE_DIR_WITH_SOURCES: {{.*}}abs.h{{.*}}
-REPORT_IGNORE_DIR_WITH_SOURCES: {{^}}TOTAL 4{{.*}}80.00%{{$}}
+REPORT_IGNORE_DIR_WITH_SOURCES: {{^}}TOTAL 4{{.*}}100.00%{{$}}
 
 ########################
 # Test "show" command.
diff --git a/llvm/unittests/ProfileData/CoverageMappingTest.cpp b/llvm/unittests/ProfileData/CoverageMappingTest.cpp
index 43386d23883e..4854b7f1454c 100644
--- a/llvm/unittests/ProfileData/CoverageMappingTest.cpp
+++ b/llvm/unittests/ProfileData/CoverageMappingTest.cpp
@@ -675,7 +675,7 @@ TEST_P(CoverageMappingTest, test_line_coverage_iterator) {
   CoverageData Data = LoadedCoverage->getCoverageForFile("file1");
 
   unsigned Line = 0;
-  unsigned LineCounts[] = {20, 20, 20, 20, 10, 10, 10, 10, 10, 0, 0};
+  unsigned LineCounts[] = {20, 20, 20, 20, 30, 10, 10, 10, 10, 0, 0};
   for (const auto &LCS : getLineCoverageStats(Data)) {
     ASSERT_EQ(Line + 1, LCS.getLine());
     errs() << "Line: " << Line + 1 << ", count = " << LCS.getExecutionCount() << "\n";

From 99df95fd910becbcf89dd6f17f1e259353a72d27 Mon Sep 17 00:00:00 2001
From: Kadir Cetinkaya <kadircet@google.com>
Date: Wed, 3 Feb 2021 12:45:46 +0100
Subject: [PATCH 164/318] [clang][CodeComplete] Fix crash on ParenListExprs

Fixes https://github.com/clangd/clangd/issues/676.

Differential Revision: https://reviews.llvm.org/D95935
---
 clang/lib/Sema/SemaCodeComplete.cpp            | 18 ++++++++++++++++--
 .../test/CodeCompletion/function-overloads.cpp |  6 ++++++
 clang/test/CodeCompletion/member-access.c      |  7 +++++++
 clang/unittests/Sema/CodeCompleteTest.cpp      |  1 +
 4 files changed, 30 insertions(+), 2 deletions(-)

diff --git a/clang/lib/Sema/SemaCodeComplete.cpp b/clang/lib/Sema/SemaCodeComplete.cpp
index c2785fd60fc2..40ea0f5d24b3 100644
--- a/clang/lib/Sema/SemaCodeComplete.cpp
+++ b/clang/lib/Sema/SemaCodeComplete.cpp
@@ -5168,6 +5168,15 @@ void Sema::CodeCompleteMemberReferenceExpr(Scope *S, Expr *Base,
   if (!Base || !CodeCompleter)
     return;
 
+  // Peel off the ParenListExpr by chosing the last one, as they don't have a
+  // predefined type.
+  if (auto *PLE = llvm::dyn_cast<ParenListExpr>(Base))
+    Base = PLE->getExpr(PLE->getNumExprs() - 1);
+  if (OtherOpBase) {
+    if (auto *PLE = llvm::dyn_cast<ParenListExpr>(OtherOpBase))
+      OtherOpBase = PLE->getExpr(PLE->getNumExprs() - 1);
+  }
+
   ExprResult ConvertedBase = PerformMemberExprBaseConversion(Base, IsArrow);
   if (ConvertedBase.isInvalid())
     return;
@@ -5597,12 +5606,17 @@ ProduceSignatureHelp(Sema &SemaRef, Scope *S,
 QualType Sema::ProduceCallSignatureHelp(Scope *S, Expr *Fn,
                                         ArrayRef<Expr *> Args,
                                         SourceLocation OpenParLoc) {
-  if (!CodeCompleter)
+  if (!CodeCompleter || !Fn)
     return QualType();
 
+  // If we have a ParenListExpr for LHS, peel it off by chosing the last expr.
+  // As ParenListExprs don't have a predefined type.
+  if (auto *PLE = llvm::dyn_cast<ParenListExpr>(Fn))
+    Fn = PLE->getExpr(PLE->getNumExprs() - 1);
+
   // FIXME: Provide support for variadic template functions.
   // Ignore type-dependent call expressions entirely.
-  if (!Fn || Fn->isTypeDependent() || anyNullArguments(Args))
+  if (Fn->isTypeDependent() || anyNullArguments(Args))
     return QualType();
   // In presence of dependent args we surface all possible signatures using the
   // non-dependent args in the prefix. Afterwards we do a post filtering to make
diff --git a/clang/test/CodeCompletion/function-overloads.cpp b/clang/test/CodeCompletion/function-overloads.cpp
index 11c864c28107..7b8ccef1d580 100644
--- a/clang/test/CodeCompletion/function-overloads.cpp
+++ b/clang/test/CodeCompletion/function-overloads.cpp
@@ -21,6 +21,8 @@ namespace NS {
 void test_adl() {
   NS::X x;
   g(x, x);
+  (void)(f)(1, 2, 3);
+  (void)(test, test, test, f)(1, 2, 3);
 }
 
 // RUN: %clang_cc1 -fsyntax-only -code-completion-at=%s:10:9 %s -o - | FileCheck -check-prefix=CHECK-CC1 %s
@@ -31,6 +33,10 @@ void test_adl() {
 // RUN: %clang_cc1 -fsyntax-only -code-completion-at=%s:10:21 %s -o - | FileCheck -check-prefix=CHECK-CC4 %s
 // RUN: %clang_cc1 -fsyntax-only -code-completion-at=%s:23:7 %s -o - | \
 // RUN:    FileCheck -check-prefix=CHECK-CC5 %s
+// RUN: %clang_cc1 -fsyntax-only -code-completion-at=%s:24:13 %s -o - | \
+// RUN:    FileCheck -check-prefix=CHECK-CC1 %s
+// RUN: %clang_cc1 -fsyntax-only -code-completion-at=%s:25:31 %s -o - | \
+// RUN:    FileCheck -check-prefix=CHECK-CC1 %s
 // CHECK-CC1: OVERLOAD: [#int#]f(<#float x#>, float y)
 // CHECK-CC1: OVERLOAD: [#int#]f(<#int i#>)
 // CHECK-CC1-NOT, CHECK-CC2-NOT: OVERLOAD: A(
diff --git a/clang/test/CodeCompletion/member-access.c b/clang/test/CodeCompletion/member-access.c
index 72afbf2ff947..545349f71731 100644
--- a/clang/test/CodeCompletion/member-access.c
+++ b/clang/test/CodeCompletion/member-access.c
@@ -29,3 +29,10 @@ void test3(struct Point2 *p) {
 
 // RUN: %clang_cc1 -fsyntax-only -code-completion-with-fixits -code-completion-at=%s:24:5 %s -o - | FileCheck -check-prefix=CHECK-CC3 %s
 // CHECK-CC3: x (requires fix-it: {24:4-24:5} to "->")
+
+void test4(struct Point *p) {
+  (int)(p)->x;
+  (int)(0,1,2,3,4,p)->x;
+}
+// RUN: %clang_cc1 -fsyntax-only -code-completion-with-fixits -code-completion-at=%s:34:13 %s -o - | FileCheck -check-prefix=CHECK-CC1 %s
+// RUN: %clang_cc1 -fsyntax-only -code-completion-with-fixits -code-completion-at=%s:35:23 %s -o - | FileCheck -check-prefix=CHECK-CC1 %s
diff --git a/clang/unittests/Sema/CodeCompleteTest.cpp b/clang/unittests/Sema/CodeCompleteTest.cpp
index d8b303d77bb9..dae0793658c5 100644
--- a/clang/unittests/Sema/CodeCompleteTest.cpp
+++ b/clang/unittests/Sema/CodeCompleteTest.cpp
@@ -488,6 +488,7 @@ TEST(PreferredTypeTest, NoCrashOnInvalidTypes) {
     auto y = new decltype(&1)(^);
     // GNU decimal type extension is not supported in clang.
     auto z = new _Decimal128(^);
+    void foo() { (void)(foo)(^); }
   )cpp";
   EXPECT_THAT(collectPreferredTypes(Code), Each("NULL TYPE"));
 }

From 7fc6c60608e416e7f8f5c194768c6dd511449c1b Mon Sep 17 00:00:00 2001
From: Kadir Cetinkaya <kadircet@google.com>
Date: Thu, 18 Feb 2021 13:48:43 +0100
Subject: [PATCH 165/318] [clang][CodeComplete] Ensure there are no crashes
 when completing with ParenListExprs as LHS

Differential Revision: https://reviews.llvm.org/D96950
---
 clang/lib/Sema/SemaCodeComplete.cpp | 31 ++++++++++++++++-------------
 1 file changed, 17 insertions(+), 14 deletions(-)

diff --git a/clang/lib/Sema/SemaCodeComplete.cpp b/clang/lib/Sema/SemaCodeComplete.cpp
index 40ea0f5d24b3..be04970979b3 100644
--- a/clang/lib/Sema/SemaCodeComplete.cpp
+++ b/clang/lib/Sema/SemaCodeComplete.cpp
@@ -5158,6 +5158,20 @@ class ConceptInfo {
 
   llvm::DenseMap<const IdentifierInfo *, Member> Results;
 };
+
+// If \p Base is ParenListExpr, assume a chain of comma operators and pick the
+// last expr. We expect other ParenListExprs to be resolved to e.g. constructor
+// calls before here. (So the ParenListExpr should be nonempty, but check just
+// in case)
+Expr *unwrapParenList(Expr *Base) {
+  if (auto *PLE = llvm::dyn_cast_or_null<ParenListExpr>(Base)) {
+    if (PLE->getNumExprs() == 0)
+      return nullptr;
+    Base = PLE->getExpr(PLE->getNumExprs() - 1);
+  }
+  return Base;
+}
+
 } // namespace
 
 void Sema::CodeCompleteMemberReferenceExpr(Scope *S, Expr *Base,
@@ -5165,18 +5179,11 @@ void Sema::CodeCompleteMemberReferenceExpr(Scope *S, Expr *Base,
                                            SourceLocation OpLoc, bool IsArrow,
                                            bool IsBaseExprStatement,
                                            QualType PreferredType) {
+  Base = unwrapParenList(Base);
+  OtherOpBase = unwrapParenList(OtherOpBase);
   if (!Base || !CodeCompleter)
     return;
 
-  // Peel off the ParenListExpr by chosing the last one, as they don't have a
-  // predefined type.
-  if (auto *PLE = llvm::dyn_cast<ParenListExpr>(Base))
-    Base = PLE->getExpr(PLE->getNumExprs() - 1);
-  if (OtherOpBase) {
-    if (auto *PLE = llvm::dyn_cast<ParenListExpr>(OtherOpBase))
-      OtherOpBase = PLE->getExpr(PLE->getNumExprs() - 1);
-  }
-
   ExprResult ConvertedBase = PerformMemberExprBaseConversion(Base, IsArrow);
   if (ConvertedBase.isInvalid())
     return;
@@ -5606,14 +5613,10 @@ ProduceSignatureHelp(Sema &SemaRef, Scope *S,
 QualType Sema::ProduceCallSignatureHelp(Scope *S, Expr *Fn,
                                         ArrayRef<Expr *> Args,
                                         SourceLocation OpenParLoc) {
+  Fn = unwrapParenList(Fn);
   if (!CodeCompleter || !Fn)
     return QualType();
 
-  // If we have a ParenListExpr for LHS, peel it off by chosing the last expr.
-  // As ParenListExprs don't have a predefined type.
-  if (auto *PLE = llvm::dyn_cast<ParenListExpr>(Fn))
-    Fn = PLE->getExpr(PLE->getNumExprs() - 1);
-
   // FIXME: Provide support for variadic template functions.
   // Ignore type-dependent call expressions entirely.
   if (Fn->isTypeDependent() || anyNullArguments(Args))

From 1c0a0c727eaeee7d7283f9dabe861e69881764c4 Mon Sep 17 00:00:00 2001
From: Tom Stellard <tstellar@redhat.com>
Date: Tue, 23 Feb 2021 11:12:50 -0800
Subject: [PATCH 166/318] [12.0.0][llvm-symbolizer][test] Fix test broken after
 cherry-pick

See bug https://bugs.llvm.org/show_bug.cgi?id=49227. The cherry-pick 0d4f8a3f364f introduced a test failure, as the test included use of a feature that was only recently added to lit and isn't in the release branch. This patch fixes up the test to manage without this lit change.

Reviewed By: tstellar, MaskRay

Differential Revision: https://reviews.llvm.org/D97272
---
 llvm/test/tools/llvm-symbolizer/output-style-inlined.test | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/llvm/test/tools/llvm-symbolizer/output-style-inlined.test b/llvm/test/tools/llvm-symbolizer/output-style-inlined.test
index 1b8e3a2f22fb..103c2afc176e 100644
--- a/llvm/test/tools/llvm-symbolizer/output-style-inlined.test
+++ b/llvm/test/tools/llvm-symbolizer/output-style-inlined.test
@@ -33,17 +33,17 @@ GNU: inctwo
 ## is specified, but a file doesn't exist. Check we report an error.
 
 RUN: llvm-symbolizer --output-style=GNU --obj=%p/Inputs/not.exist 0x1 0x2 --no-inlines 2>&1 \
-RUN:   | FileCheck %s --check-prefix=NOT-EXIST-GNU -DMSG=%errc_ENOENT
+RUN:   | FileCheck %s --check-prefix=NOT-EXIST-GNU
 RUN: llvm-symbolizer --output-style=LLVM --obj=%p/Inputs/not.exist 0x1 0x2 --no-inlines 2>&1 \
-RUN:   | FileCheck %s --check-prefix=NOT-EXIST-LLVM -DMSG=%errc_ENOENT
+RUN:   | FileCheck %s --check-prefix=NOT-EXIST-LLVM
 
-# NOT-EXIST-GNU:      LLVMSymbolizer: error reading file: [[MSG]]
+# NOT-EXIST-GNU:      LLVMSymbolizer: error reading file: {{[Nn]}}o such file or directory
 # NOT-EXIST-GNU-NEXT: ??
 # NOT-EXIST-GNU-NEXT: ??:0
 # NOT-EXIST-GNU-NEXT: ??
 # NOT-EXIST-GNU-NEXT: ??:0
 
-# NOT-EXIST-LLVM:       LLVMSymbolizer: error reading file: [[MSG]]
+# NOT-EXIST-LLVM:       LLVMSymbolizer: error reading file: {{[Nn]}}o such file or directory
 # NOT-EXIST-LLVM-NEXT:  ??
 # NOT-EXIST-LLVM-NEXT:  ??:0:0
 # NOT-EXIST-LLVM-EMPTY:

From eccac5a8aec92c995f0f8ef090ba4142e0334b46 Mon Sep 17 00:00:00 2001
From: Andy Kaylor <andrew.kaylor@intel.com>
Date: Wed, 3 Feb 2021 18:16:04 -0800
Subject: [PATCH 167/318] Add auto-upgrade support for annotation intrinsics

The llvm.ptr.annotation and llvm.var.annotation intrinsics were changed
since the 11.0 release to add an additional parameter. This patch
auto-upgrades IR containing the four-parameter versions of these
intrinsics, adding a null pointer as the fifth argument.

Differential Revision: https://reviews.llvm.org/D95993

(cherry picked from commit 9a827906cb95e7c3ae94627558da67b47ffde249)
---
 llvm/lib/IR/AutoUpgrade.cpp                   |  42 ++++++++++++++++
 llvm/test/Bitcode/upgrade-ptr-annotation.ll   |  45 ++++++++++++++++++
 .../test/Bitcode/upgrade-ptr-annotation.ll.bc | Bin 0 -> 1524 bytes
 llvm/test/Bitcode/upgrade-var-annotation.ll   |  15 ++++++
 .../test/Bitcode/upgrade-var-annotation.ll.bc | Bin 0 -> 1232 bytes
 5 files changed, 102 insertions(+)
 create mode 100644 llvm/test/Bitcode/upgrade-ptr-annotation.ll
 create mode 100644 llvm/test/Bitcode/upgrade-ptr-annotation.ll.bc
 create mode 100644 llvm/test/Bitcode/upgrade-var-annotation.ll
 create mode 100644 llvm/test/Bitcode/upgrade-var-annotation.ll.bc

diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index 23e7af6287b6..7d83cf5dcf1d 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -937,6 +937,12 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) {
             Intrinsic::getDeclaration(F->getParent(), Intrinsic::prefetch, Tys);
         return true;
       }
+    } else if (Name.startswith("ptr.annotation.") && F->arg_size() == 4) {
+      rename(F);
+      NewFn = Intrinsic::getDeclaration(F->getParent(),
+                                        Intrinsic::ptr_annotation,
+                                        F->arg_begin()->getType());
+      return true;
     }
     break;
 
@@ -947,6 +953,16 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) {
     }
     break;
 
+  case 'v': {
+    if (Name == "var.annotation" && F->arg_size() == 4) {
+      rename(F);
+      NewFn = Intrinsic::getDeclaration(F->getParent(),
+                                        Intrinsic::var_annotation);
+      return true;
+    }
+    break;
+  }
+
   case 'x':
     if (UpgradeX86IntrinsicFunction(F, Name, NewFn))
       return true;
@@ -3730,6 +3746,32 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
     CI->eraseFromParent();
     return;
 
+  case Intrinsic::ptr_annotation:
+    // Upgrade from versions that lacked the annotation attribute argument.
+    assert(CI->getNumArgOperands() == 4 &&
+           "Before LLVM 12.0 this intrinsic took four arguments");
+    // Create a new call with an added null annotation attribute argument.
+    NewCall = Builder.CreateCall(
+        NewFn,
+        {CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(2),
+         CI->getArgOperand(3), Constant::getNullValue(Builder.getInt8PtrTy())});
+    NewCall->takeName(CI);
+    CI->replaceAllUsesWith(NewCall);
+    CI->eraseFromParent();
+    return;
+
+  case Intrinsic::var_annotation:
+    // Upgrade from versions that lacked the annotation attribute argument.
+    assert(CI->getNumArgOperands() == 4 &&
+           "Before LLVM 12.0 this intrinsic took four arguments");
+    // Create a new call with an added null annotation attribute argument.
+    NewCall = Builder.CreateCall(
+        NewFn,
+        {CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(2),
+         CI->getArgOperand(3), Constant::getNullValue(Builder.getInt8PtrTy())});
+    CI->eraseFromParent();
+    return;
+
   case Intrinsic::x86_xop_vfrcz_ss:
   case Intrinsic::x86_xop_vfrcz_sd:
     NewCall = Builder.CreateCall(NewFn, {CI->getArgOperand(1)});
diff --git a/llvm/test/Bitcode/upgrade-ptr-annotation.ll b/llvm/test/Bitcode/upgrade-ptr-annotation.ll
new file mode 100644
index 000000000000..aeacc6f1a6ce
--- /dev/null
+++ b/llvm/test/Bitcode/upgrade-ptr-annotation.ll
@@ -0,0 +1,45 @@
+; Test upgrade of ptr.annotation intrinsics.
+;
+; RUN: llvm-dis < %s.bc | FileCheck %s
+
+; Unused return values
+; The arguments passed to the intrinisic wouldn't normally be arguments to
+; the function, but that makes it easier to test that they are handled
+; correctly.
+define void @f1(i8* %arg0, i8* %arg1, i8* %arg2, i32 %arg3) {
+;CHECK: @f1(i8* [[ARG0:%.*]], i8* [[ARG1:%.*]], i8* [[ARG2:%.*]], i32 [[ARG3:%.*]])
+  %t0 = call i8* @llvm.ptr.annotation.p0i8(i8* %arg0, i8* %arg1, i8* %arg2, i32 %arg3)
+;CHECK:  call i8* @llvm.ptr.annotation.p0i8(i8* [[ARG0]], i8* [[ARG1]], i8* [[ARG2]], i32 [[ARG3]], i8* null)
+
+  %arg0_p16 = bitcast i8* %arg0 to i16*
+  %t1 = call i16* @llvm.ptr.annotation.p0i16(i16* %arg0_p16, i8* %arg1, i8* %arg2, i32 %arg3)
+;CHECK:  [[ARG0_P16:%.*]] = bitcast
+;CHECK:  call i16* @llvm.ptr.annotation.p0i16(i16* [[ARG0_P16]], i8* [[ARG1]], i8* [[ARG2]], i32 [[ARG3]], i8* null)
+
+  %arg0_p256 = bitcast i8* %arg0 to i256*
+  %t2 = call i256* @llvm.ptr.annotation.p0i256(i256* %arg0_p256, i8* %arg1, i8* %arg2, i32 %arg3)
+;CHECK:  [[ARG0_P256:%.*]] = bitcast
+;CHECK:  call i256* @llvm.ptr.annotation.p0i256(i256* [[ARG0_P256]], i8* [[ARG1]], i8* [[ARG2]], i32 [[ARG3]], i8* null)
+  ret void
+}
+
+; Used return values
+define i16* @f2(i16* %x, i16* %y) {
+  %t0 = call i16* @llvm.ptr.annotation.p0i16(i16* %x, i8* undef, i8* undef, i32 undef)
+  %t1 = call i16* @llvm.ptr.annotation.p0i16(i16* %y, i8* undef, i8* undef, i32 undef)
+  %cmp = icmp ugt i16* %t0, %t1
+  %sel = select i1 %cmp, i16* %t0, i16* %t1
+  ret i16* %sel
+; CHECK:  [[T0:%.*]] = call i16* @llvm.ptr.annotation.p0i16(i16* %x, i8* undef, i8* undef, i32 undef, i8* null)
+; CHECK:  [[T1:%.*]] = call i16* @llvm.ptr.annotation.p0i16(i16* %y, i8* undef, i8* undef, i32 undef, i8* null)
+; CHECK:  %cmp = icmp ugt i16* [[T0]], [[T1]]
+; CHECK:  %sel = select i1 %cmp, i16* [[T0]], i16* [[T1]]
+; CHECK:  ret i16* %sel
+}
+
+declare i8*   @llvm.ptr.annotation.p0i8(i8*, i8*, i8*, i32)
+; CHECK: declare i8*   @llvm.ptr.annotation.p0i8(i8*, i8*, i8*, i32, i8*)
+declare i16*  @llvm.ptr.annotation.p0i16(i16*, i8*, i8*, i32)
+; CHECK: declare i16*   @llvm.ptr.annotation.p0i16(i16*, i8*, i8*, i32, i8*)
+declare i256* @llvm.ptr.annotation.p0i256(i256*, i8*, i8*, i32)
+; CHECK: declare i256*   @llvm.ptr.annotation.p0i256(i256*, i8*, i8*, i32, i8*)
diff --git a/llvm/test/Bitcode/upgrade-ptr-annotation.ll.bc b/llvm/test/Bitcode/upgrade-ptr-annotation.ll.bc
new file mode 100644
index 0000000000000000000000000000000000000000..5db0810ff88515d1fc40abe7fc20275d8cfb83aa
GIT binary patch
literal 1524
zcmZ`(Z%h+s7=QcsPAJzK64#Dva|M=kPVri#UTM3uJ*U-?Sa2U$CRwl4iVNcLXIOAC
zN3Z5^>xUwkbQv)=AtruU(q(AOGz;aBG1nM$ZgCqUG*om34O3^#X5!vf_wR#Ga_>Fw
zd(S=Z@ArG&=k+p^3-uKMr~m*#4OO%Aov*^Jf1ce{+Qc`?;F?JTfDbyQaubll`8xO@
zgVn>HS>+BjZnYnm#Wb&~P`y$4ageWgQ|>)p>K(_e6*Q`QtHjzo<m0AI)mzKOIPN(v
z56z;VSCGuMA@UoUwK=9+Z(g6Ue3;%dOkSFWd5@6*Kq$ak<y%9Bjw(m0#$#|;ilq`2
z<&7W3OJn1WxW5|oDAA*KIb~MiUQJbx692?H)`On#q9*m3=CSH&(zYE`<#+ye5*HTT
zlY|E}=>Sl`yp{3Ar_d0${131Lt`+#LjRNqBsS*34;mHQ=$hRBm)`gM_zZ@J$o;-4)
zwc}Bu1V>@iKPJ;PuaOsY_}fUpte5ZR!M{<;fbY^?4PMM%jN*aj%pJGzzZiE=a5tR>
zfNz2Nj=-=wlqaThM7Toc8e-^}6!-E-e+21Cc4?M@_AZV+<gq%9*Ep3iTuBjON>vOK
zDzggLEaQqRT-ivxmqQMtNWV|yIAnlR^hHFD6!qDcu$2UsqqLorc4-w`^<#DTv|+(S
z+%OrYSYwzv=C~+0<{0~g!g*0}#uSe1YDHfFIoytHX&1ZU$ctP;<N`>a8R>hhf@u%&
z;(Msrr4zf8Vt;aQ!@X7P-Xiw<B(_2+g0m?ipCYowah@eEn~3Q_!(~?eLxQo-GWKL~
zJTK9s3_YZ9j7po}rca`d>k3C&(tN#N<XXgTPUvk%A_1{iC-$Ba^;dRl?*Z+7KbB8m
zOKxqB#8w~+(rlQ|7^ghOYbIiT&^W(H<a35QGGkcRM5mdCF~O1b*E}Z@5D!I=NV2K%
zVM1F_!1%uLGE3Yx85S~zBQj$Fu*tB>s^2~;&{@fmW?WE^CsWQm;~0WdDSoL0G8GR&
zJSZ|?M!J)OdzNr)WfHsT#~x7CnQI<nKKJsCoM8b1@aY|C!8IXN#w0q<*t3%Je8xU2
zL5|L>Z=wrDAgV4XV2h~#c?MevXzwnnS4r(nN_&&kKJsIoIJQJ#aQa|T{p%u@_hXMK
zCij!cu$&@RWkuxdw(CDiG%QwH+H`|y$TAH>Oao*+$Gic~z_YD~XR5TM+YD=k476}b
z#swu$Gxj+tx=hTZ%5HlKM`_t)c&5R+%}W97mLE#1%@+jL-tuD))BGUPZ!T6oB0|&!
z-YwjiL{G>Z<AQ6n&{J1&0Mzh}WS!%JV+<Bl;W{NaVQ~{up}$xX_7{AwRc3}{a-P(g
z5ew>Bw{JV_5M_q<M5DU<5l4^*B&zn86WdZbHz<Sgr+F~!dAjSfccawRb7bRLON~1;
z_q6TSm-Xn#0hJQhPv~)X=z5<U46~JT+18oDPtaKM*&Vf3>Dd6G74NK?Flf(0zgmrg
zXkbzoBbCoxVwyswGB2tAu1vr?a7<a`a*jf?ZEL2sej25|pP_uyK~-k2T2^2QKo|M!
zF0o>GRSYEMI``|v^;X!&uR*U|F}kI|J8TJCJ39If)N;MuwQOhS!Ctm^|G~~$j@nPZ
zVBKPS!3S%-&0?vw)KX*}OIvAc-4<KW)@BU`ZS`zjFj!AhZ5ArDImBA5c3X%Jg)JX&
U`?}fC-i>hcjsMxaqoauRFGh#!<^TWy

literal 0
HcmV?d00001

diff --git a/llvm/test/Bitcode/upgrade-var-annotation.ll b/llvm/test/Bitcode/upgrade-var-annotation.ll
new file mode 100644
index 000000000000..30f692cd8db8
--- /dev/null
+++ b/llvm/test/Bitcode/upgrade-var-annotation.ll
@@ -0,0 +1,15 @@
+; Test upgrade of var.annotation intrinsics.
+;
+; RUN: llvm-dis < %s.bc | FileCheck %s
+
+
+define void @f(i8* %arg0, i8* %arg1, i8* %arg2, i32 %arg3) {
+;CHECK: @f(i8* [[ARG0:%.*]], i8* [[ARG1:%.*]], i8* [[ARG2:%.*]], i32 [[ARG3:%.*]])
+  call void @llvm.var.annotation(i8* %arg0, i8* %arg1, i8* %arg2, i32 %arg3)
+;CHECK:  call void @llvm.var.annotation(i8* [[ARG0]], i8* [[ARG1]], i8* [[ARG2]], i32 [[ARG3]], i8* null)
+  ret void
+}
+
+; Function Attrs: nofree nosync nounwind willreturn
+declare void @llvm.var.annotation(i8*, i8*, i8*, i32)
+; CHECK: declare void @llvm.var.annotation(i8*, i8*, i8*, i32, i8*)
diff --git a/llvm/test/Bitcode/upgrade-var-annotation.ll.bc b/llvm/test/Bitcode/upgrade-var-annotation.ll.bc
new file mode 100644
index 0000000000000000000000000000000000000000..c5f88855fb171b5a787b5637ee62834e30edeca4
GIT binary patch
literal 1232
zcmYLJaZDRk7=NXOyFmBu1f1=-Hg{thvJ7@DgI#HBD7O(n5{vo=CL?lPSD9gObhQ*P
zOG~?5r;RCtKl%qTBjF$as}aqT5kong6`KW@#h^pdhQg*{q;a?`N}}&N7r*3v@4fH6
z``+*SecyL&{L*GqH2|dm0GF0M*7fc;p1!}odb6S@*sVe~MhgH$i;5Bkl%V}#G_X+Z
zlq0R_EF;a98>)!*)lyt<)O_L!YR{Im-Kc1Dljdp;*S%40Hb*<e6{hw~r4%C_H%i=T
z{EKRuZ;#U7tIXzz?s(ntxap(hxhYysBfr;a0MK*5d^C;I?yH(<W<G+{R%9+-UD<st
zRuOS`lT<CZtidmQ3bb9S&RVEJ_j_~zsL@;#%l?A|Yc2T;bRwIAziJYIBTP5^vgP0e
zG5tgRMDJ$#?O%t-6Q5nV-8c9&UXJ7F)(Z&&-CLzqDoyJP10X?VjppMn^vsG&F;1h=
zhzF^a9ObE2*`quckz;K^Y%G9XPK;=G0P&cF`$5>KZ8I*X40q=!4_jKi7Ou&t?P=Z~
zQ`<9vm`}teaBQqY5=CrWRQm&xNK5+6OK>+1b1X5$5?cjWaKc7%#jwdx>x^MpFnZYQ
z*2S=OjkheQZHr-BL~YF!s{Q@g!~k}BK=L9Tl*G6s_GA7!%>S$uxg8Bk@8QyjPVy$C
zu|(*^UIFf9;h#%zmsPvcb5wqg$`tj6K&>*=O31J(l>HRvEot78DC%1>H^XyLwRJ{5
zi5@+ewBA=+ld|@^i;~zYdBw1A01Na>KAq&dDe3RNP3!^U4=2pW;Z`e=qv0-!g03@c
zq>RfB<2{Dj2pKoBR6b|;U1jv>dblLtG8?vLoX7qPq+c2hV1YzW_kNsE7%+ZdTotHC
zjA1inxS}#DfEhzUD0}B>n9Io4ByUHA94y=Nyfuopa_rl36sa_d@Zi{Z9p+7h&TWx!
zcL{Dg;S;ttb<bhU=U!RQ88#7s4!t!Qwl9QhA~F}_Eg9K%D`iQ`C`MbRV_^ix5ULSG
zV6UWqa0l-86OXfH1)A7qiEWyA>V!ii++rcxKFOB-mW6pIe8%#*Ul_yA992*ikqhnj
zf0j9vtfYK$oo~tTEm6J&MZLzq4n9Y9sz)_dQSPlnxx&VK#RP9hlqY%1nmoBf-I+V|
z$f1<d4r6%E!QoR|{qUg^QA^|%f{BMtxStG$u(7&g<^vK!t;F3c&dS_^$~qsm&nW%z
zBMCqmnz4*+K5U&u302!~hHWUhaarjVfd=3ZnyWOasH!AL>&Q|Q9=fJh&Xn(7YXwt|
z2S9bE{gOsQ>c4)OY;|uSyCNg>^By!q@@NEvm0v|s`cOk3L90sj%%WoO3=aA)HTVT@
zgD^BS>=S$!hlfn222%q|Hwv7YGdG@Yb~T?jyIjppLZi#o#Ifg1toxK(FqtjQZqDs7
XeJH-~72Fr<k(2uWoiq#%D*pck%7K{T

literal 0
HcmV?d00001


From 06e5dec59e0b53c5a99fc8d2ff30960b24c4f1b6 Mon Sep 17 00:00:00 2001
From: Nikita Popov <nikita.ppv@gmail.com>
Date: Tue, 23 Feb 2021 15:57:13 -0800
Subject: [PATCH 168/318] Fix test failures after
 a92ceea91116e7b95d23eff634507fa2cff86ef2

---
 llvm/test/tools/llvm-cov/branch-c-general.test    | 12 ++++++------
 llvm/test/tools/llvm-cov/branch-logical-mixed.cpp |  4 ++--
 llvm/test/tools/llvm-cov/branch-noShowBranch.test |  4 ++--
 3 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/llvm/test/tools/llvm-cov/branch-c-general.test b/llvm/test/tools/llvm-cov/branch-c-general.test
index bbebdd19fbae..33c12d611992 100644
--- a/llvm/test/tools/llvm-cov/branch-c-general.test
+++ b/llvm/test/tools/llvm-cov/branch-c-general.test
@@ -118,18 +118,18 @@
 // REPORT-NEXT: ---
 // REPORT-NEXT: simple_loops                      8       0 100.00%         9       0 100.00%         6       0 100.00%
 // REPORT-NEXT: conditionals                     24       0 100.00%        15       0 100.00%        16       2  87.50%
-// REPORT-NEXT: early_exits                      20       4  80.00%        25       3  88.00%        16       6  62.50%
-// REPORT-NEXT: jumps                            39      12  69.23%        48       4  91.67%        26       9  65.38%
-// REPORT-NEXT: switches                         28       5  82.14%        38       5  86.84%        30       9  70.00%
+// REPORT-NEXT: early_exits                      20       4  80.00%        25       2  92.00%        16       6  62.50%
+// REPORT-NEXT: jumps                            39      12  69.23%        48       2  95.83%        26       9  65.38%
+// REPORT-NEXT: switches                         28       5  82.14%        38       4  89.47%        30       9  70.00%
 // REPORT-NEXT: big_switch                       25       1  96.00%        32       0 100.00%        30       6  80.00%
 // REPORT-NEXT: boolean_operators                16       0 100.00%        13       0 100.00%        22       2  90.91%
 // REPORT-NEXT: boolop_loops                     19       0 100.00%        14       0 100.00%        16       2  87.50%
-// REPORT-NEXT: conditional_operator              4       2  50.00%         8       1  87.50%         4       2  50.00%
+// REPORT-NEXT: conditional_operator              4       2  50.00%         8       0 100.00%         4       2  50.00%
 // REPORT-NEXT: do_fallthrough                    9       0 100.00%        12       0 100.00%         6       0 100.00%
 // REPORT-NEXT: main                              1       0 100.00%        16       0 100.00%         0       0   0.00%
 // REPORT-NEXT: c-general.c:static_func           4       0 100.00%         4       0 100.00%         2       0 100.00%
 // REPORT-NEXT: ---
-// REPORT-NEXT: TOTAL                           197      24  87.82%       234      13 94.44%       174      38  78.16%
+// REPORT-NEXT: TOTAL                           197      24  87.82%       234       8 96.58%       174      38  78.16%
 
 // Test file-level report.
 // RUN: llvm-profdata merge %S/Inputs/branch-c-general.proftext -o %t.profdata
@@ -157,7 +157,7 @@
 // HTML-INDEX: <td class='column-entry-green'>
 // HTML-INDEX: 100.00% (12/12)
 // HTML-INDEX: <td class='column-entry-yellow'>
-// HTML-INDEX: 94.44% (221/234)
+// HTML-INDEX: 96.58% (226/234)
 // HTML-INDEX: <td class='column-entry-yellow'>
 // HTML-INDEX: 87.82% (173/197)
 // HTML-INDEX: <td class='column-entry-red'>
diff --git a/llvm/test/tools/llvm-cov/branch-logical-mixed.cpp b/llvm/test/tools/llvm-cov/branch-logical-mixed.cpp
index 107ed7778015..f5f787112446 100644
--- a/llvm/test/tools/llvm-cov/branch-logical-mixed.cpp
+++ b/llvm/test/tools/llvm-cov/branch-logical-mixed.cpp
@@ -84,7 +84,7 @@ int main(int argc, char *argv[])
 
 // REPORT: Name                        Regions    Miss   Cover     Lines    Miss   Cover  Branches    Miss   Cover
 // REPORT-NEXT: ---
-// REPORT-NEXT: _Z4funcii                        77       9  88.31%        68       10  85.29%        80      32  60.00%
+// REPORT-NEXT: _Z4funcii                        77       9  88.31%        68       3  95.59%        80      32  60.00%
 // REPORT-NEXT: main                              1       0 100.00%         5       0 100.00%         0       0   0.00%
 // REPORT-NEXT: ---
-// REPORT-NEXT: TOTAL                            78       9  88.46%        73       10  86.30%        80      32  60.00%
+// REPORT-NEXT: TOTAL                            78       9  88.46%        73       3  95.89%        80      32  60.00%
diff --git a/llvm/test/tools/llvm-cov/branch-noShowBranch.test b/llvm/test/tools/llvm-cov/branch-noShowBranch.test
index a8f12d698933..79069b2f07bf 100644
--- a/llvm/test/tools/llvm-cov/branch-noShowBranch.test
+++ b/llvm/test/tools/llvm-cov/branch-noShowBranch.test
@@ -20,6 +20,6 @@
 // REPORT-NOT: do_fallthrough                    9       0 100.00%        12       0 100.00%         6       0 100.00%
 // REPORT-NOT: main                              1       0 100.00%        16       0 100.00%         0       0   0.00%
 // REPORT-NOT: c-general.c:static_func           4       0 100.00%         4       0 100.00%         2       0 100.00%
-// REPORT: TOTAL                           197      24  87.82%       234       13  94.44%
-// REPORT-NOT: TOTAL                           197      24  87.82%       234       13  94.44%       174      38  78.16%
+// REPORT: TOTAL                           197      24  87.82%       234       8  96.58%
+// REPORT-NOT: TOTAL                           197      24  87.82%       234       8  96.58%       174      38  78.16%
 

From d56d2c8863b6ae3637b6261c32ea9479d8e1e2d6 Mon Sep 17 00:00:00 2001
From: Louis Dionne <ldionne.2@gmail.com>
Date: Wed, 27 Jan 2021 13:08:24 -0500
Subject: [PATCH 169/318] [libc++] Fix extern template test failing on Windows

See https://reviews.llvm.org/D94718#2521489 for details.

(cherry picked from commit 90407b16b1d3e38f1360b6a24ceab801ab9cefc1)
---
 libcxx/test/libcxx/debug/extern-templates.sh.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libcxx/test/libcxx/debug/extern-templates.sh.cpp b/libcxx/test/libcxx/debug/extern-templates.sh.cpp
index d5039d4f3029..b2ed6a63d630 100644
--- a/libcxx/test/libcxx/debug/extern-templates.sh.cpp
+++ b/libcxx/test/libcxx/debug/extern-templates.sh.cpp
@@ -15,7 +15,7 @@
 // UNSUPPORTED: libcpp-has-no-localization
 
 // RUN: %{cxx} %{flags} %{compile_flags} %s %{link_flags} -fPIC -DTU1 -D_LIBCPP_DEBUG=1 -fvisibility=hidden -shared -o %t.lib
-// RUN: %{cxx} %{flags} %{compile_flags} %s %t.lib %{link_flags} -fPIC -DTU2 -D_LIBCPP_DEBUG=1 -fvisibility=hidden -o %t.exe
+// RUN: cd %T && %{cxx} %{flags} %{compile_flags} %s %basename_t.tmp.lib %{link_flags} -fPIC -DTU2 -D_LIBCPP_DEBUG=1 -fvisibility=hidden -o %t.exe
 // RUN: %{exec} %t.exe
 
 #include <cassert>

From 4918a3d138b907a571f496661b5367e090e1e8bb Mon Sep 17 00:00:00 2001
From: Louis Dionne <ldionne.2@gmail.com>
Date: Thu, 28 Jan 2021 10:46:22 -0500
Subject: [PATCH 170/318] [libc++] Fix extern-templates.sh.cpp test on Linux

(cherry picked from commit bf5941afcda3ac6570ba25165758869287491e0d)
---
 libcxx/test/libcxx/debug/extern-templates.sh.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libcxx/test/libcxx/debug/extern-templates.sh.cpp b/libcxx/test/libcxx/debug/extern-templates.sh.cpp
index b2ed6a63d630..0e19895ba8f0 100644
--- a/libcxx/test/libcxx/debug/extern-templates.sh.cpp
+++ b/libcxx/test/libcxx/debug/extern-templates.sh.cpp
@@ -15,7 +15,7 @@
 // UNSUPPORTED: libcpp-has-no-localization
 
 // RUN: %{cxx} %{flags} %{compile_flags} %s %{link_flags} -fPIC -DTU1 -D_LIBCPP_DEBUG=1 -fvisibility=hidden -shared -o %t.lib
-// RUN: cd %T && %{cxx} %{flags} %{compile_flags} %s %basename_t.tmp.lib %{link_flags} -fPIC -DTU2 -D_LIBCPP_DEBUG=1 -fvisibility=hidden -o %t.exe
+// RUN: cd %T && %{cxx} %{flags} %{compile_flags} %s ./%basename_t.tmp.lib %{link_flags} -fPIC -DTU2 -D_LIBCPP_DEBUG=1 -fvisibility=hidden -o %t.exe
 // RUN: %{exec} %t.exe
 
 #include <cassert>

From e0e6b1e39e7e402cd74a8bf98a2728efbe38310e Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Wed, 24 Feb 2021 09:19:23 -0800
Subject: [PATCH 171/318] ReleaseNotes: add lld/ELF notes

Differential Revision: https://reviews.llvm.org/D97113
---
 lld/docs/ReleaseNotes.rst | 68 +++++++++++++++++++++++++++++++++++++--
 1 file changed, 66 insertions(+), 2 deletions(-)

diff --git a/lld/docs/ReleaseNotes.rst b/lld/docs/ReleaseNotes.rst
index 7c1cbc4a4c4b..24ed23bb2b7d 100644
--- a/lld/docs/ReleaseNotes.rst
+++ b/lld/docs/ReleaseNotes.rst
@@ -24,13 +24,77 @@ Non-comprehensive list of changes in this release
 ELF Improvements
 ----------------
 
-* ``--error-handling-script`` is added to allow for user-defined handlers upon
+* ``--dependency-file`` has been added. (Similar to ``cc -M -MF``.)
+  (`D82437 <https://reviews.llvm.org/D82437>`_)
+* ``--error-handling-script`` has been added to allow for user-defined handlers upon
   missing libraries. (`D87758 <https://reviews.llvm.org/D87758>`_)
+* ``--exclude-libs`` can now localize defined version symbols and bitcode referenced libcall symbols.
+  (`D94280 <https://reviews.llvm.org/D94280>`_)
+* ``--gdb-index`` now works with DWARF v5 and ``--icf={safe,all}``.
+  (`D85579 <https://reviews.llvm.org/D85579>`_)
+  (`D89751 <https://reviews.llvm.org/D89751>`_)
+* ``--gdb-index --emit-relocs`` can now be used together.
+  (`D94354 <https://reviews.llvm.org/D94354>`_)
+* ``--icf={safe,all}`` conservatively no longer fold text sections with LSDA.
+  Previously ICF on ``-fexceptions`` code could be unsafe.
+  (`D84610 <https://reviews.llvm.org/D84610>`_)
+* ``--icf={safe,all}`` can now fold two sections with relocations referencing aliased symbols.
+  (`D88830 <https://reviews.llvm.org/D88830>`_)
+* ``--lto-pseudo-probe-for-profiling`` has been added.
+  (`D95056 <https://reviews.llvm.org/D95056>`_)
+* ``--no-lto-whole-program-visibility`` has been added.
+  (`D92060 <https://reviews.llvm.org/D92060>`_)
+* ``--oformat-binary`` has been fixed to respect LMA.
+  (`D85086 <https://reviews.llvm.org/D85086>`_)
+* ``--reproduce`` includes ``--lto-sample-profile``, ``--just-symbols``, ``--call-graph-ordering-file``, ``--retain-symbols-file`` files.
+* ``-r --gc-sections`` is now supported.
+  (`D84131 <https://reviews.llvm.org/D84131>`_)
+* A ``-u`` specified symbol will no longer change the binding to ``STB_WEAK``.
+  (`D88945 <https://reviews.llvm.org/D88945>`_)
+* ``--wrap`` support has been improved.
+  + If ``foo`` is not referenced, there is no longer an undefined symbol ``__wrap_foo``.
+  + If ``__real_foo`` is not referenced, there is no longer an undefined symbol ``foo``.
+* ``SHF_LINK_ORDER`` sections can now have zero ``sh_link`` values.
+* ``SHF_LINK_ORDER`` and non-``SHF_LINK_ORDER`` sections can now be mixed within an input section description.
+  (`D84001 <https://reviews.llvm.org/D84001>`_)
+* ``LOG2CEIL`` is now supported in linker scripts.
+  (`D84054 <https://reviews.llvm.org/D84054>`_)
+* ``DEFINED`` has been fixed to check whether the symbol is defined.
+  (`D83758 <https://reviews.llvm.org/D83758>`_)
+* An input section description may now have multiple ``SORT_*``.
+  The matched sections are ordered by radix sort with the keys being ``(SORT*, --sort-section, input order)``.
+  (`D91127 <https://reviews.llvm.org/D91127>`_)
+* Users can now provide a GNU style linker script to convert ``.ctors`` into ``.init_array``.
+  (`D91187 <https://reviews.llvm.org/D91187>`_)
+* An empty output section can now be discarded even if it is assigned to a program header.
+  (`D92301 <https://reviews.llvm.org/D92301>`_)
+* Non-``SHF_ALLOC`` sections now have larger file offsets than ``SHF_ALLOC`` sections.
+  (`D85867 <https://reviews.llvm.org/D85867>`_)
+* Some symbol versioning improvements.
+  + Defined ``foo@@v1`` now resolve undefined ``foo@v1`` (`D92259 <https://reviews.llvm.org/D92259>`_)
+  + Undefined ``foo@v1`` now gets an error (`D92260 <https://reviews.llvm.org/D92260>`_)
+* The AArch64 port now has support for ``STO_AARCH64_VARIANT_PCS`` and ``DT_AARCH64_VARIANT_PCS``.
+  (`D93045 <https://reviews.llvm.org/D93045>`_)
+* The AArch64 port now has support for ``R_AARCH64_LD64_GOTPAGE_LO15``.
+* The PowerPC64 port now detects missing R_PPC64_TLSGD/R_PPC64_TLSLD and disables TLS relaxation.
+  This allows linking with object files produced by very old IBM XL compilers.
+  (`D92959 <https://reviews.llvm.org/D92959>`_)
+* Many PowerPC PC-relative relocations are now supported.
+* ``R_PPC_ADDR24`` and ``R_PPC64_ADDR16_HIGH`` are now supported.
+* powerpcle is now supported. Tested with FreeBSD loader and freestanding.
+  (`D93917 <https://reviews.llvm.org/D93917>`_)
+* RISC-V: the first ``SHT_RISCV_ATTRIBUTES`` section is now retained.
+  (`D86309 <https://reviews.llvm.org/D86309>`_)
+* LTO pipeline now defaults to the new PM if the CMake variable ``ENABLE_EXPERIMENTAL_NEW_PASS_MANAGER`` is on.
+  (`D92885 <https://reviews.llvm.org/D92885>`_)
 
 Breaking changes
 ----------------
 
-* ...
+* A COMMON symbol can now cause the fetch of an archive providing a ``STB_GLOBAL`` definition.
+  This behavior follows GNU ld newer than December 1999.
+  If you see ``duplicate symbol`` errors with the new behavior, check out `PR49226 <https://bugs.llvm.org//show_bug.cgi?id=49226>`_.
+  (`D86142 <https://reviews.llvm.org/D86142>`_)
 
 COFF Improvements
 -----------------

From 98f06b16a313ece593f5711778d7da9037f3a2ef Mon Sep 17 00:00:00 2001
From: Pavel Iliin <Pavel.Iliin@arm.com>
Date: Thu, 25 Feb 2021 22:51:09 +0000
Subject: [PATCH 172/318] [AArch64][Docs] Release notes 12.x on outline atomics

Description for AArch64 -moutline-atomics, -mno-outline-atomics
options added to release notes.

Differential Revision: https://reviews.llvm.org/D97510
---
 clang/docs/ReleaseNotes.rst | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index a43cc33988ab..64f737ff488f 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -73,6 +73,15 @@ New Compiler Flags
 
 - ...
 
+- AArch64 options ``-moutline-atomics``, ``-mno-outline-atomics`` to enable
+  and disable calls to helper functions implementing atomic operations. These
+  out-of-line helpers like '__aarch64_cas8_relax' will detect at runtime
+  AArch64 Large System Extensions (LSE) availability and either use their
+  atomic instructions, or falls back to LL/SC loop. These options do not apply
+  if the compilation target supports LSE. Atomic instructions are used directly
+  in that case. The option's behaviour mirrors GCC, the helpers are implemented
+  both in compiler-rt and libgcc.
+
 - -fpch-codegen and -fpch-debuginfo generate shared code and/or debuginfo
   for contents of a precompiled header in a separate object file. This object
   file needs to be linked in, but its contents do not need to be generated

From c637d4d136fd476d4a7418f5ecb76b80bcb6f8fc Mon Sep 17 00:00:00 2001
From: Shilei Tian <tianshilei1992@gmail.com>
Date: Tue, 23 Feb 2021 13:20:13 -0500
Subject: [PATCH 173/318] [OpenMP][NVPTX] Fixed a compilation error in
 deviceRTLs caused by unsupported feature in release verion of LLVM

`ptx71` is not supported in release version of LLVM yet. As a result,
the support of CUDA 11.2 and CUDA 11.1 caused a compilation error as mentioned
in D97004. Since the support in D97004 is just a WA for releease, and we'll not
use it in the near future, using `ptx70` for CUDA 11 is feasible.

Reviewed By: jdoerfert

Differential Revision: https://reviews.llvm.org/D97195

(cherry picked from commit f6c2984a090e78947f75e096d43b476bf2ae73eb)
---
 openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt b/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt
index 5478cd3f6aea..806a887cc2d8 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt
+++ b/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt
@@ -153,7 +153,7 @@ add_custom_target(omptarget-nvptx-bc)
 # This map is from clang/lib/Driver/ToolChains/Cuda.cpp.
 # The last element is the default case.
 set(cuda_version_list 112 111 110 102 101 100 92 91 90 80)
-set(ptx_feature_list 71 71 70 65 64 63 61 61 60 42)
+set(ptx_feature_list 70 70 70 65 64 63 61 61 60 42)
 # The following two lines of ugly code is not needed when the minimal CMake
 # version requirement is 3.17+.
 list(LENGTH cuda_version_list num_version_supported)

From 692808e5af8338f5a109a64b5b9d75d05ec6f590 Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Sun, 28 Feb 2021 10:17:10 -0500
Subject: [PATCH 174/318] [InstCombine] avoid infinite loop in demanded bits
 for select

https://llvm.org/PR49205
(cherry picked from commit 9502061bcc86982641772f45b7e7a0eb7437f054)
---
 .../InstCombineSimplifyDemanded.cpp           |  8 +++-
 .../InstCombine/select-imm-canon.ll           | 38 +++++++++++++++++++
 2 files changed, 44 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
index c265516213aa..16efe863779a 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
@@ -345,10 +345,14 @@ Value *InstCombinerImpl::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
         return false;
 
       // Get the constant out of the ICmp, if there is one.
+      // Only try this when exactly 1 operand is a constant (if both operands
+      // are constant, the icmp should eventually simplify). Otherwise, we may
+      // invert the transform that reduces set bits and infinite-loop.
+      Value *X;
       const APInt *CmpC;
       ICmpInst::Predicate Pred;
-      if (!match(I->getOperand(0), m_c_ICmp(Pred, m_APInt(CmpC), m_Value())) ||
-          CmpC->getBitWidth() != SelC->getBitWidth())
+      if (!match(I->getOperand(0), m_ICmp(Pred, m_Value(X), m_APInt(CmpC))) ||
+          isa<Constant>(X) || CmpC->getBitWidth() != SelC->getBitWidth())
         return ShrinkDemandedConstant(I, OpNo, DemandedMask);
 
       // If the constant is already the same as the ICmp, leave it as-is.
diff --git a/llvm/test/Transforms/InstCombine/select-imm-canon.ll b/llvm/test/Transforms/InstCombine/select-imm-canon.ll
index e230b3b92777..fec6d693954a 100644
--- a/llvm/test/Transforms/InstCombine/select-imm-canon.ll
+++ b/llvm/test/Transforms/InstCombine/select-imm-canon.ll
@@ -87,3 +87,41 @@ define i8 @original_logical(i32 %A, i32 %B) {
   %conv7 = trunc i32 %spec.select.i to i8
   ret i8 %conv7
 }
+
+; This would infinite loop because we have potentially opposing
+; constant transforms on degenerate (unsimplified) cmps.
+
+define i32 @PR49205(i32 %t0, i1 %b) {
+; CHECK-LABEL: @PR49205(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[FOR_COND:%.*]]
+; CHECK:       for.cond:
+; CHECK-NEXT:    br i1 [[B:%.*]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    br label [[FOR_COND]]
+; CHECK:       for.end:
+; CHECK-NEXT:    ret i32 1
+;
+entry:
+  br label %for.cond
+
+for.cond:
+  %s = phi i32 [ 7, %entry ], [ %add, %for.body ]
+  br i1 %b, label %for.body, label %for.end
+
+for.body:
+  %div = add i32 %t0, undef
+  %add = add nsw i32 %div, 1
+  br label %for.cond
+
+for.end:
+  %cmp6 = icmp ne i32 %s, 4
+  %conv = zext i1 %cmp6 to i32
+  %and7 = and i32 %s, %conv
+  %sub = sub i32 %s, %and7
+  %cmp9 = icmp ne i32 %sub, 4
+  %conv10 = zext i1 %cmp9 to i32
+  %sub11 = sub i32 %conv10, %sub
+  %and = and i32 %sub11, 1
+  ret i32 %and
+}

From f73ba0f3582ba33984ad996c124d106a9737cd90 Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Sat, 27 Feb 2021 09:09:03 -0500
Subject: [PATCH 175/318] [SimplifyCFG] avoid illegal phi with both poison and
 undef

In the example based on:
https://llvm.org/PR49218
...we are crashing because poison is a subclass of undef, so we merge blocks and create:

PHI node has multiple entries for the same basic block with different incoming values!
  %k3 = phi i64 [ poison, %entry ], [ %k3, %g ], [ undef, %entry ]

If both poison and undef values are incoming, we soften the poison values to undef.

Differential Revision: https://reviews.llvm.org/D97495

(cherry picked from commit 356cdabd3a9e0ff919ea2c1a35c8706ecb915297)
---
 llvm/lib/Transforms/Utils/Local.cpp           |  24 ++-
 .../Transforms/SimplifyCFG/poison-merge.ll    | 200 ++++++++++++++++++
 2 files changed, 223 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/Transforms/SimplifyCFG/poison-merge.ll

diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp
index d055f3dd3084..ae26058c210c 100644
--- a/llvm/lib/Transforms/Utils/Local.cpp
+++ b/llvm/lib/Transforms/Utils/Local.cpp
@@ -918,6 +918,7 @@ static void gatherIncomingValuesToPhi(PHINode *PN,
 /// \param IncomingValues A map from block to value.
 static void replaceUndefValuesInPhi(PHINode *PN,
                                     const IncomingValueMap &IncomingValues) {
+  SmallVector<unsigned> TrueUndefOps;
   for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
     Value *V = PN->getIncomingValue(i);
 
@@ -925,10 +926,31 @@ static void replaceUndefValuesInPhi(PHINode *PN,
 
     BasicBlock *BB = PN->getIncomingBlock(i);
     IncomingValueMap::const_iterator It = IncomingValues.find(BB);
-    if (It == IncomingValues.end()) continue;
 
+    // Keep track of undef/poison incoming values. Those must match, so we fix
+    // them up below if needed.
+    // Note: this is conservatively correct, but we could try harder and group
+    // the undef values per incoming basic block.
+    if (It == IncomingValues.end()) {
+      TrueUndefOps.push_back(i);
+      continue;
+    }
+
+    // There is a defined value for this incoming block, so map this undef
+    // incoming value to the defined value.
     PN->setIncomingValue(i, It->second);
   }
+
+  // If there are both undef and poison values incoming, then convert those
+  // values to undef. It is invalid to have different values for the same
+  // incoming block.
+  unsigned PoisonCount = count_if(TrueUndefOps, [&](unsigned i) {
+    return isa<PoisonValue>(PN->getIncomingValue(i));
+  });
+  if (PoisonCount != 0 && PoisonCount != TrueUndefOps.size()) {
+    for (unsigned i : TrueUndefOps)
+      PN->setIncomingValue(i, UndefValue::get(PN->getType()));
+  }
 }
 
 /// Replace a value flowing from a block to a phi with
diff --git a/llvm/test/Transforms/SimplifyCFG/poison-merge.ll b/llvm/test/Transforms/SimplifyCFG/poison-merge.ll
new file mode 100644
index 000000000000..93d9b0b299a6
--- /dev/null
+++ b/llvm/test/Transforms/SimplifyCFG/poison-merge.ll
@@ -0,0 +1,200 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -simplifycfg -simplifycfg-require-and-preserve-domtree=1 -keep-loops=0 < %s | FileCheck %s
+
+; Merge 2 undefined incoming values.
+
+define i32 @undef_merge(i32 %x) {
+; CHECK-LABEL: @undef_merge(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    switch i32 [[X:%.*]], label [[EXIT:%.*]] [
+; CHECK-NEXT:    i32 4, label [[G:%.*]]
+; CHECK-NEXT:    i32 12, label [[G]]
+; CHECK-NEXT:    ]
+; CHECK:       g:
+; CHECK-NEXT:    [[K3:%.*]] = phi i64 [ undef, [[ENTRY:%.*]] ], [ [[K3]], [[G]] ], [ undef, [[ENTRY]] ]
+; CHECK-NEXT:    br label [[G]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret i32 undef
+;
+entry:
+  switch i32 %x, label %exit [
+  i32 4, label %loop
+  i32 12, label %g
+  ]
+
+loop:
+  %k2 = phi i64 [ %k3, %g ], [ undef, %entry ]
+  br label %g
+
+g:
+  %k3 = phi i64 [ %k2, %loop ], [ undef, %entry ]
+  br label %loop
+
+exit:
+  ret i32 undef
+}
+
+; Merge 2 poison incoming values.
+
+define i32 @poison_merge(i32 %x) {
+; CHECK-LABEL: @poison_merge(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    switch i32 [[X:%.*]], label [[EXIT:%.*]] [
+; CHECK-NEXT:    i32 4, label [[G:%.*]]
+; CHECK-NEXT:    i32 12, label [[G]]
+; CHECK-NEXT:    ]
+; CHECK:       g:
+; CHECK-NEXT:    [[K3:%.*]] = phi i64 [ poison, [[ENTRY:%.*]] ], [ [[K3]], [[G]] ], [ poison, [[ENTRY]] ]
+; CHECK-NEXT:    br label [[G]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret i32 undef
+;
+entry:
+  switch i32 %x, label %exit [
+  i32 4, label %loop
+  i32 12, label %g
+  ]
+
+loop:
+  %k2 = phi i64 [ %k3, %g ], [ poison, %entry ]
+  br label %g
+
+g:
+  %k3 = phi i64 [ %k2, %loop ], [ poison, %entry ]
+  br label %loop
+
+exit:
+  ret i32 undef
+}
+
+; Merge equal defined incoming values.
+
+define i32 @defined_merge(i32 %x) {
+; CHECK-LABEL: @defined_merge(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    switch i32 [[X:%.*]], label [[EXIT:%.*]] [
+; CHECK-NEXT:    i32 4, label [[G:%.*]]
+; CHECK-NEXT:    i32 12, label [[G]]
+; CHECK-NEXT:    ]
+; CHECK:       g:
+; CHECK-NEXT:    [[K3:%.*]] = phi i64 [ 42, [[ENTRY:%.*]] ], [ [[K3]], [[G]] ], [ 42, [[ENTRY]] ]
+; CHECK-NEXT:    br label [[G]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret i32 undef
+;
+entry:
+  switch i32 %x, label %exit [
+  i32 4, label %loop
+  i32 12, label %g
+  ]
+
+loop:
+  %k2 = phi i64 [ %k3, %g ], [ 42, %entry ]
+  br label %g
+
+g:
+  %k3 = phi i64 [ %k2, %loop ], [ 42, %entry ]
+  br label %loop
+
+exit:
+  ret i32 undef
+}
+
+; Merge defined and undef incoming values.
+
+define i32 @defined_and_undef_merge(i32 %x) {
+; CHECK-LABEL: @defined_and_undef_merge(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    switch i32 [[X:%.*]], label [[EXIT:%.*]] [
+; CHECK-NEXT:    i32 4, label [[G:%.*]]
+; CHECK-NEXT:    i32 12, label [[G]]
+; CHECK-NEXT:    ]
+; CHECK:       g:
+; CHECK-NEXT:    [[K3:%.*]] = phi i64 [ 42, [[ENTRY:%.*]] ], [ [[K3]], [[G]] ], [ 42, [[ENTRY]] ]
+; CHECK-NEXT:    br label [[G]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret i32 undef
+;
+entry:
+  switch i32 %x, label %exit [
+  i32 4, label %loop
+  i32 12, label %g
+  ]
+
+loop:
+  %k2 = phi i64 [ %k3, %g ], [ undef, %entry ]
+  br label %g
+
+g:
+  %k3 = phi i64 [ %k2, %loop ], [ 42, %entry ]
+  br label %loop
+
+exit:
+  ret i32 undef
+}
+
+; Merge defined and poison incoming values.
+
+define i32 @defined_and_poison_merge(i32 %x) {
+; CHECK-LABEL: @defined_and_poison_merge(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    switch i32 [[X:%.*]], label [[EXIT:%.*]] [
+; CHECK-NEXT:    i32 4, label [[G:%.*]]
+; CHECK-NEXT:    i32 12, label [[G]]
+; CHECK-NEXT:    ]
+; CHECK:       g:
+; CHECK-NEXT:    [[K3:%.*]] = phi i64 [ 42, [[ENTRY:%.*]] ], [ [[K3]], [[G]] ], [ 42, [[ENTRY]] ]
+; CHECK-NEXT:    br label [[G]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret i32 undef
+;
+entry:
+  switch i32 %x, label %exit [
+  i32 4, label %loop
+  i32 12, label %g
+  ]
+
+loop:
+  %k2 = phi i64 [ %k3, %g ], [ poison, %entry ]
+  br label %g
+
+g:
+  %k3 = phi i64 [ %k2, %loop ], [ 42, %entry ]
+  br label %loop
+
+exit:
+  ret i32 undef
+}
+
+; Do not crash trying to merge poison and undef into a single phi.
+
+define i32 @PR49218(i32 %x) {
+; CHECK-LABEL: @PR49218(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    switch i32 [[X:%.*]], label [[EXIT:%.*]] [
+; CHECK-NEXT:    i32 4, label [[G:%.*]]
+; CHECK-NEXT:    i32 12, label [[G]]
+; CHECK-NEXT:    ]
+; CHECK:       g:
+; CHECK-NEXT:    [[K3:%.*]] = phi i64 [ undef, [[ENTRY:%.*]] ], [ [[K3]], [[G]] ], [ undef, [[ENTRY]] ]
+; CHECK-NEXT:    br label [[G]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret i32 undef
+;
+entry:
+  switch i32 %x, label %exit [
+  i32 4, label %loop
+  i32 12, label %g
+  ]
+
+loop:
+  %k2 = phi i64 [ %k3, %g ], [ undef, %entry ]
+  br label %g
+
+g:
+  %k3 = phi i64 [ %k2, %loop ], [ poison, %entry ]
+  br label %loop
+
+exit:
+  ret i32 undef
+}

From 344216979213d841b72e44891871c031db622f5d Mon Sep 17 00:00:00 2001
From: Richard Smith <richard@metafoo.co.uk>
Date: Mon, 1 Mar 2021 12:17:10 -0800
Subject: [PATCH 176/318] Revert "[c++20] Mark class type NTTPs as done and
 start defining the feature test macro."

Some of the parts of this work were reverted; stop defining the feature
test macro for now.

This reverts commit b4c63ef6dd90dba9af26a111c9a78b121c5284b1.

(cherry picked from commit 564f5b0734bd5d265a0046e5ca9d08ae5bc303eb)
---
 clang/lib/Frontend/InitPreprocessor.cpp | 2 +-
 clang/test/Lexer/cxx-features.cpp       | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/clang/lib/Frontend/InitPreprocessor.cpp b/clang/lib/Frontend/InitPreprocessor.cpp
index d47ad1b74649..c64a912ce919 100644
--- a/clang/lib/Frontend/InitPreprocessor.cpp
+++ b/clang/lib/Frontend/InitPreprocessor.cpp
@@ -565,7 +565,7 @@ static void InitializeCPlusPlusFeatureTestMacros(const LangOptions &LangOpts,
     Builder.defineMacro("__cpp_aggregate_bases", "201603L");
     Builder.defineMacro("__cpp_structured_bindings", "201606L");
     Builder.defineMacro("__cpp_nontype_template_args",
-                        LangOpts.CPlusPlus20 ? "201911L" : "201411L");
+                        "201411L"); // (not latest)
     Builder.defineMacro("__cpp_fold_expressions", "201603L");
     Builder.defineMacro("__cpp_guaranteed_copy_elision", "201606L");
     Builder.defineMacro("__cpp_nontype_template_parameter_auto", "201606L");
diff --git a/clang/test/Lexer/cxx-features.cpp b/clang/test/Lexer/cxx-features.cpp
index f57faed4ed90..2f46f354ee83 100644
--- a/clang/test/Lexer/cxx-features.cpp
+++ b/clang/test/Lexer/cxx-features.cpp
@@ -181,7 +181,8 @@
 #error "wrong value for __cpp_structured_bindings"
 #endif
 
-#if check(nontype_template_args, 0, 0, 0, 201411, 201911, 201911)
+#if check(nontype_template_args, 0, 0, 0, 201411, 201411, 201411)
+// FIXME: 201911 in C++20
 #error "wrong value for __cpp_nontype_template_args"
 #endif
 

From 9760b282ff03ef581d51b3d74d5b33d09b463272 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@sifive.com>
Date: Sun, 28 Feb 2021 11:23:46 -0800
Subject: [PATCH 177/318] [DAGCombiner][X86] Don't peek through ANDs on the
 shift amount in matchRotateSub when called from MatchFunnelPosNeg.

Peeking through AND is only valid if the input to both shifts is
the same. If the inputs are different, then the original pattern
ORs the two values when the masked shift amount is 0. This is ok
if the values are the same since the OR would be a NOP which is
why its ok for rotate.

Fixes PR49365 and reverts PR34641

Differential Revision: https://reviews.llvm.org/D97637

(cherry picked from commit 5de09ef02e24d234d9fc0cd1c6dfe18a1bb784b0)
---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 15 +++++--
 llvm/test/CodeGen/X86/shift-double.ll         | 44 ++++++++++++-------
 2 files changed, 40 insertions(+), 19 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 89670d708264..6a6f83827f72 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -6517,8 +6517,11 @@ static SDValue extractShiftForRotate(SelectionDAG &DAG, SDValue OppShift,
 // reduces to a rotate in direction shift2 by Pos or (equivalently) a rotate
 // in direction shift1 by Neg.  The range [0, EltSize) means that we only need
 // to consider shift amounts with defined behavior.
+//
+// The IsRotate flag should be set when the LHS of both shifts is the same.
+// Otherwise if matching a general funnel shift, it should be clear.
 static bool matchRotateSub(SDValue Pos, SDValue Neg, unsigned EltSize,
-                           SelectionDAG &DAG) {
+                           SelectionDAG &DAG, bool IsRotate) {
   // If EltSize is a power of 2 then:
   //
   //  (a) (Pos == 0 ? 0 : EltSize - Pos) == (EltSize - Pos) & (EltSize - 1)
@@ -6550,8 +6553,11 @@ static bool matchRotateSub(SDValue Pos, SDValue Neg, unsigned EltSize,
   // always invokes undefined behavior for 32-bit X.
   //
   // Below, Mask == EltSize - 1 when using [A] and is all-ones otherwise.
+  //
+  // NOTE: We can only do this when matching an AND and not a general
+  // funnel shift.
   unsigned MaskLoBits = 0;
-  if (Neg.getOpcode() == ISD::AND && isPowerOf2_64(EltSize)) {
+  if (IsRotate && Neg.getOpcode() == ISD::AND && isPowerOf2_64(EltSize)) {
     if (ConstantSDNode *NegC = isConstOrConstSplat(Neg.getOperand(1))) {
       KnownBits Known = DAG.computeKnownBits(Neg.getOperand(0));
       unsigned Bits = Log2_64(EltSize);
@@ -6641,7 +6647,8 @@ SDValue DAGCombiner::MatchRotatePosNeg(SDValue Shifted, SDValue Pos,
   //          (srl x, (*ext y))) ->
   //   (rotr x, y) or (rotl x, (sub 32, y))
   EVT VT = Shifted.getValueType();
-  if (matchRotateSub(InnerPos, InnerNeg, VT.getScalarSizeInBits(), DAG)) {
+  if (matchRotateSub(InnerPos, InnerNeg, VT.getScalarSizeInBits(), DAG,
+                     /*IsRotate*/ true)) {
     bool HasPos = TLI.isOperationLegalOrCustom(PosOpcode, VT);
     return DAG.getNode(HasPos ? PosOpcode : NegOpcode, DL, VT, Shifted,
                        HasPos ? Pos : Neg);
@@ -6670,7 +6677,7 @@ SDValue DAGCombiner::MatchFunnelPosNeg(SDValue N0, SDValue N1, SDValue Pos,
   // fold (or (shl x0, (*ext (sub 32, y))),
   //          (srl x1, (*ext y))) ->
   //   (fshr x0, x1, y) or (fshl x0, x1, (sub 32, y))
-  if (matchRotateSub(InnerPos, InnerNeg, EltBits, DAG)) {
+  if (matchRotateSub(InnerPos, InnerNeg, EltBits, DAG, /*IsRotate*/ N0 == N1)) {
     bool HasPos = TLI.isOperationLegalOrCustom(PosOpcode, VT);
     return DAG.getNode(HasPos ? PosOpcode : NegOpcode, DL, VT, N0, N1,
                        HasPos ? Pos : Neg);
diff --git a/llvm/test/CodeGen/X86/shift-double.ll b/llvm/test/CodeGen/X86/shift-double.ll
index c0872957f2b8..1213a80921d2 100644
--- a/llvm/test/CodeGen/X86/shift-double.ll
+++ b/llvm/test/CodeGen/X86/shift-double.ll
@@ -480,23 +480,31 @@ define i32 @test18(i32 %hi, i32 %lo, i32 %bits) nounwind {
   ret i32 %sh
 }
 
-; PR34641 - Masked Shift Counts
+; These are not valid shld/shrd patterns. When the shift amount modulo
+; the bitwidth is zero, the result should be an OR of both operands not a
+; shift.
 
-define i32 @shld_safe_i32(i32, i32, i32) {
-; X86-LABEL: shld_safe_i32:
+define i32 @not_shld_i32(i32, i32, i32) {
+; X86-LABEL: not_shld_i32:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    shldl %cl, %edx, %eax
+; X86-NEXT:    shll %cl, %edx
+; X86-NEXT:    negb %cl
+; X86-NEXT:    shrl %cl, %eax
+; X86-NEXT:    orl %edx, %eax
 ; X86-NEXT:    retl
 ;
-; X64-LABEL: shld_safe_i32:
+; X64-LABEL: not_shld_i32:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl %edx, %ecx
-; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    movl %esi, %eax
+; X64-NEXT:    shll %cl, %edi
+; X64-NEXT:    negb %cl
 ; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NEXT:    shldl %cl, %esi, %eax
+; X64-NEXT:    shrl %cl, %eax
+; X64-NEXT:    orl %edi, %eax
 ; X64-NEXT:    retq
   %4 = and i32 %2, 31
   %5 = shl i32 %0, %4
@@ -507,21 +515,27 @@ define i32 @shld_safe_i32(i32, i32, i32) {
   ret i32 %9
 }
 
-define i32 @shrd_safe_i32(i32, i32, i32) {
-; X86-LABEL: shrd_safe_i32:
+define i32 @not_shrd_i32(i32, i32, i32) {
+; X86-LABEL: not_shrd_i32:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    shrdl %cl, %edx, %eax
+; X86-NEXT:    shrl %cl, %edx
+; X86-NEXT:    negb %cl
+; X86-NEXT:    shll %cl, %eax
+; X86-NEXT:    orl %edx, %eax
 ; X86-NEXT:    retl
 ;
-; X64-LABEL: shrd_safe_i32:
+; X64-LABEL: not_shrd_i32:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl %edx, %ecx
-; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    movl %esi, %eax
+; X64-NEXT:    shrl %cl, %edi
+; X64-NEXT:    negb %cl
 ; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NEXT:    shrdl %cl, %esi, %eax
+; X64-NEXT:    shll %cl, %eax
+; X64-NEXT:    orl %edi, %eax
 ; X64-NEXT:    retq
   %4 = and i32 %2, 31
   %5 = lshr i32 %0, %4

From 4ed9f17e9390a6845cfd8a235f2078cb9b0e4719 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kirst=C3=B3f=20Umann?= <dkszelethus@gmail.com>
Date: Fri, 5 Feb 2021 19:57:09 +0100
Subject: [PATCH 178/318] [analyzer] Add 12.0.0 release notes

Differential Revision: https://reviews.llvm.org/D96163
---
 clang/docs/ReleaseNotes.rst      | 33 +++++++++++++++++++++++++++++++-
 clang/docs/analyzer/checkers.rst |  2 ++
 2 files changed, 34 insertions(+), 1 deletion(-)

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 64f737ff488f..7f4b675b68f9 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -377,7 +377,38 @@ libclang
 Static Analyzer
 ---------------
 
-- ...
+.. 3ff220de9009 [analyzer][StdLibraryFunctionsChecker] Add POSIX networking functions
+.. ...And a million other patches.
+- Improve the analyzer's understanding of several POSIX functions.
+
+.. https://reviews.llvm.org/D86533#2238207
+- Greatly improved the analyzer’s constraint solver by better understanding
+  when constraints are imposed on multiple symbolic values that are known to be
+  equal or known to be non-equal. It will now also efficiently reject impossible
+  if-branches between known comparison expressions. (Incorrectly stated as a
+  11.0.0 feature in the previous release notes)
+
+.. 820e8d8656ec [Analyzer][WebKit] UncountedLambdaCaptureChecker
+- New checker: :ref:`webkit.UncountedLambdaCapturesChecker<webkit-UncountedLambdaCapturesChecker>`
+  is a WebKit coding convention checker that flags raw pointers to
+  reference-counted objects captured by lambdas and suggests using intrusive
+  reference-counting smart pointers instead.
+
+.. 8a64689e264c [Analyzer][WebKit] UncountedLocalVarsChecker
+- New checker: :ref:`alpha.webkit.UncountedLocalVarsChecker<alpha-webkit-UncountedLocalVarsChecker>`
+  is a WebKit coding convention checker that intends to make sure that any
+  uncounted local variable is backed by a ref-counted object with lifetime that
+  is strictly larger than the scope of the uncounted local variable.
+
+.. i914f6c4ff8a4 [StaticAnalyzer] Support struct annotations in FuchsiaHandleChecker
+- ``fuchia.HandleChecker`` now recognizes handles in structs; All the handles
+  referenced by the structure (direct value or ptr) would be treated as
+  containing the release/use/acquire annotations directly.
+
+.. 8deaec122ec6 [analyzer] Update Fuchsia checker to catch releasing unowned handles.
+- Fuchsia checkers can detect the release of an unowned handle.
+
+- Numerous fixes and improvements to bug report generation.
 
 .. _release-notes-ubsan:
 
diff --git a/clang/docs/analyzer/checkers.rst b/clang/docs/analyzer/checkers.rst
index b47be97eef96..d851845396ac 100644
--- a/clang/docs/analyzer/checkers.rst
+++ b/clang/docs/analyzer/checkers.rst
@@ -2538,6 +2538,8 @@ We also define a set of safe transformations which if passed a safe value as an
 - casts
 - unary operators like ``&`` or ``*``
 
+.. _alpha-webkit-UncountedLocalVarsChecker:
+
 alpha.webkit.UncountedLocalVarsChecker
 """"""""""""""""""""""""""""""""""""""
 The goal of this rule is to make sure that any uncounted local variable is backed by a ref-counted object with lifetime that is strictly larger than the scope of the uncounted local variable. To be on the safe side we require the scope of an uncounted variable to be embedded in the scope of ref-counted object that backs it.

From 99350dcc3f5b46d564338c0067c2cbd139b841ee Mon Sep 17 00:00:00 2001
From: "Peyton, Jonathan L" <jonathan.l.peyton@intel.com>
Date: Tue, 2 Mar 2021 07:44:15 -0600
Subject: [PATCH 179/318] [OpenMP] Fix clang-cl build error regarding TSX
 intrinsics

Fix for https://bugs.llvm.org/show_bug.cgi?id=49339

The CMake check for the RTM intrinsics needs the -mrtm flag to be set
during the test. This way clang-cl correctly detects it has the
_xbegin() intrinsic. Otherwise, the CMake check fails.

Differential Revision: https://reviews.llvm.org/D97413

(cherry picked from commit e83380fccc2cc9842bdcfd268efddf6fce90544d)
---
 openmp/runtime/cmake/config-ix.cmake | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/openmp/runtime/cmake/config-ix.cmake b/openmp/runtime/cmake/config-ix.cmake
index f06fda6c0221..ed62aefccd14 100644
--- a/openmp/runtime/cmake/config-ix.cmake
+++ b/openmp/runtime/cmake/config-ix.cmake
@@ -172,6 +172,10 @@ if (IA32 OR INTEL64)
       }
       int main() { int a = __kmp_umwait(0, 1000); return a; }")
   check_cxx_source_compiles("${source_code}" LIBOMP_HAVE_WAITPKG_INTRINSICS)
+  set(OLD_CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS})
+  if (LIBOMP_HAVE_MRTM_FLAG)
+    set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} -mrtm")
+  endif()
   set(source_code "// check for attribute rtm and rtm intrinsics
       #ifdef IMMINTRIN_H
       #include <immintrin.h>
@@ -188,6 +192,7 @@ if (IA32 OR INTEL64)
       int main() { int a = __kmp_xbegin(); return a; }")
   check_cxx_source_compiles("${source_code}" LIBOMP_HAVE_RTM_INTRINSICS)
   set(CMAKE_REQUIRED_DEFINITIONS)
+  set(CMAKE_REQUIRED_FLAGS ${OLD_CMAKE_REQUIRED_FLAGS})
 endif()
 
 # Find perl executable

From 52510d84802b55ecd80a904ca259adfecffc5be1 Mon Sep 17 00:00:00 2001
From: Nikita Popov <nikita.ppv@gmail.com>
Date: Mon, 1 Mar 2021 21:37:26 +0100
Subject: [PATCH 180/318] [GlobalISel] Bail on G_PHI narrowing of odd types
 (PR48188)

The current narrowing code for G_PHI can only handle the case
where the size is a multiple of the narrow size. If this is not
the case, fall back to SDAG instead of asserting.

Original patch by shepmaster.

Differential Revision: https://reviews.llvm.org/D92446

(cherry picked from commit c35761db0f078f74550ef56bfc0745c162d76967)
---
 .../CodeGen/GlobalISel/LegalizerHelper.cpp    |  5 ++++
 llvm/test/CodeGen/AArch64/pr48188.ll          | 27 +++++++++++++++++++
 2 files changed, 32 insertions(+)
 create mode 100644 llvm/test/CodeGen/AArch64/pr48188.ll

diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index e7f40523efaf..3178ee16af2b 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -1063,6 +1063,11 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,
     Observer.changedInstr(MI);
     return Legalized;
   case TargetOpcode::G_PHI: {
+    // FIXME: add support for when SizeOp0 isn't an exact multiple of
+    // NarrowSize.
+    if (SizeOp0 % NarrowSize != 0)
+      return UnableToLegalize;
+
     unsigned NumParts = SizeOp0 / NarrowSize;
     SmallVector<Register, 2> DstRegs(NumParts);
     SmallVector<SmallVector<Register, 2>, 2> SrcRegs(MI.getNumOperands() / 2);
diff --git a/llvm/test/CodeGen/AArch64/pr48188.ll b/llvm/test/CodeGen/AArch64/pr48188.ll
new file mode 100644
index 000000000000..2da02e640ec1
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/pr48188.ll
@@ -0,0 +1,27 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -O0 -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s
+
+; GlobalISel cannot legalize this phi, so we fall back to SDAG.
+define void @test() nounwind {
+; CHECK-LABEL: test:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sub sp, sp, #16 // =16
+; CHECK-NEXT:    mov x1, xzr
+; CHECK-NEXT:    mov x0, x1
+; CHECK-NEXT:    str x1, [sp] // 8-byte Folded Spill
+; CHECK-NEXT:    str x0, [sp, #8] // 8-byte Folded Spill
+; CHECK-NEXT:    b .LBB0_1
+; CHECK-NEXT:  .LBB0_1: // %loop
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    ldr x0, [sp, #8] // 8-byte Folded Reload
+; CHECK-NEXT:    ldr x1, [sp] // 8-byte Folded Reload
+; CHECK-NEXT:    str x1, [sp] // 8-byte Folded Spill
+; CHECK-NEXT:    str x0, [sp, #8] // 8-byte Folded Spill
+; CHECK-NEXT:    b .LBB0_1
+entry:
+  br label %loop
+
+loop:
+  %p = phi i72 [ 0, %entry ], [ %p, %loop ]
+  br label %loop
+}

From d24e102ba2665dc6cd467f467813fba9c8261133 Mon Sep 17 00:00:00 2001
From: Shilei Tian <tianshilei1992@gmail.com>
Date: Wed, 24 Feb 2021 12:37:22 -0500
Subject: [PATCH 181/318] [OpenMP] Fixed a crash when offloading to x86_64 with
 target nowait

PR#49334 reports a crash when offloading to x86_64 with `target nowait`,
which is caused by referencing a nullptr. The root cause of the issue is, when
pushing a hidden helper task in `__kmp_push_task`, it also maps the gtid to its
shadow gtid, which is wrong.

Reviewed By: jdoerfert

Differential Revision: https://reviews.llvm.org/D97329

(cherry picked from commit e5da63d5a9ede1fb6d8aa18cfd44533ead128738)
---
 .../libomptarget/test/offloading/bug49334.cpp | 148 ++++++++++++++++++
 openmp/runtime/src/kmp_tasking.cpp            |   3 +-
 2 files changed, 150 insertions(+), 1 deletion(-)
 create mode 100644 openmp/libomptarget/test/offloading/bug49334.cpp

diff --git a/openmp/libomptarget/test/offloading/bug49334.cpp b/openmp/libomptarget/test/offloading/bug49334.cpp
new file mode 100644
index 000000000000..b26cd7b2b338
--- /dev/null
+++ b/openmp/libomptarget/test/offloading/bug49334.cpp
@@ -0,0 +1,148 @@
+// RUN: %libomptarget-compilexx-run-and-check-aarch64-unknown-linux-gnu
+// RUN: %libomptarget-compilexx-run-and-check-powerpc64-ibm-linux-gnu
+// RUN: %libomptarget-compilexx-run-and-check-powerpc64le-ibm-linux-gnu
+// RUN: %libomptarget-compilexx-run-and-check-x86_64-pc-linux-gnu
+// RUN: %libomptarget-compilexx-run-and-check-nvptx64-nvidia-cuda
+
+#include <cassert>
+#include <iostream>
+#include <memory>
+#include <vector>
+
+class BlockMatrix {
+private:
+  const int rowsPerBlock;
+  const int colsPerBlock;
+  const long nRows;
+  const long nCols;
+  const int nBlocksPerRow;
+  const int nBlocksPerCol;
+  std::vector<std::vector<std::unique_ptr<float[]>>> Blocks;
+
+public:
+  BlockMatrix(const int _rowsPerBlock, const int _colsPerBlock,
+              const long _nRows, const long _nCols)
+      : rowsPerBlock(_rowsPerBlock), colsPerBlock(_colsPerBlock), nRows(_nRows),
+        nCols(_nCols), nBlocksPerRow(_nRows / _rowsPerBlock),
+        nBlocksPerCol(_nCols / _colsPerBlock), Blocks(nBlocksPerCol) {
+    for (int i = 0; i < nBlocksPerCol; i++) {
+      for (int j = 0; j < nBlocksPerRow; j++) {
+        Blocks[i].emplace_back(new float[_rowsPerBlock * _colsPerBlock]);
+      }
+    }
+  };
+
+  // Initialize the BlockMatrix from 2D arrays
+  void Initialize(const std::vector<float> &matrix) {
+    for (int i = 0; i < nBlocksPerCol; i++)
+      for (int j = 0; j < nBlocksPerRow; j++) {
+        float *CurrBlock = GetBlock(i, j);
+        for (int ii = 0; ii < colsPerBlock; ++ii)
+          for (int jj = 0; jj < rowsPerBlock; ++jj) {
+            int curri = i * colsPerBlock + ii;
+            int currj = j * rowsPerBlock + jj;
+            CurrBlock[ii + jj * colsPerBlock] = matrix[curri + currj * nCols];
+          }
+      }
+  }
+
+  long Compare(const std::vector<float> &matrix) const {
+    long fail = 0;
+    for (int i = 0; i < nBlocksPerCol; i++)
+      for (int j = 0; j < nBlocksPerRow; j++) {
+        float *CurrBlock = GetBlock(i, j);
+        for (int ii = 0; ii < colsPerBlock; ++ii)
+          for (int jj = 0; jj < rowsPerBlock; ++jj) {
+            int curri = i * colsPerBlock + ii;
+            int currj = j * rowsPerBlock + jj;
+            float m_value = matrix[curri + currj * nCols];
+            float bm_value = CurrBlock[ii + jj * colsPerBlock];
+            if (bm_value != m_value) {
+              fail++;
+            }
+          }
+      }
+    return fail;
+  }
+
+  float *GetBlock(int i, int j) const {
+    assert(i < nBlocksPerCol && j < nBlocksPerRow && "Accessing outside block");
+    return Blocks[i][j].get();
+  }
+};
+
+constexpr const int BS = 256;
+constexpr const int N = 1024;
+
+int BlockMatMul_TargetNowait(BlockMatrix &A, BlockMatrix &B, BlockMatrix &C) {
+#pragma omp parallel
+#pragma omp master
+  for (int i = 0; i < N / BS; ++i)
+    for (int j = 0; j < N / BS; ++j) {
+      float *BlockC = C.GetBlock(i, j);
+      for (int k = 0; k < N / BS; ++k) {
+        float *BlockA = A.GetBlock(i, k);
+        float *BlockB = B.GetBlock(k, j);
+// clang-format off
+#pragma omp target depend(in: BlockA[0], BlockB[0]) depend(inout: BlockC[0])   \
+            map(to: BlockA[:BS * BS], BlockB[:BS * BS])                        \
+            map(tofrom: BlockC[:BS * BS]) nowait
+// clang-format on
+#pragma omp parallel for
+        for (int ii = 0; ii < BS; ii++)
+          for (int jj = 0; jj < BS; jj++) {
+            for (int kk = 0; kk < BS; ++kk)
+              BlockC[ii + jj * BS] +=
+                  BlockA[ii + kk * BS] * BlockB[kk + jj * BS];
+          }
+      }
+    }
+  return 0;
+}
+
+void Matmul(const std::vector<float> &a, const std::vector<float> &b,
+            std::vector<float> &c) {
+  for (int i = 0; i < N; ++i) {
+    for (int j = 0; j < N; ++j) {
+      float sum = 0.0;
+      for (int k = 0; k < N; ++k) {
+        sum = sum + a[i * N + k] * b[k * N + j];
+      }
+      c[i * N + j] = sum;
+    }
+  }
+}
+
+int main(int argc, char *argv[]) {
+  std::vector<float> a(N * N);
+  std::vector<float> b(N * N);
+  std::vector<float> c(N * N, 0.0);
+
+  for (int i = 0; i < N; ++i) {
+    for (int j = 0; j < N; ++j) {
+      a[i * N + j] = b[i * N + j] = i + j % 100;
+    }
+  }
+
+  auto BlockedA = BlockMatrix(BS, BS, N, N);
+  BlockedA.Initialize(a);
+  BlockedA.Compare(a);
+  auto BlockedB = BlockMatrix(BS, BS, N, N);
+  BlockedB.Initialize(b);
+  BlockedB.Compare(b);
+
+  Matmul(a, b, c);
+
+  auto BlockedC = BlockMatrix(BS, BS, N, N);
+  BlockMatMul_TargetNowait(BlockedA, BlockedB, BlockedC);
+
+  if (BlockedC.Compare(c) > 0) {
+    return 1;
+  }
+
+  std::cout << "PASS\n";
+
+  return 0;
+}
+
+// CHECK: PASS
diff --git a/openmp/runtime/src/kmp_tasking.cpp b/openmp/runtime/src/kmp_tasking.cpp
index 3d7021128dbd..4bcd11946694 100644
--- a/openmp/runtime/src/kmp_tasking.cpp
+++ b/openmp/runtime/src/kmp_tasking.cpp
@@ -326,7 +326,8 @@ static kmp_int32 __kmp_push_task(kmp_int32 gtid, kmp_task_t *task) {
   kmp_info_t *thread = __kmp_threads[gtid];
   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
 
-  if (taskdata->td_flags.hidden_helper) {
+  // We don't need to map to shadow gtid if it is already hidden helper thread
+  if (taskdata->td_flags.hidden_helper && !KMP_HIDDEN_HELPER_THREAD(gtid)) {
     gtid = KMP_GTID_TO_SHADOW_GTID(gtid);
     thread = __kmp_threads[gtid];
   }

From 46a1b0655666e21c56fa79560e9baee87405d4f4 Mon Sep 17 00:00:00 2001
From: LemonBoy <thatlemon@gmail.com>
Date: Fri, 5 Mar 2021 16:01:45 +0100
Subject: [PATCH 182/318] [AArch64] Legalize horizontal fmax/fmin reductions on
 f16 vectors

Expand the horizontal reduction during the instruction selection phase, but only if the target doesn't support the full fp16 instruction set.

Fixes https://bugs.llvm.org/show_bug.cgi?id=49401

Reviewed By: aemerson

Differential Revision: https://reviews.llvm.org/D97840

(cherry picked from commit 8725b24c6d4abaa97425e704652a13dacb35fe3f)
---
 .../Target/AArch64/AArch64ISelLowering.cpp    |  7 ++-
 .../AArch64/vecreduce-fmax-legalization.ll    | 62 ++++++++++++++++++-
 .../AArch64/vecreduce-fmin-legalization.ll    | 62 ++++++++++++++++++-
 3 files changed, 126 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 1be09186dc0a..1451151f4dc5 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1017,11 +1017,12 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
     // Vector reductions
     for (MVT VT : { MVT::v4f16, MVT::v2f32,
                     MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
-      setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom);
-      setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom);
+      if (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()) {
+        setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom);
+        setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom);
 
-      if (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16())
         setOperationAction(ISD::VECREDUCE_FADD, VT, Legal);
+      }
     }
     for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
                     MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization.ll b/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization.ll
index f1ebd8fa85ea..d26db2aefee0 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization.ll
@@ -1,11 +1,13 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s --check-prefix=CHECK
+; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-NOFP
+; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -mattr=+neon,+fullfp16 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FP
 
 declare half @llvm.vector.reduce.fmax.v1f16(<1 x half> %a)
 declare float @llvm.vector.reduce.fmax.v1f32(<1 x float> %a)
 declare double @llvm.vector.reduce.fmax.v1f64(<1 x double> %a)
 declare fp128 @llvm.vector.reduce.fmax.v1f128(<1 x fp128> %a)
 
+declare half @llvm.vector.reduce.fmax.v4f16(<4 x half> %a)
 declare float @llvm.vector.reduce.fmax.v3f32(<3 x float> %a)
 declare fp128 @llvm.vector.reduce.fmax.v2f128(<2 x fp128> %a)
 declare float @llvm.vector.reduce.fmax.v16f32(<16 x float> %a)
@@ -44,6 +46,64 @@ define fp128 @test_v1f128(<1 x fp128> %a) nounwind {
   ret fp128 %b
 }
 
+define half @test_v4f16(<4 x half> %a) nounwind {
+; CHECK-NOFP-LABEL: test_v4f16:
+; CHECK-NOFP:       // %bb.0:
+; CHECK-NOFP-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NOFP-NEXT:    mov h3, v0.h[1]
+; CHECK-NOFP-NEXT:    mov h1, v0.h[3]
+; CHECK-NOFP-NEXT:    mov h2, v0.h[2]
+; CHECK-NOFP-NEXT:    fcvt s0, h0
+; CHECK-NOFP-NEXT:    fcvt s3, h3
+; CHECK-NOFP-NEXT:    fmaxnm s0, s0, s3
+; CHECK-NOFP-NEXT:    fcvt h0, s0
+; CHECK-NOFP-NEXT:    fcvt s2, h2
+; CHECK-NOFP-NEXT:    fcvt s0, h0
+; CHECK-NOFP-NEXT:    fmaxnm s0, s0, s2
+; CHECK-NOFP-NEXT:    fcvt h0, s0
+; CHECK-NOFP-NEXT:    fcvt s0, h0
+; CHECK-NOFP-NEXT:    fcvt s1, h1
+; CHECK-NOFP-NEXT:    fmaxnm s0, s0, s1
+; CHECK-NOFP-NEXT:    fcvt h0, s0
+; CHECK-NOFP-NEXT:    ret
+;
+; CHECK-FP-LABEL: test_v4f16:
+; CHECK-FP:       // %bb.0:
+; CHECK-FP-NEXT:    fmaxnmv h0, v0.4h
+; CHECK-FP-NEXT:    ret
+  %b = call nnan half @llvm.vector.reduce.fmax.v4f16(<4 x half> %a)
+  ret half %b
+}
+
+define half @test_v4f16_ninf(<4 x half> %a) nounwind {
+; CHECK-NOFP-LABEL: test_v4f16_ninf:
+; CHECK-NOFP:       // %bb.0:
+; CHECK-NOFP-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NOFP-NEXT:    mov h3, v0.h[1]
+; CHECK-NOFP-NEXT:    mov h1, v0.h[3]
+; CHECK-NOFP-NEXT:    mov h2, v0.h[2]
+; CHECK-NOFP-NEXT:    fcvt s0, h0
+; CHECK-NOFP-NEXT:    fcvt s3, h3
+; CHECK-NOFP-NEXT:    fmaxnm s0, s0, s3
+; CHECK-NOFP-NEXT:    fcvt h0, s0
+; CHECK-NOFP-NEXT:    fcvt s2, h2
+; CHECK-NOFP-NEXT:    fcvt s0, h0
+; CHECK-NOFP-NEXT:    fmaxnm s0, s0, s2
+; CHECK-NOFP-NEXT:    fcvt h0, s0
+; CHECK-NOFP-NEXT:    fcvt s0, h0
+; CHECK-NOFP-NEXT:    fcvt s1, h1
+; CHECK-NOFP-NEXT:    fmaxnm s0, s0, s1
+; CHECK-NOFP-NEXT:    fcvt h0, s0
+; CHECK-NOFP-NEXT:    ret
+;
+; CHECK-FP-LABEL: test_v4f16_ninf:
+; CHECK-FP:       // %bb.0:
+; CHECK-FP-NEXT:    fmaxnmv h0, v0.4h
+; CHECK-FP-NEXT:    ret
+  %b = call nnan ninf half @llvm.vector.reduce.fmax.v4f16(<4 x half> %a)
+  ret half %b
+}
+
 define float @test_v3f32(<3 x float> %a) nounwind {
 ; CHECK-LABEL: test_v3f32:
 ; CHECK:       // %bb.0:
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fmin-legalization.ll b/llvm/test/CodeGen/AArch64/vecreduce-fmin-legalization.ll
index 4129fa80b13e..52d6e9773ab2 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-fmin-legalization.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-fmin-legalization.ll
@@ -1,11 +1,13 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s --check-prefix=CHECK
+; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-NOFP
+; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -mattr=+neon,+fullfp16 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FP
 
 declare half @llvm.vector.reduce.fmin.v1f16(<1 x half> %a)
 declare float @llvm.vector.reduce.fmin.v1f32(<1 x float> %a)
 declare double @llvm.vector.reduce.fmin.v1f64(<1 x double> %a)
 declare fp128 @llvm.vector.reduce.fmin.v1f128(<1 x fp128> %a)
 
+declare half @llvm.vector.reduce.fmin.v4f16(<4 x half> %a)
 declare float @llvm.vector.reduce.fmin.v3f32(<3 x float> %a)
 declare fp128 @llvm.vector.reduce.fmin.v2f128(<2 x fp128> %a)
 declare float @llvm.vector.reduce.fmin.v16f32(<16 x float> %a)
@@ -44,6 +46,64 @@ define fp128 @test_v1f128(<1 x fp128> %a) nounwind {
   ret fp128 %b
 }
 
+define half @test_v4f16(<4 x half> %a) nounwind {
+; CHECK-NOFP-LABEL: test_v4f16:
+; CHECK-NOFP:       // %bb.0:
+; CHECK-NOFP-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NOFP-NEXT:    mov h3, v0.h[1]
+; CHECK-NOFP-NEXT:    mov h1, v0.h[3]
+; CHECK-NOFP-NEXT:    mov h2, v0.h[2]
+; CHECK-NOFP-NEXT:    fcvt s0, h0
+; CHECK-NOFP-NEXT:    fcvt s3, h3
+; CHECK-NOFP-NEXT:    fminnm s0, s0, s3
+; CHECK-NOFP-NEXT:    fcvt h0, s0
+; CHECK-NOFP-NEXT:    fcvt s2, h2
+; CHECK-NOFP-NEXT:    fcvt s0, h0
+; CHECK-NOFP-NEXT:    fminnm s0, s0, s2
+; CHECK-NOFP-NEXT:    fcvt h0, s0
+; CHECK-NOFP-NEXT:    fcvt s0, h0
+; CHECK-NOFP-NEXT:    fcvt s1, h1
+; CHECK-NOFP-NEXT:    fminnm s0, s0, s1
+; CHECK-NOFP-NEXT:    fcvt h0, s0
+; CHECK-NOFP-NEXT:    ret
+;
+; CHECK-FP-LABEL: test_v4f16:
+; CHECK-FP:       // %bb.0:
+; CHECK-FP-NEXT:    fminnmv h0, v0.4h
+; CHECK-FP-NEXT:    ret
+  %b = call nnan half @llvm.vector.reduce.fmin.v4f16(<4 x half> %a)
+  ret half %b
+}
+
+define half @test_v4f16_ninf(<4 x half> %a) nounwind {
+; CHECK-NOFP-LABEL: test_v4f16_ninf:
+; CHECK-NOFP:       // %bb.0:
+; CHECK-NOFP-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NOFP-NEXT:    mov h3, v0.h[1]
+; CHECK-NOFP-NEXT:    mov h1, v0.h[3]
+; CHECK-NOFP-NEXT:    mov h2, v0.h[2]
+; CHECK-NOFP-NEXT:    fcvt s0, h0
+; CHECK-NOFP-NEXT:    fcvt s3, h3
+; CHECK-NOFP-NEXT:    fminnm s0, s0, s3
+; CHECK-NOFP-NEXT:    fcvt h0, s0
+; CHECK-NOFP-NEXT:    fcvt s2, h2
+; CHECK-NOFP-NEXT:    fcvt s0, h0
+; CHECK-NOFP-NEXT:    fminnm s0, s0, s2
+; CHECK-NOFP-NEXT:    fcvt h0, s0
+; CHECK-NOFP-NEXT:    fcvt s0, h0
+; CHECK-NOFP-NEXT:    fcvt s1, h1
+; CHECK-NOFP-NEXT:    fminnm s0, s0, s1
+; CHECK-NOFP-NEXT:    fcvt h0, s0
+; CHECK-NOFP-NEXT:    ret
+;
+; CHECK-FP-LABEL: test_v4f16_ninf:
+; CHECK-FP:       // %bb.0:
+; CHECK-FP-NEXT:    fminnmv h0, v0.4h
+; CHECK-FP-NEXT:    ret
+  %b = call nnan ninf half @llvm.vector.reduce.fmin.v4f16(<4 x half> %a)
+  ret half %b
+}
+
 define float @test_v3f32(<3 x float> %a) nounwind {
 ; CHECK-LABEL: test_v3f32:
 ; CHECK:       // %bb.0:

From f8b32989241cca87a8690c8cc404f06ce1f90e4c Mon Sep 17 00:00:00 2001
From: Nathan James <n.james93@hotmail.co.uk>
Date: Wed, 3 Mar 2021 16:01:12 +0000
Subject: [PATCH 183/318] [clang-tidy] Deprecate readability-deleted-default
 check

... For removal in next release cycle.
The clang warning that does the same thing is enabled by default and typically emits better diagnostics making this check surplus to requirements.

Reviewed By: aaron.ballman

Differential Revision: https://reviews.llvm.org/D97491

(cherry picked from commit 19aefd2d5dc3a8d3b8e81219973828170b7fcd2c)
---
 clang-tools-extra/docs/ReleaseNotes.rst       | 10 ++++++++++
 .../checks/readability-deleted-default.rst    | 20 +++----------------
 2 files changed, 13 insertions(+), 17 deletions(-)

diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst
index b3c9c829198b..29321bb3eb04 100644
--- a/clang-tools-extra/docs/ReleaseNotes.rst
+++ b/clang-tools-extra/docs/ReleaseNotes.rst
@@ -358,6 +358,16 @@ Changes in existing checks
 
   Added `std::basic_string_view` to default list of ``string``-like types.
 
+Deprecated checks
+^^^^^^^^^^^^^^^^^
+
+- The :doc:`readability-deleted-default
+  <clang-tidy/checks/readability-deleted-default>` check has been deprecated.
+  
+  The clang warning `Wdefaulted-function-deleted
+  <https://clang.llvm.org/docs/DiagnosticsReference.html#wdefaulted-function-deleted>`_
+  will diagnose the same issues and is enabled by default.
+
 Improvements to include-fixer
 -----------------------------
 
diff --git a/clang-tools-extra/docs/clang-tidy/checks/readability-deleted-default.rst b/clang-tools-extra/docs/clang-tidy/checks/readability-deleted-default.rst
index 00134eb05484..5f2083e00061 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/readability-deleted-default.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/readability-deleted-default.rst
@@ -3,20 +3,6 @@
 readability-deleted-default
 ===========================
 
-Checks that constructors and assignment operators marked as ``= default`` are
-not actually deleted by the compiler.
-
-.. code-block:: c++
-
-  class Example {
-  public:
-    // This constructor is deleted because I is missing a default value.
-    Example() = default;
-    // This is fine.
-    Example(const Example& Other) = default;
-    // This operator is deleted because I cannot be assigned (it is const).
-    Example& operator=(const Example& Other) = default;
-
-  private:
-    const int I;
-  };
+This check has been deprecated prefer to make use of the `Wdefaulted-function-deleted
+<https://clang.llvm.org/docs/DiagnosticsReference.html#wdefaulted-function-deleted>`_
+flag.

From a123beacce408af8c2de1f39d522ac6b6c4b5d1b Mon Sep 17 00:00:00 2001
From: Juneyoung Lee <aqjune@gmail.com>
Date: Tue, 9 Feb 2021 14:06:17 +0900
Subject: [PATCH 184/318] [LoopVectorize] Fix VPRecipeBuilder::createEdgeMask
 to correctly generate the mask

This patch fixes pr48832 by correctly generating the mask when a poison value is involved.

Consider this CFG (which is a part of the input):

```
for.body:                                         ; preds = %for.cond
  br i1 true, label %cond.false, label %land.rhs

land.rhs:                                         ; preds = %for.body
  br i1 poison, label %cond.end, label %cond.false

cond.false:                                       ; preds = %for.body, %land.rhs
  br label %cond.end

cond.end:                                         ; preds = %land.rhs, %cond.false
  %cond = phi i32 [ 0, %cond.false ], [ 1, %land.rhs ]

```

The path for.body -> land.rhs -> cond.end should be taken when 'select i1 false, i1 poison, i1 false' holds (which means it's never taken); but VPRecipeBuilder::createEdgeMask was emitting 'and i1 false, poison' instead.
The former one successfully blocks poison propagation whereas the latter one doesn't, making the condition poison and thus causing the miscompilation.

SimplifyCFG has a similar bug (which didn't expose a real-world bug yet), and a patch for this is also ongoing (see https://reviews.llvm.org/D95026).

Reviewed By: bjope

Differential Revision: https://reviews.llvm.org/D95217

(cherry picked from commit ed253ef77248d91a15b3a1aa36c0b74bed8ec8af)
---
 .../Vectorize/LoopVectorizationPlanner.h      |  4 ++
 .../Transforms/Vectorize/LoopVectorize.cpp    | 11 ++++-
 .../LoopVectorize/X86/masked_load_store.ll    | 48 +++++++++----------
 .../x86-interleaved-accesses-masked-group.ll  | 12 ++---
 .../LoopVectorize/if-conversion-nest.ll       |  2 +-
 .../LoopVectorize/if-pred-non-void.ll         |  2 +-
 .../Transforms/LoopVectorize/if-reduction.ll  |  8 ++--
 llvm/test/Transforms/LoopVectorize/pr48832.ll | 40 ++++++++++++++++
 .../LoopVectorize/reduction-inloop-pred.ll    |  4 +-
 .../LoopVectorize/reduction-inloop.ll         |  4 +-
 10 files changed, 93 insertions(+), 42 deletions(-)
 create mode 100644 llvm/test/Transforms/LoopVectorize/pr48832.ll

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index 1795470fa58c..19797e6f7858 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -142,6 +142,10 @@ class VPBuilder {
     return createInstruction(Instruction::BinaryOps::Or, {LHS, RHS});
   }
 
+  VPValue *createSelect(VPValue *Cond, VPValue *TrueVal, VPValue *FalseVal) {
+    return createNaryOp(Instruction::Select, {Cond, TrueVal, FalseVal});
+  }
+
   //===--------------------------------------------------------------------===//
   // RAII helpers.
   //===--------------------------------------------------------------------===//
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 47635dbdda02..d36e078444bc 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8195,8 +8195,15 @@ VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
   if (BI->getSuccessor(0) != Dst)
     EdgeMask = Builder.createNot(EdgeMask);
 
-  if (SrcMask) // Otherwise block in-mask is all-one, no need to AND.
-    EdgeMask = Builder.createAnd(EdgeMask, SrcMask);
+  if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND.
+    // The condition is 'SrcMask && EdgeMask', which is equivalent to
+    // 'select i1 SrcMask, i1 EdgeMask, i1 false'.
+    // The select version does not introduce new UB if SrcMask is false and
+    // EdgeMask is poison. Using 'and' here introduces undefined behavior.
+    VPValue *False = Plan->getOrAddVPValue(
+        ConstantInt::getFalse(BI->getCondition()->getType()));
+    EdgeMask = Builder.createSelect(SrcMask, EdgeMask, False);
+  }
 
   return EdgeMaskCache[Edge] = EdgeMask;
 }
diff --git a/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll b/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll
index dddedcb77f67..b464389fe393 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll
@@ -2042,10 +2042,10 @@ define void @foo7(double* noalias nocapture %out, double** noalias nocapture rea
 ; AVX1-NEXT:    [[TMP49:%.*]] = xor <4 x i1> [[TMP41]], <i1 true, i1 true, i1 true, i1 true>
 ; AVX1-NEXT:    [[TMP50:%.*]] = xor <4 x i1> [[TMP42]], <i1 true, i1 true, i1 true, i1 true>
 ; AVX1-NEXT:    [[TMP51:%.*]] = xor <4 x i1> [[TMP43]], <i1 true, i1 true, i1 true, i1 true>
-; AVX1-NEXT:    [[TMP52:%.*]] = and <4 x i1> [[TMP48]], [[TMP28]]
-; AVX1-NEXT:    [[TMP53:%.*]] = and <4 x i1> [[TMP49]], [[TMP29]]
-; AVX1-NEXT:    [[TMP54:%.*]] = and <4 x i1> [[TMP50]], [[TMP30]]
-; AVX1-NEXT:    [[TMP55:%.*]] = and <4 x i1> [[TMP51]], [[TMP31]]
+; AVX1-NEXT:    [[TMP52:%.*]] = select <4 x i1> [[TMP28]], <4 x i1> [[TMP48]], <4 x i1> zeroinitializer
+; AVX1-NEXT:    [[TMP53:%.*]] = select <4 x i1> [[TMP29]], <4 x i1> [[TMP49]], <4 x i1> zeroinitializer
+; AVX1-NEXT:    [[TMP54:%.*]] = select <4 x i1> [[TMP30]], <4 x i1> [[TMP50]], <4 x i1> zeroinitializer
+; AVX1-NEXT:    [[TMP55:%.*]] = select <4 x i1> [[TMP31]], <4 x i1> [[TMP51]], <4 x i1> zeroinitializer
 ; AVX1-NEXT:    [[TMP56:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 0
 ; AVX1-NEXT:    [[TMP57:%.*]] = bitcast double* [[TMP56]] to <4 x double>*
 ; AVX1-NEXT:    call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP57]], i32 8, <4 x i1> [[TMP52]])
@@ -2166,10 +2166,10 @@ define void @foo7(double* noalias nocapture %out, double** noalias nocapture rea
 ; AVX2-NEXT:    [[TMP49:%.*]] = xor <4 x i1> [[TMP41]], <i1 true, i1 true, i1 true, i1 true>
 ; AVX2-NEXT:    [[TMP50:%.*]] = xor <4 x i1> [[TMP42]], <i1 true, i1 true, i1 true, i1 true>
 ; AVX2-NEXT:    [[TMP51:%.*]] = xor <4 x i1> [[TMP43]], <i1 true, i1 true, i1 true, i1 true>
-; AVX2-NEXT:    [[TMP52:%.*]] = and <4 x i1> [[TMP48]], [[TMP28]]
-; AVX2-NEXT:    [[TMP53:%.*]] = and <4 x i1> [[TMP49]], [[TMP29]]
-; AVX2-NEXT:    [[TMP54:%.*]] = and <4 x i1> [[TMP50]], [[TMP30]]
-; AVX2-NEXT:    [[TMP55:%.*]] = and <4 x i1> [[TMP51]], [[TMP31]]
+; AVX2-NEXT:    [[TMP52:%.*]] = select <4 x i1> [[TMP28]], <4 x i1> [[TMP48]], <4 x i1> zeroinitializer
+; AVX2-NEXT:    [[TMP53:%.*]] = select <4 x i1> [[TMP29]], <4 x i1> [[TMP49]], <4 x i1> zeroinitializer
+; AVX2-NEXT:    [[TMP54:%.*]] = select <4 x i1> [[TMP30]], <4 x i1> [[TMP50]], <4 x i1> zeroinitializer
+; AVX2-NEXT:    [[TMP55:%.*]] = select <4 x i1> [[TMP31]], <4 x i1> [[TMP51]], <4 x i1> zeroinitializer
 ; AVX2-NEXT:    [[TMP56:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 0
 ; AVX2-NEXT:    [[TMP57:%.*]] = bitcast double* [[TMP56]] to <4 x double>*
 ; AVX2-NEXT:    call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP57]], i32 8, <4 x i1> [[TMP52]])
@@ -2290,10 +2290,10 @@ define void @foo7(double* noalias nocapture %out, double** noalias nocapture rea
 ; AVX512-NEXT:    [[TMP49:%.*]] = xor <8 x i1> [[TMP41]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
 ; AVX512-NEXT:    [[TMP50:%.*]] = xor <8 x i1> [[TMP42]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
 ; AVX512-NEXT:    [[TMP51:%.*]] = xor <8 x i1> [[TMP43]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
-; AVX512-NEXT:    [[TMP52:%.*]] = and <8 x i1> [[TMP48]], [[TMP28]]
-; AVX512-NEXT:    [[TMP53:%.*]] = and <8 x i1> [[TMP49]], [[TMP29]]
-; AVX512-NEXT:    [[TMP54:%.*]] = and <8 x i1> [[TMP50]], [[TMP30]]
-; AVX512-NEXT:    [[TMP55:%.*]] = and <8 x i1> [[TMP51]], [[TMP31]]
+; AVX512-NEXT:    [[TMP52:%.*]] = select <8 x i1> [[TMP28]], <8 x i1> [[TMP48]], <8 x i1> zeroinitializer
+; AVX512-NEXT:    [[TMP53:%.*]] = select <8 x i1> [[TMP29]], <8 x i1> [[TMP49]], <8 x i1> zeroinitializer
+; AVX512-NEXT:    [[TMP54:%.*]] = select <8 x i1> [[TMP30]], <8 x i1> [[TMP50]], <8 x i1> zeroinitializer
+; AVX512-NEXT:    [[TMP55:%.*]] = select <8 x i1> [[TMP31]], <8 x i1> [[TMP51]], <8 x i1> zeroinitializer
 ; AVX512-NEXT:    [[TMP56:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 0
 ; AVX512-NEXT:    [[TMP57:%.*]] = bitcast double* [[TMP56]] to <8 x double>*
 ; AVX512-NEXT:    call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <8 x double>* [[TMP57]], i32 8, <8 x i1> [[TMP52]])
@@ -2459,10 +2459,10 @@ define void @foo8(double* noalias nocapture %out, i32 ()** noalias nocapture rea
 ; AVX1-NEXT:    [[TMP49:%.*]] = xor <4 x i1> [[TMP41]], <i1 true, i1 true, i1 true, i1 true>
 ; AVX1-NEXT:    [[TMP50:%.*]] = xor <4 x i1> [[TMP42]], <i1 true, i1 true, i1 true, i1 true>
 ; AVX1-NEXT:    [[TMP51:%.*]] = xor <4 x i1> [[TMP43]], <i1 true, i1 true, i1 true, i1 true>
-; AVX1-NEXT:    [[TMP52:%.*]] = and <4 x i1> [[TMP48]], [[TMP28]]
-; AVX1-NEXT:    [[TMP53:%.*]] = and <4 x i1> [[TMP49]], [[TMP29]]
-; AVX1-NEXT:    [[TMP54:%.*]] = and <4 x i1> [[TMP50]], [[TMP30]]
-; AVX1-NEXT:    [[TMP55:%.*]] = and <4 x i1> [[TMP51]], [[TMP31]]
+; AVX1-NEXT:    [[TMP52:%.*]] = select <4 x i1> [[TMP28]], <4 x i1> [[TMP48]], <4 x i1> zeroinitializer
+; AVX1-NEXT:    [[TMP53:%.*]] = select <4 x i1> [[TMP29]], <4 x i1> [[TMP49]], <4 x i1> zeroinitializer
+; AVX1-NEXT:    [[TMP54:%.*]] = select <4 x i1> [[TMP30]], <4 x i1> [[TMP50]], <4 x i1> zeroinitializer
+; AVX1-NEXT:    [[TMP55:%.*]] = select <4 x i1> [[TMP31]], <4 x i1> [[TMP51]], <4 x i1> zeroinitializer
 ; AVX1-NEXT:    [[TMP56:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 0
 ; AVX1-NEXT:    [[TMP57:%.*]] = bitcast double* [[TMP56]] to <4 x double>*
 ; AVX1-NEXT:    call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP57]], i32 8, <4 x i1> [[TMP52]])
@@ -2583,10 +2583,10 @@ define void @foo8(double* noalias nocapture %out, i32 ()** noalias nocapture rea
 ; AVX2-NEXT:    [[TMP49:%.*]] = xor <4 x i1> [[TMP41]], <i1 true, i1 true, i1 true, i1 true>
 ; AVX2-NEXT:    [[TMP50:%.*]] = xor <4 x i1> [[TMP42]], <i1 true, i1 true, i1 true, i1 true>
 ; AVX2-NEXT:    [[TMP51:%.*]] = xor <4 x i1> [[TMP43]], <i1 true, i1 true, i1 true, i1 true>
-; AVX2-NEXT:    [[TMP52:%.*]] = and <4 x i1> [[TMP48]], [[TMP28]]
-; AVX2-NEXT:    [[TMP53:%.*]] = and <4 x i1> [[TMP49]], [[TMP29]]
-; AVX2-NEXT:    [[TMP54:%.*]] = and <4 x i1> [[TMP50]], [[TMP30]]
-; AVX2-NEXT:    [[TMP55:%.*]] = and <4 x i1> [[TMP51]], [[TMP31]]
+; AVX2-NEXT:    [[TMP52:%.*]] = select <4 x i1> [[TMP28]], <4 x i1> [[TMP48]], <4 x i1> zeroinitializer
+; AVX2-NEXT:    [[TMP53:%.*]] = select <4 x i1> [[TMP29]], <4 x i1> [[TMP49]], <4 x i1> zeroinitializer
+; AVX2-NEXT:    [[TMP54:%.*]] = select <4 x i1> [[TMP30]], <4 x i1> [[TMP50]], <4 x i1> zeroinitializer
+; AVX2-NEXT:    [[TMP55:%.*]] = select <4 x i1> [[TMP31]], <4 x i1> [[TMP51]], <4 x i1> zeroinitializer
 ; AVX2-NEXT:    [[TMP56:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 0
 ; AVX2-NEXT:    [[TMP57:%.*]] = bitcast double* [[TMP56]] to <4 x double>*
 ; AVX2-NEXT:    call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP57]], i32 8, <4 x i1> [[TMP52]])
@@ -2707,10 +2707,10 @@ define void @foo8(double* noalias nocapture %out, i32 ()** noalias nocapture rea
 ; AVX512-NEXT:    [[TMP49:%.*]] = xor <8 x i1> [[TMP41]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
 ; AVX512-NEXT:    [[TMP50:%.*]] = xor <8 x i1> [[TMP42]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
 ; AVX512-NEXT:    [[TMP51:%.*]] = xor <8 x i1> [[TMP43]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
-; AVX512-NEXT:    [[TMP52:%.*]] = and <8 x i1> [[TMP48]], [[TMP28]]
-; AVX512-NEXT:    [[TMP53:%.*]] = and <8 x i1> [[TMP49]], [[TMP29]]
-; AVX512-NEXT:    [[TMP54:%.*]] = and <8 x i1> [[TMP50]], [[TMP30]]
-; AVX512-NEXT:    [[TMP55:%.*]] = and <8 x i1> [[TMP51]], [[TMP31]]
+; AVX512-NEXT:    [[TMP52:%.*]] = select <8 x i1> [[TMP28]], <8 x i1> [[TMP48]], <8 x i1> zeroinitializer
+; AVX512-NEXT:    [[TMP53:%.*]] = select <8 x i1> [[TMP29]], <8 x i1> [[TMP49]], <8 x i1> zeroinitializer
+; AVX512-NEXT:    [[TMP54:%.*]] = select <8 x i1> [[TMP30]], <8 x i1> [[TMP50]], <8 x i1> zeroinitializer
+; AVX512-NEXT:    [[TMP55:%.*]] = select <8 x i1> [[TMP31]], <8 x i1> [[TMP51]], <8 x i1> zeroinitializer
 ; AVX512-NEXT:    [[TMP56:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 0
 ; AVX512-NEXT:    [[TMP57:%.*]] = bitcast double* [[TMP56]] to <8 x double>*
 ; AVX512-NEXT:    call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <8 x double>* [[TMP57]], i32 8, <8 x i1> [[TMP52]])
diff --git a/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll b/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll
index 285f460d99d5..aa8b1361fe4e 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll
@@ -408,7 +408,7 @@ define dso_local void @masked_strided1_optsize_unknown_tc(i8* noalias nocapture
 ; DISABLED_MASKED_STRIDED-NEXT:    [[TMP0:%.*]] = icmp ugt <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
 ; DISABLED_MASKED_STRIDED-NEXT:    [[TMP1:%.*]] = icmp ule <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
 ; DISABLED_MASKED_STRIDED-NEXT:    [[TMP2:%.*]] = shl nuw nsw <8 x i32> [[VEC_IND]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP3:%.*]] = and <8 x i1> [[TMP0]], [[TMP1]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP3:%.*]] = and <8 x i1> [[TMP1]], [[TMP0]]
 ; DISABLED_MASKED_STRIDED-NEXT:    [[TMP4:%.*]] = extractelement <8 x i1> [[TMP3]], i32 0
 ; DISABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP4]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
 ; DISABLED_MASKED_STRIDED:       pred.load.if:
@@ -520,7 +520,7 @@ define dso_local void @masked_strided1_optsize_unknown_tc(i8* noalias nocapture
 ; ENABLED_MASKED_STRIDED-NEXT:    [[TMP1:%.*]] = icmp ule <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
 ; ENABLED_MASKED_STRIDED-NEXT:    [[TMP2:%.*]] = shl nuw nsw i32 [[INDEX]], 1
 ; ENABLED_MASKED_STRIDED-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP2]]
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP4:%.*]] = and <8 x i1> [[TMP0]], [[TMP1]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP4:%.*]] = and <8 x i1> [[TMP1]], [[TMP0]]
 ; ENABLED_MASKED_STRIDED-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP3]] to <16 x i8>*
 ; ENABLED_MASKED_STRIDED-NEXT:    [[INTERLEAVED_MASK:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> poison, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
 ; ENABLED_MASKED_STRIDED-NEXT:    [[TMP6:%.*]] = and <16 x i1> [[INTERLEAVED_MASK]], <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false>
@@ -615,7 +615,7 @@ define dso_local void @masked_strided3_optsize_unknown_tc(i8* noalias nocapture
 ; DISABLED_MASKED_STRIDED-NEXT:    [[TMP0:%.*]] = icmp ugt <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
 ; DISABLED_MASKED_STRIDED-NEXT:    [[TMP1:%.*]] = icmp ule <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
 ; DISABLED_MASKED_STRIDED-NEXT:    [[TMP2:%.*]] = mul nsw <8 x i32> [[VEC_IND]], <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP3:%.*]] = and <8 x i1> [[TMP0]], [[TMP1]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP3:%.*]] = and <8 x i1> [[TMP1]], [[TMP0]]
 ; DISABLED_MASKED_STRIDED-NEXT:    [[TMP4:%.*]] = extractelement <8 x i1> [[TMP3]], i32 0
 ; DISABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP4]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
 ; DISABLED_MASKED_STRIDED:       pred.load.if:
@@ -727,7 +727,7 @@ define dso_local void @masked_strided3_optsize_unknown_tc(i8* noalias nocapture
 ; ENABLED_MASKED_STRIDED-NEXT:    [[TMP1:%.*]] = icmp ule <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
 ; ENABLED_MASKED_STRIDED-NEXT:    [[TMP2:%.*]] = mul nsw i32 [[INDEX]], 3
 ; ENABLED_MASKED_STRIDED-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP2]]
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP4:%.*]] = and <8 x i1> [[TMP0]], [[TMP1]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP4:%.*]] = and <8 x i1> [[TMP1]], [[TMP0]]
 ; ENABLED_MASKED_STRIDED-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP3]] to <24 x i8>*
 ; ENABLED_MASKED_STRIDED-NEXT:    [[INTERLEAVED_MASK:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7>
 ; ENABLED_MASKED_STRIDED-NEXT:    [[TMP6:%.*]] = and <24 x i1> [[INTERLEAVED_MASK]], <i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false>
@@ -1535,7 +1535,7 @@ define dso_local void @masked_strided2_unknown_tc(i8* noalias nocapture readonly
 ; DISABLED_MASKED_STRIDED-NEXT:    [[TMP0:%.*]] = icmp sgt <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
 ; DISABLED_MASKED_STRIDED-NEXT:    [[TMP1:%.*]] = icmp ule <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
 ; DISABLED_MASKED_STRIDED-NEXT:    [[TMP2:%.*]] = shl nuw nsw <8 x i32> [[VEC_IND]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-; DISABLED_MASKED_STRIDED-NEXT:    [[TMP3:%.*]] = and <8 x i1> [[TMP0]], [[TMP1]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP3:%.*]] = and <8 x i1> [[TMP1]], [[TMP0]]
 ; DISABLED_MASKED_STRIDED-NEXT:    [[TMP4:%.*]] = extractelement <8 x i1> [[TMP3]], i32 0
 ; DISABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP4]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
 ; DISABLED_MASKED_STRIDED:       pred.load.if:
@@ -1871,7 +1871,7 @@ define dso_local void @masked_strided2_unknown_tc(i8* noalias nocapture readonly
 ; ENABLED_MASKED_STRIDED-NEXT:    [[TMP1:%.*]] = icmp ule <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
 ; ENABLED_MASKED_STRIDED-NEXT:    [[TMP2:%.*]] = shl nuw nsw i32 [[INDEX]], 1
 ; ENABLED_MASKED_STRIDED-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP2]]
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP4:%.*]] = and <8 x i1> [[TMP0]], [[TMP1]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP4:%.*]] = and <8 x i1> [[TMP1]], [[TMP0]]
 ; ENABLED_MASKED_STRIDED-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP3]] to <16 x i8>*
 ; ENABLED_MASKED_STRIDED-NEXT:    [[INTERLEAVED_MASK:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> poison, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
 ; ENABLED_MASKED_STRIDED-NEXT:    [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP5]], i32 1, <16 x i1> [[INTERLEAVED_MASK]], <16 x i8> poison)
diff --git a/llvm/test/Transforms/LoopVectorize/if-conversion-nest.ll b/llvm/test/Transforms/LoopVectorize/if-conversion-nest.ll
index 0cba3fc20ed9..f218869c1fbe 100644
--- a/llvm/test/Transforms/LoopVectorize/if-conversion-nest.ll
+++ b/llvm/test/Transforms/LoopVectorize/if-conversion-nest.ll
@@ -39,7 +39,7 @@ define i32 @foo(i32* nocapture %A, i32* nocapture %B, i32 %n) {
 ; CHECK-NEXT:    [[TMP11:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], <i32 19, i32 19, i32 19, i32 19>
 ; CHECK-NEXT:    [[TMP12:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD6]], <i32 4, i32 4, i32 4, i32 4>
 ; CHECK-NEXT:    [[TMP13:%.*]] = select <4 x i1> [[TMP12]], <4 x i32> <i32 4, i32 4, i32 4, i32 4>, <4 x i32> <i32 5, i32 5, i32 5, i32 5>
-; CHECK-NEXT:    [[TMP14:%.*]] = and <4 x i1> [[TMP11]], [[TMP10]]
+; CHECK-NEXT:    [[TMP14:%.*]] = and <4 x i1> [[TMP10]], [[TMP11]]
 ; CHECK-NEXT:    [[TMP15:%.*]] = xor <4 x i1> [[TMP11]], <i1 true, i1 true, i1 true, i1 true>
 ; CHECK-NEXT:    [[TMP16:%.*]] = and <4 x i1> [[TMP10]], [[TMP15]]
 ; CHECK-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[TMP14]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>, <4 x i32> <i32 9, i32 9, i32 9, i32 9>
diff --git a/llvm/test/Transforms/LoopVectorize/if-pred-non-void.ll b/llvm/test/Transforms/LoopVectorize/if-pred-non-void.ll
index 308377f06856..b8d9b458aa4c 100644
--- a/llvm/test/Transforms/LoopVectorize/if-pred-non-void.ll
+++ b/llvm/test/Transforms/LoopVectorize/if-pred-non-void.ll
@@ -161,7 +161,7 @@ for.cond.cleanup:                                 ; preds = %if.end
 ; CHECK: %[[CMP1:.+]] = icmp slt <2 x i32> %[[VAL:.+]], <i32 100, i32 100>
 ; CHECK: %[[CMP2:.+]] = icmp sge <2 x i32> %[[VAL]], <i32 200, i32 200>
 ; CHECK: %[[NOT:.+]] = xor <2 x i1> %[[CMP1]], <i1 true, i1 true>
-; CHECK: %[[AND:.+]] = and <2 x i1> %[[CMP2]], %[[NOT]]
+; CHECK: %[[AND:.+]] = select <2 x i1> %[[NOT]], <2 x i1> %[[CMP2]], <2 x i1> zeroinitializer
 ; CHECK: %[[OR:.+]] = or <2 x i1> %[[AND]], %[[CMP1]]
 ; CHECK: %[[EXTRACT:.+]] = extractelement <2 x i1> %[[OR]], i32 0
 ; CHECK: br i1 %[[EXTRACT]], label %[[THEN:[a-zA-Z0-9.]+]], label %[[FI:[a-zA-Z0-9.]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/if-reduction.ll b/llvm/test/Transforms/LoopVectorize/if-reduction.ll
index a97301659cb9..bde4fbcc9d13 100644
--- a/llvm/test/Transforms/LoopVectorize/if-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/if-reduction.ll
@@ -610,9 +610,9 @@ for.end:                                          ; preds = %for.body, %entry
 ; CHECK-DAG: %[[M1:.*]] = fmul fast <4 x float> %[[V0]], <float 3.000000e+00,
 ; CHECK-DAG: %[[M2:.*]] = fmul fast <4 x float> %[[V0]], <float 2.000000e+00,
 ; CHECK: %[[C11:.*]] = xor <4 x i1> %[[C1]], <i1 true,
-; CHECK-DAG: %[[C12:.*]] = and <4 x i1> %[[C2]], %[[C11]]
+; CHECK-DAG: %[[C12:.*]] = select <4 x i1> %[[C11]], <4 x i1> %[[C2]], <4 x i1> zeroinitializer
 ; CHECK-DAG: %[[C21:.*]] = xor <4 x i1> %[[C2]], <i1 true,
-; CHECK: %[[C22:.*]] = and <4 x i1> %[[C21]], %[[C11]]
+; CHECK: %[[C22:.*]] = select <4 x i1> %[[C11]], <4 x i1> %[[C21]], <4 x i1> zeroinitializer
 ; CHECK: %[[S1:.*]] = select <4 x i1> %[[C22]], <4 x float> %[[M1]], <4 x float> %[[M2]]
 ; CHECK: %[[S2:.*]] = select <4 x i1> %[[C1]], <4 x float> %[[V0]], <4 x float> %[[S1]]
 ; CHECK: fadd fast <4 x float> %[[S2]],
@@ -678,9 +678,9 @@ for.end:                                          ; preds = %for.inc, %entry
 ; CHECK-DAG: %[[SUB:.*]] = fsub fast <4 x float>
 ; CHECK-DAG: %[[ADD:.*]] = fadd fast <4 x float>
 ; CHECK: %[[C11:.*]] = xor <4 x i1> %[[C1]], <i1 true,
-; CHECK-DAG: %[[C12:.*]] = and <4 x i1> %[[C2]], %[[C11]]
+; CHECK-DAG: %[[C12:.*]] = select <4 x i1> %[[C11]], <4 x i1> %[[C2]], <4 x i1> zeroinitializer
 ; CHECK-DAG: %[[C21:.*]] = xor <4 x i1> %[[C2]], <i1 true,
-; CHECK: %[[C22:.*]] = and <4 x i1> %[[C21]], %[[C11]]
+; CHECK: %[[C22:.*]] = select <4 x i1> %[[C11]], <4 x i1> %[[C21]], <4 x i1> zeroinitializer
 ; CHECK: %[[S1:.*]] = select <4 x i1> %[[C12]], <4 x float> %[[SUB]], <4 x float> %[[ADD]]
 ; CHECK: %[[S2:.*]] = select <4 x i1> %[[C22]], {{.*}} <4 x float> %[[S1]]
 define float @fcmp_fadd_fsub(float* nocapture readonly %a, i32 %n) nounwind readonly {
diff --git a/llvm/test/Transforms/LoopVectorize/pr48832.ll b/llvm/test/Transforms/LoopVectorize/pr48832.ll
new file mode 100644
index 000000000000..620da918bb47
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/pr48832.ll
@@ -0,0 +1,40 @@
+; RUN: opt -loop-vectorize -force-vector-width=4 -S -o - < %s | FileCheck %s
+%arrayt = type [64 x i32]
+
+@v_146 = external global %arrayt, align 1
+
+; Since the program has well defined behavior, it should not introduce store poison
+; CHECK: vector.ph:
+; CHECK-NEXT: br label %vector.body
+; CHECK: vector.body:
+; CHECK: store <4 x i32> zeroinitializer,
+; CHECK: br i1 %{{.*}}, label %middle.block, label %vector.body
+
+define void @foo() {
+entry:
+  br label %for.cond
+
+for.cond:                                         ; preds = %cond.end, %entry
+  %storemerge = phi i16 [ 0, %entry ], [ %inc, %cond.end ]
+  %cmp = icmp slt i16 %storemerge, 15
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  br i1 true, label %cond.false, label %land.rhs
+
+land.rhs:                                         ; preds = %for.body
+  br i1 poison, label %cond.end, label %cond.false
+
+cond.false:                                       ; preds = %for.body, %land.rhs
+  br label %cond.end
+
+cond.end:                                         ; preds = %land.rhs, %cond.false
+  %cond = phi i32 [ 0, %cond.false ], [ 1, %land.rhs ]
+  %arrayidx = getelementptr inbounds %arrayt, %arrayt* @v_146, i16 0, i16 %storemerge
+  store i32 %cond, i32* %arrayidx, align 1
+  %inc = add nsw i16 %storemerge, 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/reduction-inloop-pred.ll b/llvm/test/Transforms/LoopVectorize/reduction-inloop-pred.ll
index d1b99e4e403b..e8271b9c5984 100644
--- a/llvm/test/Transforms/LoopVectorize/reduction-inloop-pred.ll
+++ b/llvm/test/Transforms/LoopVectorize/reduction-inloop-pred.ll
@@ -1542,8 +1542,8 @@ define float @reduction_conditional(float* %A, float* %B, float* %C, float %S) {
 ; CHECK-NEXT:    [[TMP5:%.*]] = fcmp ogt <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = fcmp ule <4 x float> [[WIDE_LOAD1]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
 ; CHECK-NEXT:    [[TMP7:%.*]] = fcmp ogt <4 x float> [[WIDE_LOAD]], <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
-; CHECK-NEXT:    [[TMP8:%.*]] = and <4 x i1> [[TMP6]], [[TMP5]]
-; CHECK-NEXT:    [[TMP9:%.*]] = and <4 x i1> [[TMP7]], [[TMP8]]
+; CHECK-NEXT:    [[TMP8:%.*]] = and <4 x i1> [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[TMP9:%.*]] = and <4 x i1> [[TMP8]], [[TMP7]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = xor <4 x i1> [[TMP7]], <i1 true, i1 true, i1 true, i1 true>
 ; CHECK-NEXT:    [[TMP11:%.*]] = and <4 x i1> [[TMP8]], [[TMP10]]
 ; CHECK-NEXT:    [[TMP12:%.*]] = xor <4 x i1> [[TMP5]], <i1 true, i1 true, i1 true, i1 true>
diff --git a/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll b/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll
index 23bfc39bf646..b295090ca928 100644
--- a/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll
+++ b/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll
@@ -814,8 +814,8 @@ define float @reduction_conditional(float* %A, float* %B, float* %C, float %S) {
 ; CHECK-NEXT:    [[TMP5:%.*]] = fcmp ogt <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = fcmp ule <4 x float> [[WIDE_LOAD1]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
 ; CHECK-NEXT:    [[TMP7:%.*]] = fcmp ogt <4 x float> [[WIDE_LOAD]], <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
-; CHECK-NEXT:    [[TMP8:%.*]] = and <4 x i1> [[TMP6]], [[TMP5]]
-; CHECK-NEXT:    [[TMP9:%.*]] = and <4 x i1> [[TMP7]], [[TMP8]]
+; CHECK-NEXT:    [[TMP8:%.*]] = and <4 x i1> [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[TMP9:%.*]] = and <4 x i1> [[TMP8]], [[TMP7]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = xor <4 x i1> [[TMP7]], <i1 true, i1 true, i1 true, i1 true>
 ; CHECK-NEXT:    [[TMP11:%.*]] = and <4 x i1> [[TMP8]], [[TMP10]]
 ; CHECK-NEXT:    [[TMP12:%.*]] = xor <4 x i1> [[TMP5]], <i1 true, i1 true, i1 true, i1 true>

From bff59aca162ef16d7634dc9df39f1f3af31ecb93 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@sifive.com>
Date: Thu, 4 Mar 2021 22:30:38 -0800
Subject: [PATCH 185/318] [TargetLowering] Use HandleSDNodes to prevent nodes
 from being deleted by recursive calls in getNegatedExpression.

For binary or ternary ops we call getNegatedExpression multiple
times and then compare costs. While we're doing this we need to
hold a node from the first call across the second call, but its
not yet attached to the DAG. Its possible the second call creates
an identical node and then decides it didn't need it so will try
to delete it if it has no uses. This can cause a reference to the
node we're holding further up the call stack to become invalidated.

To prevent this, we can use a HandleSDNode to artifically give
the node a use without connecting it to the DAG.

I've used a std::list of HandleSDNodes so we can create handles
only when we have a node to hold. HandleSDNode does not have
default constructor and cannot be copied or moved.

Fixes PR49393.

Reviewed By: spatel

Differential Revision: https://reviews.llvm.org/D97914

(cherry picked from commit 74e6030bcbcc8e628f9a99a424342a0c656456f9)
---
 .../CodeGen/SelectionDAG/TargetLowering.cpp   | 29 ++++++++++
 llvm/test/CodeGen/X86/pr49393.ll              | 55 +++++++++++++++++++
 2 files changed, 84 insertions(+)
 create mode 100644 llvm/test/CodeGen/X86/pr49393.ll

diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 7145fc91d5f3..b0ad86899d25 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -5935,6 +5935,11 @@ SDValue TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG,
 
   SDLoc DL(Op);
 
+  // Because getNegatedExpression can delete nodes we need a handle to keep
+  // temporary nodes alive in case the recursion manages to create an identical
+  // node.
+  std::list<HandleSDNode> Handles;
+
   switch (Opcode) {
   case ISD::ConstantFP: {
     // Don't invert constant FP values after legalization unless the target says
@@ -6003,11 +6008,18 @@ SDValue TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG,
     NegatibleCost CostX = NegatibleCost::Expensive;
     SDValue NegX =
         getNegatedExpression(X, DAG, LegalOps, OptForSize, CostX, Depth);
+    // Prevent this node from being deleted by the next call.
+    if (NegX)
+      Handles.emplace_back(NegX);
+
     // fold (fneg (fadd X, Y)) -> (fsub (fneg Y), X)
     NegatibleCost CostY = NegatibleCost::Expensive;
     SDValue NegY =
         getNegatedExpression(Y, DAG, LegalOps, OptForSize, CostY, Depth);
 
+    // We're done with the handles.
+    Handles.clear();
+
     // Negate the X if its cost is less or equal than Y.
     if (NegX && (CostX <= CostY)) {
       Cost = CostX;
@@ -6052,11 +6064,18 @@ SDValue TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG,
     NegatibleCost CostX = NegatibleCost::Expensive;
     SDValue NegX =
         getNegatedExpression(X, DAG, LegalOps, OptForSize, CostX, Depth);
+    // Prevent this node from being deleted by the next call.
+    if (NegX)
+      Handles.emplace_back(NegX);
+
     // fold (fneg (fmul X, Y)) -> (fmul X, (fneg Y))
     NegatibleCost CostY = NegatibleCost::Expensive;
     SDValue NegY =
         getNegatedExpression(Y, DAG, LegalOps, OptForSize, CostY, Depth);
 
+    // We're done with the handles.
+    Handles.clear();
+
     // Negate the X if its cost is less or equal than Y.
     if (NegX && (CostX <= CostY)) {
       Cost = CostX;
@@ -6094,15 +6113,25 @@ SDValue TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG,
     if (!NegZ)
       break;
 
+    // Prevent this node from being deleted by the next two calls.
+    Handles.emplace_back(NegZ);
+
     // fold (fneg (fma X, Y, Z)) -> (fma (fneg X), Y, (fneg Z))
     NegatibleCost CostX = NegatibleCost::Expensive;
     SDValue NegX =
         getNegatedExpression(X, DAG, LegalOps, OptForSize, CostX, Depth);
+    // Prevent this node from being deleted by the next call.
+    if (NegX)
+      Handles.emplace_back(NegX);
+
     // fold (fneg (fma X, Y, Z)) -> (fma X, (fneg Y), (fneg Z))
     NegatibleCost CostY = NegatibleCost::Expensive;
     SDValue NegY =
         getNegatedExpression(Y, DAG, LegalOps, OptForSize, CostY, Depth);
 
+    // We're done with the handles.
+    Handles.clear();
+
     // Negate the X if its cost is less or equal than Y.
     if (NegX && (CostX <= CostY)) {
       Cost = std::min(CostX, CostZ);
diff --git a/llvm/test/CodeGen/X86/pr49393.ll b/llvm/test/CodeGen/X86/pr49393.ll
new file mode 100644
index 000000000000..9952b90fc7b7
--- /dev/null
+++ b/llvm/test/CodeGen/X86/pr49393.ll
@@ -0,0 +1,55 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck %s
+
+define void @f() {
+; CHECK-LABEL: f:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    .p2align 4, 0x90
+; CHECK-NEXT:  .LBB0_1: # %for.cond
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    imull %eax, %eax
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    movapd %xmm0, %xmm1
+; CHECK-NEXT:    mulsd %xmm0, %xmm1
+; CHECK-NEXT:    subsd %xmm0, %xmm1
+; CHECK-NEXT:    cwtl
+; CHECK-NEXT:    xorps %xmm2, %xmm2
+; CHECK-NEXT:    cvtsi2sd %eax, %xmm2
+; CHECK-NEXT:    mulsd %xmm0, %xmm2
+; CHECK-NEXT:    mulsd %xmm0, %xmm2
+; CHECK-NEXT:    movapd %xmm2, %xmm3
+; CHECK-NEXT:    mulsd %xmm1, %xmm3
+; CHECK-NEXT:    mulsd %xmm0, %xmm2
+; CHECK-NEXT:    subsd %xmm3, %xmm1
+; CHECK-NEXT:    addsd %xmm2, %xmm1
+; CHECK-NEXT:    cvttsd2si %xmm1, %eax
+; CHECK-NEXT:    jmp .LBB0_1
+entry:
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.cond, %entry
+  %b.0 = phi i16 [ 0, %entry ], [ %conv77, %for.cond ]
+  %mul18 = mul i16 %b.0, %b.0
+  %arrayidx.real = load double, double* undef, align 1
+  %arrayidx.imag = load double, double* undef, align 1
+  %mul_ac = fmul fast double %arrayidx.real, %arrayidx.real
+  %0 = fadd fast double 0.000000e+00, %arrayidx.real
+  %sub.r = fsub fast double %mul_ac, %0
+  %sub.i = fsub fast double 0.000000e+00, %arrayidx.imag
+  %conv28 = sitofp i16 %mul18 to double
+  %mul_bc32 = fmul fast double %arrayidx.imag, %conv28
+  %mul_bd46 = fmul fast double %mul_bc32, %arrayidx.imag
+  %mul_r49 = fsub fast double 0.000000e+00, %mul_bd46
+  %mul_ac59 = fmul fast double %mul_r49, %sub.r
+  %mul_bc48 = fmul fast double %mul_bc32, %arrayidx.real
+  %mul_i50 = fadd fast double 0.000000e+00, %mul_bc48
+  %1 = fmul fast double %mul_i50, %sub.i
+  %.neg = fneg fast double %0
+  %.neg19 = fmul fast double %1, -1.000000e+00
+  %.neg20 = fadd fast double %.neg, %mul_ac
+  %2 = fadd fast double %.neg20, %mul_ac59
+  %sub.r75 = fadd fast double %2, %.neg19
+  %conv77 = fptosi double %sub.r75 to i16
+  br label %for.cond
+}

From 15d1ee36720ff24323f55452ae3cfb63f318c3f3 Mon Sep 17 00:00:00 2001
From: Raul Tambre <raul.tambre@cleveron.com>
Date: Sat, 6 Mar 2021 11:45:57 +0200
Subject: [PATCH 186/318] [CMake][compiler-rt] Use copying instead of
 symlinking for LSE builtins on non-Unix-likes

As reported in D93278 post-review symlinking requires privilege escalation on Windows.
Copying is functionally same, so fallback to it for systems that aren't Unix-like.
This is similar to the solution in AddLLVM.cmake.

Reviewed By: ikudrin

Differential Revision: https://reviews.llvm.org/D98111

(cherry picked from commit ba860963b156db3b653c67ef044df877f3cea9cc)
---
 compiler-rt/lib/builtins/CMakeLists.txt | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/compiler-rt/lib/builtins/CMakeLists.txt b/compiler-rt/lib/builtins/CMakeLists.txt
index b511a9a987b3..73b6bead8424 100644
--- a/compiler-rt/lib/builtins/CMakeLists.txt
+++ b/compiler-rt/lib/builtins/CMakeLists.txt
@@ -515,6 +515,12 @@ set(aarch64_SOURCES
 set(OA_HELPERS_DIR "${CMAKE_CURRENT_BINARY_DIR}/outline_atomic_helpers.dir")
 file(MAKE_DIRECTORY "${OA_HELPERS_DIR}")
 
+if(CMAKE_HOST_UNIX)
+  set(COMPILER_RT_LINK_OR_COPY create_symlink)
+else()
+  set(COMPILER_RT_LINK_OR_COPY copy)
+endif()
+
 foreach(pat cas swp ldadd ldclr ldeor ldset)
   foreach(size 1 2 4 8 16)
     foreach(model 1 2 3 4)
@@ -522,7 +528,7 @@ foreach(pat cas swp ldadd ldclr ldeor ldset)
         set(helper_asm "${OA_HELPERS_DIR}/outline_atomic_${pat}${size}_${model}.S")
         add_custom_command(
           OUTPUT ${helper_asm}
-          COMMAND ${CMAKE_COMMAND} -E create_symlink "${CMAKE_CURRENT_SOURCE_DIR}/aarch64/lse.S" "${helper_asm}"
+          COMMAND ${CMAKE_COMMAND} -E ${COMPILER_RT_LINK_OR_COPY} "${CMAKE_CURRENT_SOURCE_DIR}/aarch64/lse.S" "${helper_asm}"
         )
         set_source_files_properties("${helper_asm}"
           PROPERTIES

From c016eda3257eb0f67a989065d174bc5e13ed7096 Mon Sep 17 00:00:00 2001
From: Amilendra Kodithuwakku <amilendra.kodithuwakku@arm.com>
Date: Fri, 12 Mar 2021 19:19:29 +0000
Subject: [PATCH 187/318] [release][docs] List all cores Arm has added support
 for in LLVM 12.

Reviewed By: kristof.beyls

Differential Revision: https://reviews.llvm.org/D98277
---
 clang/docs/ReleaseNotes.rst | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 7f4b675b68f9..46e11fcb31cb 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -139,6 +139,15 @@ Modified Compiler Flags
   This behavior matches newer GCC.
   (`D91760 <https://reviews.llvm.org/D91760>`_)
   (`D92054 <https://reviews.llvm.org/D92054>`_)
+- Support has been added for the following processors (command-line identifiers
+  in parentheses):
+  - Arm Cortex-A78C (cortex-a78c).
+  - Arm Cortex-R82 (cortex-r82).
+  - Arm Neoverse V1 (neoverse-v1).
+  - Arm Neoverse N2 (neoverse-n2).
+  - Fujitsu A64FX (a64fx).
+  For example, to select architecture support and tuning for Neoverse-V1 based
+  systems, use ``-mcpu=neoverse-v1``.
 
 Removed Compiler Flags
 -------------------------

From ca14f0282fcec0324b921d27907a704b3a156d0f Mon Sep 17 00:00:00 2001
From: Amilendra Kodithuwakku <amilendra.kodithuwakku@arm.com>
Date: Fri, 12 Mar 2021 20:03:23 +0000
Subject: [PATCH 188/318] [release][docs] List all cores Arm has added support
 for in LLVM 12.

Add new-line before sub-list for proper rendering.

Differential Revision: https://reviews.llvm.org/D98277
---
 clang/docs/ReleaseNotes.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 46e11fcb31cb..451bc65b9f5b 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -141,6 +141,7 @@ Modified Compiler Flags
   (`D92054 <https://reviews.llvm.org/D92054>`_)
 - Support has been added for the following processors (command-line identifiers
   in parentheses):
+
   - Arm Cortex-A78C (cortex-a78c).
   - Arm Cortex-R82 (cortex-r82).
   - Arm Neoverse V1 (neoverse-v1).

From 00441b8f4e5b7daa39ac6cbeb45ebfe54662b08d Mon Sep 17 00:00:00 2001
From: Anastasia Stulova <anastasia.stulova@arm.com>
Date: Tue, 16 Mar 2021 12:07:15 +0000
Subject: [PATCH 189/318] [OpenCL][Docs] Release notes

Differential Revision: https://reviews.llvm.org/D98076
---
 clang/docs/ReleaseNotes.rst | 36 ++++++++++++++++++++++++++++++++----
 1 file changed, 32 insertions(+), 4 deletions(-)

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 451bc65b9f5b..b35d81c60b7b 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -210,10 +210,38 @@ C++1z Feature Support
 Objective-C Language Changes in Clang
 -------------------------------------
 
-OpenCL C Language Changes in Clang
-----------------------------------
-
-...
+OpenCL Kernel Language Changes in Clang
+---------------------------------------
+
+- Improved online documentation: :doc:`UsersManual` and :doc:`OpenCLSupport`
+  pages.
+- Added ``-cl-std=CL3.0`` and predefined version macro for OpenCL 3.0.
+- Added ``-cl-std=CL1.0`` and mapped to the existing OpenCL 1.0 functionality.
+- Improved OpenCL extension handling per target.
+- Added clang extension for function pointers ``__cl_clang_function_pointers``
+  and variadic functions ``__cl_clang_variadic_functions``, more details can be
+  found in :doc:`LanguageExtensions`.
+- Removed extensions without kernel language changes:
+  ``cl_khr_select_fprounding_mode``, ``cl_khr_gl_sharing``, ``cl_khr_icd``,
+  ``cl_khr_gl_event``, ``cl_khr_d3d10_sharing``, ``cl_khr_context_abort``,
+  ``cl_khr_d3d11_sharing``, ``cl_khr_dx9_media_sharing``,
+  ``cl_khr_image2d_from_buffer``, ``cl_khr_initialize_memory``,
+  ``cl_khr_gl_depth_images``, ``cl_khr_spir``, ``cl_khr_egl_event``,
+  ``cl_khr_egl_image``, ``cl_khr_terminate_context``.
+- Improved diagnostics for  unevaluated ``vec_step`` expression.
+- Allow nested pointers (e.g. pointer-to-pointer) kernel arguments beyond OpenCL
+  1.2.
+- Added ``global_device`` and ``global_host`` address spaces for USM
+  allocations.
+
+Miscellaneous improvements in C++ for OpenCL support:
+
+- Added diagnostics for pointers to member functions and references to
+  functions.
+- Added support of ``vec_step`` builtin.
+- Fixed ICE on address spaces with forwarding references and templated copy
+  constructors.
+- Removed warning for variadic macro use.
 
 ABI Changes in Clang
 --------------------

From e3186ba0f3b5a5cf2a42155ff5ee8350cbda1486 Mon Sep 17 00:00:00 2001
From: Maxim Kuvyrkov <maxim.kuvyrkov@linaro.org>
Date: Thu, 18 Mar 2021 16:08:58 +0000
Subject: [PATCH 190/318] [aarch64][WOA64][docs] Release note for WoA-hosted
 LLVM 12 binary

Reviewed By: peterwaller-arm

Differential Revision: https://reviews.llvm.org/D98415
---
 clang/docs/ReleaseNotes.rst | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index b35d81c60b7b..f3499d167361 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -193,6 +193,13 @@ Windows Support
   exception. To workaround (with reduced security), compile with
   /guard:cf,nolongjmp.
 
+- Windows on Arm64: LLVM 12 adds official binary release hosted on
+  Windows on Arm64.  The binary is built and tested by Linaro alongside
+  AArch64 and ARM 32-bit Linux binary releases.  This first WoA release
+  includes Clang compiler, LLD Linker, and compiler-rt runtime libraries.
+  Work on LLDB, sanitizer support, OpenMP, and other features is in progress
+  and will be included in future Windows on Arm64 LLVM releases.
+
 C Language Changes in Clang
 ---------------------------
 

From 4990141a4366eb00abdc8252d7cbb8adeacb9954 Mon Sep 17 00:00:00 2001
From: Maxim Kuvyrkov <maxim.kuvyrkov@linaro.org>
Date: Fri, 19 Mar 2021 13:37:19 +0000
Subject: [PATCH 191/318] [WoA][MSVC] Use default linker setting in
 MSVC-compatible driver [take 2]

At the moment "link.exe" is hard-coded as default linker in MSVC.cpp,
so there's no way to use LLD as default linker for MSVC driver.

This patch adds checking of CLANG_DEFAULT_LINKER to MSVC.cpp and
updates unit-tests that expect link.exe linker to explicitly select it
via -fuse-ld=link, so that buildbots and other builds that set
-DCLANG_DEFAULT_LINKER=foobar don't fail these tests.

This is a squash of
- https://reviews.llvm.org/D98493 (MSVC.cpp change) and
- https://reviews.llvm.org/D98862 (unit-tests change)

Fixes https://bugs.llvm.org/show_bug.cgi?id=49624

Reviewed By: maxim-kuvyrkov

Differential Revision: https://reviews.llvm.org/D98935

(cherry-picked from commit 2049fe58903b68f66872a18e608f40e5233b55fb)
---
 clang/lib/Driver/ToolChains/MSVC.cpp |  6 +++++-
 clang/test/Driver/Xlinker-args.c     |  2 +-
 clang/test/Driver/cl-inputs.c        |  6 +++---
 clang/test/Driver/cl-link-at-file.c  |  2 +-
 clang/test/Driver/cl-link.c          | 22 +++++++++++-----------
 clang/test/Driver/msvc-link.c        |  8 ++++----
 clang/test/OpenMP/linking.c          |  4 ++--
 7 files changed, 27 insertions(+), 23 deletions(-)

diff --git a/clang/lib/Driver/ToolChains/MSVC.cpp b/clang/lib/Driver/ToolChains/MSVC.cpp
index f4b7a57e0bb7..13943b6c404a 100644
--- a/clang/lib/Driver/ToolChains/MSVC.cpp
+++ b/clang/lib/Driver/ToolChains/MSVC.cpp
@@ -11,6 +11,7 @@
 #include "Darwin.h"
 #include "clang/Basic/CharInfo.h"
 #include "clang/Basic/Version.h"
+#include "clang/Config/config.h"
 #include "clang/Driver/Compilation.h"
 #include "clang/Driver/Driver.h"
 #include "clang/Driver/DriverDiagnostic.h"
@@ -520,7 +521,10 @@ void visualstudio::Linker::ConstructJob(Compilation &C, const JobAction &JA,
   // translate 'lld' into 'lld-link', and in the case of the regular msvc
   // linker, we need to use a special search algorithm.
   llvm::SmallString<128> linkPath;
-  StringRef Linker = Args.getLastArgValue(options::OPT_fuse_ld_EQ, "link");
+  StringRef Linker
+    = Args.getLastArgValue(options::OPT_fuse_ld_EQ, CLANG_DEFAULT_LINKER);
+  if (Linker.empty())
+    Linker = "link";
   if (Linker.equals_lower("lld"))
     Linker = "lld-link";
 
diff --git a/clang/test/Driver/Xlinker-args.c b/clang/test/Driver/Xlinker-args.c
index a44957cd8aef..cb045a1d40ac 100644
--- a/clang/test/Driver/Xlinker-args.c
+++ b/clang/test/Driver/Xlinker-args.c
@@ -17,7 +17,7 @@
 // LINUX: "--no-demangle" "-e" "_start" "one" "two" "three" "four" "-z" "five" "-r" {{.*}} "-T" "a.lds"
 
 // Check that we forward '-Xlinker' and '-Wl,' on Windows.
-// RUN: %clang -target i686-pc-win32 -### \
+// RUN: %clang -target i686-pc-win32 -fuse-ld=link -### \
 // RUN:   -Xlinker one -Wl,two %s 2>&1 | \
 // RUN:   FileCheck -check-prefix=WIN %s
 // WIN: link.exe
diff --git a/clang/test/Driver/cl-inputs.c b/clang/test/Driver/cl-inputs.c
index 59455a0aa5e5..8eb44517ee16 100644
--- a/clang/test/Driver/cl-inputs.c
+++ b/clang/test/Driver/cl-inputs.c
@@ -50,16 +50,16 @@
 // RUN: %clang_cl -### /Tc - 2>&1 | FileCheck -check-prefix=STDINTc %s
 // STDINTc: "-x" "c"
 
-// RUN: env LIB=%S/Inputs/cl-libs %clang_cl -### -- %s cl-test.lib 2>&1 | FileCheck -check-prefix=LIBINPUT %s
+// RUN: env LIB=%S/Inputs/cl-libs %clang_cl -fuse-ld=link -### -- %s cl-test.lib 2>&1 | FileCheck -check-prefix=LIBINPUT %s
 // LIBINPUT: link.exe"
 // LIBINPUT: "cl-test.lib"
 
-// RUN: env LIB=%S/Inputs/cl-libs %clang_cl -### -- %s cl-test2.lib 2>&1 | FileCheck -check-prefix=LIBINPUT2 %s
+// RUN: env LIB=%S/Inputs/cl-libs %clang_cl -fuse-ld=link -### -- %s cl-test2.lib 2>&1 | FileCheck -check-prefix=LIBINPUT2 %s
 // LIBINPUT2: error: no such file or directory: 'cl-test2.lib'
 // LIBINPUT2: link.exe"
 // LIBINPUT2-NOT: "cl-test2.lib"
 
-// RUN: %clang_cl -### -- %s /nonexisting.lib 2>&1 | FileCheck -check-prefix=LIBINPUT3 %s
+// RUN: %clang_cl -fuse-ld=link -### -- %s /nonexisting.lib 2>&1 | FileCheck -check-prefix=LIBINPUT3 %s
 // LIBINPUT3: error: no such file or directory: '/nonexisting.lib'
 // LIBINPUT3: link.exe"
 // LIBINPUT3-NOT: "/nonexisting.lib"
diff --git a/clang/test/Driver/cl-link-at-file.c b/clang/test/Driver/cl-link-at-file.c
index 50ae07fadf5b..4e665f89b74e 100644
--- a/clang/test/Driver/cl-link-at-file.c
+++ b/clang/test/Driver/cl-link-at-file.c
@@ -7,7 +7,7 @@
 
 // RUN: echo /link bar.lib baz.lib > %t.args
 // RUN: touch %t.obj
-// RUN: %clang_cl -### @%t.args -- %t.obj 2>&1 | FileCheck %s -check-prefix=ARGS
+// RUN: %clang_cl -fuse-ld=link -### @%t.args -- %t.obj 2>&1 | FileCheck %s -check-prefix=ARGS
 // If the "/link" option captures all remaining args beyond its response file,
 // it will also capture "--" and our input argument. In this case, Clang will
 // be clueless and will emit "argument unused" warnings. If PR17239 is properly
diff --git a/clang/test/Driver/cl-link.c b/clang/test/Driver/cl-link.c
index 142725fed8eb..e2f5397e9133 100644
--- a/clang/test/Driver/cl-link.c
+++ b/clang/test/Driver/cl-link.c
@@ -2,14 +2,14 @@
 // be interpreted as a command-line option, e.g. on Mac where %s is commonly
 // under /Users.
 
-// RUN: %clang_cl /Tc%s -### /link foo bar baz 2>&1 | FileCheck --check-prefix=LINK %s
-// RUN: %clang_cl /Tc%s -### /linkfoo bar baz 2>&1 | FileCheck --check-prefix=LINK %s
+// RUN: %clang_cl /Tc%s -fuse-ld=link -### /link foo bar baz 2>&1 | FileCheck --check-prefix=LINK %s
+// RUN: %clang_cl /Tc%s -fuse-ld=link -### /linkfoo bar baz 2>&1 | FileCheck --check-prefix=LINK %s
 // LINK: link.exe
 // LINK: "foo"
 // LINK: "bar"
 // LINK: "baz"
 
-// RUN: %clang_cl -m32 -arch:IA32 --target=i386-pc-win32 /Tc%s -### -fsanitize=address 2>&1 | FileCheck --check-prefix=ASAN %s
+// RUN: %clang_cl -m32 -arch:IA32 --target=i386-pc-win32 /Tc%s -fuse-ld=link -### -fsanitize=address 2>&1 | FileCheck --check-prefix=ASAN %s
 // ASAN: link.exe
 // ASAN: "-debug"
 // ASAN: "-incremental:no"
@@ -19,7 +19,7 @@
 // ASAN: "-wholearchive:{{.*}}clang_rt.asan_cxx-i386.lib"
 // ASAN: "{{.*}}cl-link{{.*}}.obj"
 
-// RUN: %clang_cl -m32 -arch:IA32 --target=i386-pc-win32 /MD /Tc%s -### -fsanitize=address 2>&1 | FileCheck --check-prefix=ASAN-MD %s
+// RUN: %clang_cl -m32 -arch:IA32 --target=i386-pc-win32 /MD /Tc%s -fuse-ld=link -### -fsanitize=address 2>&1 | FileCheck --check-prefix=ASAN-MD %s
 // ASAN-MD: link.exe
 // ASAN-MD: "-debug"
 // ASAN-MD: "-incremental:no"
@@ -29,13 +29,13 @@
 // ASAN-MD: "-wholearchive:{{.*}}clang_rt.asan_dynamic_runtime_thunk-i386.lib"
 // ASAN-MD: "{{.*}}cl-link{{.*}}.obj"
 
-// RUN: %clang_cl /LD -### /Tc%s 2>&1 | FileCheck --check-prefix=DLL %s
-// RUN: %clang_cl /LDd -### /Tc%s 2>&1 | FileCheck --check-prefix=DLL %s
+// RUN: %clang_cl /LD -fuse-ld=link -### /Tc%s 2>&1 | FileCheck --check-prefix=DLL %s
+// RUN: %clang_cl /LDd -fuse-ld=link -### /Tc%s 2>&1 | FileCheck --check-prefix=DLL %s
 // DLL: link.exe
 // "-dll"
 
-// RUN: %clang_cl -m32 -arch:IA32 --target=i386-pc-win32 /LD /Tc%s -### -fsanitize=address 2>&1 | FileCheck --check-prefix=ASAN-DLL %s
-// RUN: %clang_cl -m32 -arch:IA32 --target=i386-pc-win32 /LDd /Tc%s -### -fsanitize=address 2>&1 | FileCheck --check-prefix=ASAN-DLL %s
+// RUN: %clang_cl -m32 -arch:IA32 --target=i386-pc-win32 /LD /Tc%s -fuse-ld=link -### -fsanitize=address 2>&1 | FileCheck --check-prefix=ASAN-DLL %s
+// RUN: %clang_cl -m32 -arch:IA32 --target=i386-pc-win32 /LDd /Tc%s -fuse-ld=link -### -fsanitize=address 2>&1 | FileCheck --check-prefix=ASAN-DLL %s
 // ASAN-DLL: link.exe
 // ASAN-DLL: "-dll"
 // ASAN-DLL: "-debug"
@@ -43,13 +43,13 @@
 // ASAN-DLL: "{{.*}}clang_rt.asan_dll_thunk-i386.lib"
 // ASAN-DLL: "{{.*}}cl-link{{.*}}.obj"
 
-// RUN: %clang_cl /Zi /Tc%s -### 2>&1 | FileCheck --check-prefix=DEBUG %s
+// RUN: %clang_cl /Zi /Tc%s -fuse-ld=link -### 2>&1 | FileCheck --check-prefix=DEBUG %s
 // DEBUG: link.exe
 // DEBUG: "-debug"
 
 // PR27234
-// RUN: %clang_cl /Tc%s nonexistent.obj -### /link /libpath:somepath 2>&1 | FileCheck --check-prefix=NONEXISTENT %s
-// RUN: %clang_cl /Tc%s nonexistent.lib -### /link /libpath:somepath 2>&1 | FileCheck --check-prefix=NONEXISTENT %s
+// RUN: %clang_cl /Tc%s nonexistent.obj -fuse-ld=link -### /link /libpath:somepath 2>&1 | FileCheck --check-prefix=NONEXISTENT %s
+// RUN: %clang_cl /Tc%s nonexistent.lib -fuse-ld=link -### /link /libpath:somepath 2>&1 | FileCheck --check-prefix=NONEXISTENT %s
 // NONEXISTENT-NOT: no such file
 // NONEXISTENT: link.exe
 // NONEXISTENT: "/libpath:somepath"
diff --git a/clang/test/Driver/msvc-link.c b/clang/test/Driver/msvc-link.c
index 13dccd21bfd8..1ee17fc63c32 100644
--- a/clang/test/Driver/msvc-link.c
+++ b/clang/test/Driver/msvc-link.c
@@ -1,4 +1,4 @@
-// RUN: %clang -target i686-pc-windows-msvc -### %s 2>&1 | FileCheck --check-prefix=BASIC %s
+// RUN: %clang -target i686-pc-windows-msvc -fuse-ld=link -### %s 2>&1 | FileCheck --check-prefix=BASIC %s
 // BASIC: link.exe"
 // BASIC: "-out:a.exe"
 // BASIC: "-defaultlib:libcmt"
@@ -6,7 +6,7 @@
 // BASIC: "-nologo"
 // BASIC-NOT: "-Brepro"
 
-// RUN: %clang -target i686-pc-windows-msvc -shared -o a.dll -### %s 2>&1 | FileCheck --check-prefix=DLL %s
+// RUN: %clang -target i686-pc-windows-msvc -shared -o a.dll -fuse-ld=link -### %s 2>&1 | FileCheck --check-prefix=DLL %s
 // DLL: link.exe"
 // DLL: "-out:a.dll"
 // DLL: "-defaultlib:libcmt"
@@ -19,13 +19,13 @@
 // LIBPATH: "-libpath:/usr/lib"
 // LIBPATH: "-nologo"
 
-// RUN: %clang_cl /Brepro -### -- %s 2>&1 | FileCheck --check-prefix=REPRO %s
+// RUN: %clang_cl /Brepro -fuse-ld=link -### -- %s 2>&1 | FileCheck --check-prefix=REPRO %s
 // REPRO: link.exe"
 // REPRO: "-out:msvc-link.exe"
 // REPRO: "-nologo"
 // REPRO: "-Brepro"
 
-// RUN: %clang_cl /Brepro- -### -- %s 2>&1 | FileCheck --check-prefix=NOREPRO %s
+// RUN: %clang_cl /Brepro- -fuse-ld=link -### -- %s 2>&1 | FileCheck --check-prefix=NOREPRO %s
 // NOREPRO: link.exe"
 // NOREPRO: "-out:msvc-link.exe"
 // NOREPRO: "-nologo"
diff --git a/clang/test/OpenMP/linking.c b/clang/test/OpenMP/linking.c
index 802553c1be75..1c4439626470 100644
--- a/clang/test/OpenMP/linking.c
+++ b/clang/test/OpenMP/linking.c
@@ -81,7 +81,7 @@
 // CHECK-LD-OVERRIDE-64: "-lgomp" "-lrt"
 // CHECK-LD-OVERRIDE-64: "-lpthread" "-lc"
 //
-// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
+// RUN: %clang -no-canonical-prefixes -fuse-ld=link %s -### -o %t.o 2>&1 \
 // RUN:     -fopenmp=libomp -target x86_64-msvc-win32 -rtlib=platform \
 // RUN:   | FileCheck --check-prefix=CHECK-MSVC-LINK-64 %s
 // CHECK-MSVC-LINK-64: link.exe
@@ -95,7 +95,7 @@
 // SIMD-ONLY11-NOT: libomp
 // SIMD-ONLY11-NOT: libgomp
 //
-// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
+// RUN: %clang -no-canonical-prefixes %s -fuse-ld=link -### -o %t.o 2>&1 \
 // RUN:     -fopenmp=libiomp5 -target x86_64-msvc-win32 -rtlib=platform \
 // RUN:   | FileCheck --check-prefix=CHECK-MSVC-ILINK-64 %s
 

From f4c01f33f450f654a63363b4eb84bf744c24959c Mon Sep 17 00:00:00 2001
From: Alexandre Ganea <alexandre.ganea@ubisoft.com>
Date: Wed, 24 Mar 2021 12:28:00 -0400
Subject: [PATCH 192/318] [Support] Fix 'keeping' temporary files on Windows 7

As reported here: https://bugs.llvm.org/show_bug.cgi?id=48378#c0
and here: https://github.com/rust-lang/rust/issues/81051
since 79657e2339b58bc01fe1b85a448bb073d57d90bb, some programs such as llvm-ar
don't work properly on Windows 7.

The issue is shown in the snippet by Oleksandr Prodan:
https://pastebin.com/v51m3uBU

In essence, once the 'DeleteFile' flag has been set on FILE_DISPOSITION_INFO,
the file path can't be queried anymore with GetFinalPathNameByHandleW. This
however works on Windows 10, GetFinalPathNameByHandleW would return sucessfully.

To workaround the issue, we simply reset the 'DeleteFile' flag before even
checking if we're dealing with a network file.

Tested with `llvm-ar r empty.a a.obj` ran on a network mount. At the moment, we
cannot specifically add a test coverage for this, since it requres mounting a
network drive.

(cherry picked from commit 64ab2b6825c5aeae6e4afa7ef0829b89a6828102)
---
 llvm/lib/Support/Windows/Path.inc | 24 +++++++++++++++++++-----
 1 file changed, 19 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Support/Windows/Path.inc b/llvm/lib/Support/Windows/Path.inc
index dc9bcf868381..adcbd1b5f8f3 100644
--- a/llvm/lib/Support/Windows/Path.inc
+++ b/llvm/lib/Support/Windows/Path.inc
@@ -402,8 +402,22 @@ std::error_code is_local(int FD, bool &Result) {
 }
 
 static std::error_code setDeleteDisposition(HANDLE Handle, bool Delete) {
-  // First, check if the file is on a network (non-local) drive. If so, don't
-  // set DeleteFile to true, since it prevents opening the file for writes.
+  // Clear the FILE_DISPOSITION_INFO flag first, before checking if it's a
+  // network file. On Windows 7 the function realPathFromHandle() below fails
+  // if the FILE_DISPOSITION_INFO flag was already set to 'DeleteFile = true' by
+  // a prior call.
+  FILE_DISPOSITION_INFO Disposition;
+  Disposition.DeleteFile = false;
+  if (!SetFileInformationByHandle(Handle, FileDispositionInfo, &Disposition,
+                                  sizeof(Disposition)))
+    return mapWindowsError(::GetLastError());
+  if (!Delete)
+    return std::error_code();
+
+  // Check if the file is on a network (non-local) drive. If so, don't
+  // continue when DeleteFile is true, since it prevents opening the file for
+  // writes. Note -- this will leak temporary files on disk, but only when the
+  // target file is on a network drive.
   SmallVector<wchar_t, 128> FinalPath;
   if (std::error_code EC = realPathFromHandle(Handle, FinalPath))
     return EC;
@@ -415,9 +429,9 @@ static std::error_code setDeleteDisposition(HANDLE Handle, bool Delete) {
   if (!IsLocal)
     return std::error_code();
 
-  // The file is on a local drive, set the DeleteFile to true.
-  FILE_DISPOSITION_INFO Disposition;
-  Disposition.DeleteFile = Delete;
+  // The file is on a local drive, we can safely set FILE_DISPOSITION_INFO's
+  // flag.
+  Disposition.DeleteFile = true;
   if (!SetFileInformationByHandle(Handle, FileDispositionInfo, &Disposition,
                                   sizeof(Disposition)))
     return mapWindowsError(::GetLastError());

From e94372d1b395a6461e7d973917b3a3c29563a5e6 Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Fri, 12 Mar 2021 07:56:54 -0500
Subject: [PATCH 193/318] [SimplifyCFG] avoid sinking insts within an
 infinite-loop

The test is reduced from a C source example in:
https://llvm.org/PR49541

It's possible that the test could be reduced further or
the predicate generalized further, but it seems to require
a few ingredients (including the "late" SimplifyCFG options
on the RUN line) to fall into the infinite-loop trap.

(cherry picked from commit bd197ed0a57a82187ed3c6265ca811d412acfaef)
---
 llvm/lib/Transforms/Utils/SimplifyCFG.cpp     | 19 ++++---
 .../Transforms/SimplifyCFG/sink-inf-loop.ll   | 49 +++++++++++++++++++
 2 files changed, 61 insertions(+), 7 deletions(-)
 create mode 100644 llvm/test/Transforms/SimplifyCFG/sink-inf-loop.ll

diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index 7cfe17618cde..de9560df9785 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -1628,6 +1628,11 @@ static bool canSinkInstructions(
         I->getType()->isTokenTy())
       return false;
 
+    // Do not try to sink an instruction in an infinite loop - it can cause
+    // this algorithm to infinite loop.
+    if (I->getParent()->getSingleSuccessor() == I->getParent())
+      return false;
+
     // Conservatively return false if I is an inline-asm instruction. Sinking
     // and merging inline-asm instructions can potentially create arguments
     // that cannot satisfy the inline-asm constraints.
@@ -1714,13 +1719,13 @@ static bool canSinkInstructions(
   return true;
 }
 
-// Assuming canSinkLastInstruction(Blocks) has returned true, sink the last
+// Assuming canSinkInstructions(Blocks) has returned true, sink the last
 // instruction of every block in Blocks to their common successor, commoning
 // into one instruction.
 static bool sinkLastInstruction(ArrayRef<BasicBlock*> Blocks) {
   auto *BBEnd = Blocks[0]->getTerminator()->getSuccessor(0);
 
-  // canSinkLastInstruction returning true guarantees that every block has at
+  // canSinkInstructions returning true guarantees that every block has at
   // least one non-terminator instruction.
   SmallVector<Instruction*,4> Insts;
   for (auto *BB : Blocks) {
@@ -1733,9 +1738,9 @@ static bool sinkLastInstruction(ArrayRef<BasicBlock*> Blocks) {
   }
 
   // The only checking we need to do now is that all users of all instructions
-  // are the same PHI node. canSinkLastInstruction should have checked this but
-  // it is slightly over-aggressive - it gets confused by commutative instructions
-  // so double-check it here.
+  // are the same PHI node. canSinkInstructions should have checked this but
+  // it is slightly over-aggressive - it gets confused by commutative
+  // instructions so double-check it here.
   Instruction *I0 = Insts.front();
   if (!I0->user_empty()) {
     auto *PNUse = dyn_cast<PHINode>(*I0->user_begin());
@@ -1746,11 +1751,11 @@ static bool sinkLastInstruction(ArrayRef<BasicBlock*> Blocks) {
       return false;
   }
 
-  // We don't need to do any more checking here; canSinkLastInstruction should
+  // We don't need to do any more checking here; canSinkInstructions should
   // have done it all for us.
   SmallVector<Value*, 4> NewOperands;
   for (unsigned O = 0, E = I0->getNumOperands(); O != E; ++O) {
-    // This check is different to that in canSinkLastInstruction. There, we
+    // This check is different to that in canSinkInstructions. There, we
     // cared about the global view once simplifycfg (and instcombine) have
     // completed - it takes into account PHIs that become trivially
     // simplifiable.  However here we need a more local view; if an operand
diff --git a/llvm/test/Transforms/SimplifyCFG/sink-inf-loop.ll b/llvm/test/Transforms/SimplifyCFG/sink-inf-loop.ll
new file mode 100644
index 000000000000..37399367efce
--- /dev/null
+++ b/llvm/test/Transforms/SimplifyCFG/sink-inf-loop.ll
@@ -0,0 +1,49 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt %s -simplifycfg -simplifycfg-require-and-preserve-domtree=1 -keep-loops=false -sink-common-insts=true -S | FileCheck %s
+
+; This would infinite-loop because we allowed code sinking to examine an infinite-loop block (%j).
+
+define void @PR49541(i32* %t1, i32 %a, i1 %bool) {
+; CHECK-LABEL: @PR49541(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[I:%.*]]
+; CHECK:       j:
+; CHECK-NEXT:    [[T3:%.*]] = phi i32 [ [[B:%.*]], [[J:%.*]] ], [ [[A:%.*]], [[COND_TRUE:%.*]] ], [ [[A]], [[COND_FALSE:%.*]] ]
+; CHECK-NEXT:    [[T2:%.*]] = phi i32 [ [[T2]], [[J]] ], [ [[PRE2:%.*]], [[COND_TRUE]] ], [ 0, [[COND_FALSE]] ]
+; CHECK-NEXT:    [[B]] = load i32, i32* [[T1:%.*]], align 4
+; CHECK-NEXT:    br label [[J]]
+; CHECK:       i:
+; CHECK-NEXT:    [[G_1:%.*]] = phi i16 [ undef, [[ENTRY:%.*]] ], [ [[G_1]], [[COND_FALSE]] ]
+; CHECK-NEXT:    br i1 [[BOOL:%.*]], label [[COND_FALSE]], label [[COND_TRUE]]
+; CHECK:       cond.true:
+; CHECK-NEXT:    [[TOBOOL9_NOT:%.*]] = icmp eq i16 [[G_1]], 0
+; CHECK-NEXT:    [[PRE2]] = load i32, i32* [[T1]], align 4
+; CHECK-NEXT:    br label [[J]]
+; CHECK:       cond.false:
+; CHECK-NEXT:    [[T5:%.*]] = load i32, i32* [[T1]], align 4
+; CHECK-NEXT:    [[B2:%.*]] = icmp eq i32 [[T5]], 0
+; CHECK-NEXT:    br i1 [[B2]], label [[J]], label [[I]]
+;
+entry:
+  br label %i
+
+j:
+  %t3 = phi i32 [ %b, %j ], [ %a, %cond.true ], [ %a, %cond.false ]
+  %t2 = phi i32 [ %t2, %j ], [ %pre2, %cond.true ], [ 0, %cond.false ]
+  %b = load i32, i32* %t1, align 4
+  br label %j
+
+i:
+  %g.1 = phi i16 [ undef, %entry ], [ %g.1, %cond.false ]
+  br i1 %bool, label %cond.false, label %cond.true
+
+cond.true:
+  %tobool9.not = icmp eq i16 %g.1, 0
+  %pre2 = load i32, i32* %t1, align 4
+  br label %j
+
+cond.false:
+  %t5 = load i32, i32* %t1, align 4
+  %b2 = icmp eq i32 %t5, 0
+  br i1 %b2, label %j, label %i
+}

From f43958b7c497c526b238607624ee0069888f4c98 Mon Sep 17 00:00:00 2001
From: Shilei Tian <tianshilei1992@gmail.com>
Date: Thu, 18 Mar 2021 18:25:21 -0400
Subject: [PATCH 194/318] [OpenMP] Fixed a crash in hidden helper thread

It is reported that after enabling hidden helper thread, the program
can hit the assertion `new_gtid < __kmp_threads_capacity` sometimes. The root
cause is explained as follows. Let's say the default `__kmp_threads_capacity` is
`N`. If hidden helper thread is enabled, `__kmp_threads_capacity` will be offset
to `N+8` by default. If the number of threads we need exceeds `N+8`, e.g. via
`num_threads` clause, we need to expand `__kmp_threads`. In
`__kmp_expand_threads`, the expansion starts from `__kmp_threads_capacity`, and
repeatedly doubling it until the new capacity meets the requirement. Let's
assume the new requirement is `Y`.  If `Y` happens to meet the constraint
`(N+8)*2^X=Y` where `X` is the number of iterations, the new capacity is not
enough because we have 8 slots for hidden helper threads.

Here is an example.
```
#include <vector>

int main(int argc, char *argv[]) {
  constexpr const size_t N = 1344;
  std::vector<int> data(N);

#pragma omp parallel for
  for (unsigned i = 0; i < N; ++i) {
    data[i] = i;
  }

#pragma omp parallel for num_threads(N)
  for (unsigned i = 0; i < N; ++i) {
    data[i] += i;
  }

  return 0;
}
```
My CPU is 20C40T, then `__kmp_threads_capacity` is 160. After offset,
`__kmp_threads_capacity` becomes 168. `1344 = (160+8)*2^3`, then the assertions
hit.

Reviewed By: protze.joachim

Differential Revision: https://reviews.llvm.org/D98838

(cherry picked from commit 2df65f87c1ea81008768e14522e5d9277234ba70)
---
 openmp/runtime/src/kmp_runtime.cpp            | 15 ++++++-
 openmp/runtime/src/kmp_settings.cpp           |  7 +--
 .../capacity_mix_threads.cpp                  | 45 +++++++++++++++++++
 .../hidden_helper_task/capacity_nthreads.cpp  | 31 +++++++++++++
 4 files changed, 94 insertions(+), 4 deletions(-)
 create mode 100644 openmp/runtime/test/tasking/hidden_helper_task/capacity_mix_threads.cpp
 create mode 100644 openmp/runtime/test/tasking/hidden_helper_task/capacity_nthreads.cpp

diff --git a/openmp/runtime/src/kmp_runtime.cpp b/openmp/runtime/src/kmp_runtime.cpp
index a6e32bd008e1..b981f8740dbe 100644
--- a/openmp/runtime/src/kmp_runtime.cpp
+++ b/openmp/runtime/src/kmp_runtime.cpp
@@ -920,6 +920,12 @@ static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
   if (TCR_PTR(__kmp_threads[0]) == NULL) {
     --capacity;
   }
+  // If it is not for initializing the hidden helper team, we need to take
+  // __kmp_hidden_helper_threads_num out of the capacity because it is included
+  // in __kmp_threads_capacity.
+  if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
+    capacity -= __kmp_hidden_helper_threads_num;
+  }
   if (__kmp_nth + new_nthreads -
           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
       capacity) {
@@ -3632,6 +3638,13 @@ int __kmp_register_root(int initial_thread) {
     --capacity;
   }
 
+  // If it is not for initializing the hidden helper team, we need to take
+  // __kmp_hidden_helper_threads_num out of the capacity because it is included
+  // in __kmp_threads_capacity.
+  if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
+    capacity -= __kmp_hidden_helper_threads_num;
+  }
+
   /* see if there are too many threads */
   if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) {
     if (__kmp_tp_cached) {
@@ -3664,7 +3677,7 @@ int __kmp_register_root(int initial_thread) {
     /* find an available thread slot */
     // Don't reassign the zero slot since we need that to only be used by
     // initial thread. Slots for hidden helper threads should also be skipped.
-    if (initial_thread && __kmp_threads[0] == NULL) {
+    if (initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
       gtid = 0;
     } else {
       for (gtid = __kmp_hidden_helper_threads_num + 1;
diff --git a/openmp/runtime/src/kmp_settings.cpp b/openmp/runtime/src/kmp_settings.cpp
index b477edbbfb42..50f6a05faaf5 100644
--- a/openmp/runtime/src/kmp_settings.cpp
+++ b/openmp/runtime/src/kmp_settings.cpp
@@ -504,9 +504,10 @@ int __kmp_initial_threads_capacity(int req_nproc) {
     nth = (4 * __kmp_xproc);
 
   // If hidden helper task is enabled, we initialize the thread capacity with
-  // extra
-  // __kmp_hidden_helper_threads_num.
-  nth += __kmp_hidden_helper_threads_num;
+  // extra __kmp_hidden_helper_threads_num.
+  if (__kmp_enable_hidden_helper) {
+    nth += __kmp_hidden_helper_threads_num;
+  }
 
   if (nth > __kmp_max_nth)
     nth = __kmp_max_nth;
diff --git a/openmp/runtime/test/tasking/hidden_helper_task/capacity_mix_threads.cpp b/openmp/runtime/test/tasking/hidden_helper_task/capacity_mix_threads.cpp
new file mode 100644
index 000000000000..776aee9d8e2c
--- /dev/null
+++ b/openmp/runtime/test/tasking/hidden_helper_task/capacity_mix_threads.cpp
@@ -0,0 +1,45 @@
+// RUN: %libomp-cxx-compile-and-run
+
+#include <omp.h>
+
+#include <algorithm>
+#include <cassert>
+#include <chrono>
+#include <thread>
+#include <vector>
+
+void dummy_root() {
+  // omp_get_max_threads() will do middle initialization
+  int nthreads = omp_get_max_threads();
+  std::this_thread::sleep_for(std::chrono::milliseconds(1000));
+}
+
+int main(int argc, char *argv[]) {
+  const int N = std::min(std::max(std::max(32, 4 * omp_get_max_threads()),
+                                  4 * omp_get_num_procs()),
+                         std::numeric_limits<int>::max());
+
+  std::vector<int> data(N);
+
+  // Create a new thread to initialize the OpenMP RTL. The new thread will not
+  // be taken as the "initial thread".
+  std::thread root(dummy_root);
+
+#pragma omp parallel for num_threads(N)
+  for (unsigned i = 0; i < N; ++i) {
+    data[i] = i;
+  }
+
+#pragma omp parallel for num_threads(N + 1)
+  for (unsigned i = 0; i < N; ++i) {
+    data[i] += i;
+  }
+
+  for (unsigned i = 0; i < N; ++i) {
+    assert(data[i] == 2 * i);
+  }
+
+  root.join();
+
+  return 0;
+}
diff --git a/openmp/runtime/test/tasking/hidden_helper_task/capacity_nthreads.cpp b/openmp/runtime/test/tasking/hidden_helper_task/capacity_nthreads.cpp
new file mode 100644
index 000000000000..a9d394f729e9
--- /dev/null
+++ b/openmp/runtime/test/tasking/hidden_helper_task/capacity_nthreads.cpp
@@ -0,0 +1,31 @@
+// RUN: %libomp-cxx-compile-and-run
+
+#include <omp.h>
+
+#include <algorithm>
+#include <cassert>
+#include <vector>
+
+int main(int argc, char *argv[]) {
+  const int N = std::min(std::max(std::max(32, 4 * omp_get_max_threads()),
+                                  4 * omp_get_num_procs()),
+                         std::numeric_limits<int>::max());
+
+  std::vector<int> data(N);
+
+#pragma omp parallel for num_threads(N)
+  for (unsigned i = 0; i < N; ++i) {
+    data[i] = i;
+  }
+
+#pragma omp parallel for num_threads(N + 1)
+  for (unsigned i = 0; i < N; ++i) {
+    data[i] += i;
+  }
+
+  for (unsigned i = 0; i < N; ++i) {
+    assert(data[i] == 2 * i);
+  }
+
+  return 0;
+}

From 8ca56905dd9bdade269b5bc91528495884b62bf5 Mon Sep 17 00:00:00 2001
From: Nikita Popov <nikita.ppv@gmail.com>
Date: Wed, 10 Mar 2021 14:37:09 +0100
Subject: [PATCH 195/318] [PowerPC] Fix infinite loop in peephole CR
 optimization (PR49509)

If we encounter a degenerate select node where both operands are
the same, then we can continue negating the condition while swapping
operands, resulting in an infinite loop. Avoid this by bailing out
if both operands are the same.

Fixes https://bugs.llvm.org/show_bug.cgi?id=49509.

Differential Revision: https://reviews.llvm.org/D98340

(cherry picked from commit 2489cbaa8057c736475fd88990f4f6dbf022873d)
---
 llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp |  6 ++
 llvm/test/CodeGen/PowerPC/pr49509.ll        | 81 +++++++++++++++++++++
 2 files changed, 87 insertions(+)
 create mode 100644 llvm/test/CodeGen/PowerPC/pr49509.ll

diff --git a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
index 693b0adaede4..2604218da160 100644
--- a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
@@ -5896,7 +5896,13 @@ bool PPCDAGToDAGISel::AllUsersSelectZero(SDNode *N) {
         User->getMachineOpcode() != PPC::SELECT_I8)
       return false;
 
+    SDNode *Op1 = User->getOperand(1).getNode();
     SDNode *Op2 = User->getOperand(2).getNode();
+    // If we have a degenerate select with two equal operands, swapping will
+    // not do anything, and we may run into an infinite loop.
+    if (Op1 == Op2)
+      return false;
+
     if (!Op2->isMachineOpcode())
       return false;
 
diff --git a/llvm/test/CodeGen/PowerPC/pr49509.ll b/llvm/test/CodeGen/PowerPC/pr49509.ll
new file mode 100644
index 000000000000..f13733c18047
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/pr49509.ll
@@ -0,0 +1,81 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=powerpc-unknown-linux-gnu < %s | FileCheck %s
+
+target datalayout = "E-m:e-p:32:32-i64:64-n32"
+
+define void @test() {
+; CHECK-LABEL: test:
+; CHECK:       # %bb.0: # %bb
+; CHECK-NEXT:    bc 12, 20, .LBB0_2
+; CHECK-NEXT:  # %bb.1: # %bb2
+; CHECK-NEXT:    li 3, 0
+; CHECK-NEXT:    stw 3, 0(3)
+; CHECK-NEXT:    lis 3, 256
+; CHECK-NEXT:    stw 3, 0(3)
+; CHECK-NEXT:    blr
+; CHECK-NEXT:  .LBB0_2: # %bb1
+; CHECK-NEXT:    bclr 4, 20, 0
+; CHECK-NEXT:  # %bb.3: # %bb66
+; CHECK-NEXT:    lwz 4, 12(0)
+; CHECK-NEXT:    lwz 5, 8(0)
+; CHECK-NEXT:    lwz 6, 0(0)
+; CHECK-NEXT:    lwz 7, 4(0)
+; CHECK-NEXT:    lbz 3, 0(3)
+; CHECK-NEXT:    and 5, 5, 6
+; CHECK-NEXT:    and 4, 4, 7
+; CHECK-NEXT:    and 4, 4, 5
+; CHECK-NEXT:    cmpwi 3, 0
+; CHECK-NEXT:    lis 3, 256
+; CHECK-NEXT:    lis 7, 512
+; CHECK-NEXT:    bc 12, 2, .LBB0_4
+; CHECK-NEXT:    b .LBB0_5
+; CHECK-NEXT:  .LBB0_4: # %bb66
+; CHECK-NEXT:    li 3, 0
+; CHECK-NEXT:  .LBB0_5: # %bb66
+; CHECK-NEXT:    cmpwi 1, 4, -1
+; CHECK-NEXT:    cmpwi 5, 4, -1
+; CHECK-NEXT:    li 6, 0
+; CHECK-NEXT:    bc 12, 6, .LBB0_6
+; CHECK-NEXT:    b .LBB0_7
+; CHECK-NEXT:  .LBB0_6: # %bb66
+; CHECK-NEXT:    addi 3, 7, 0
+; CHECK-NEXT:  .LBB0_7: # %bb66
+; CHECK-NEXT:    cror 20, 22, 2
+; CHECK-NEXT:    stw 3, 0(3)
+; CHECK-NEXT:    bc 12, 20, .LBB0_9
+; CHECK-NEXT:  # %bb.8: # %bb66
+; CHECK-NEXT:    ori 3, 6, 0
+; CHECK-NEXT:    b .LBB0_10
+; CHECK-NEXT:  .LBB0_9: # %bb66
+; CHECK-NEXT:    li 3, 0
+; CHECK-NEXT:  .LBB0_10: # %bb66
+; CHECK-NEXT:    stw 3, 0(3)
+; CHECK-NEXT:    blr
+bb:
+  br i1 undef, label %bb2, label %bb1
+
+bb2:                                              ; preds = %bb
+  %i = select i1 undef, i64 0, i64 72057594037927936
+  store i64 %i, i64* undef, align 8
+  ret void
+
+bb1:                                              ; preds = %bb
+  %i50 = load i8, i8* undef, align 8
+  %i52 = load i128, i128* null, align 8
+  %i62 = icmp eq i8 %i50, 0
+  br i1 undef, label %bb66, label %bb64
+
+bb64:                                             ; preds = %bb63
+  ret void
+
+bb66:                                             ; preds = %bb63
+  %i67 = lshr i128 -1, 0
+  %i68 = xor i128 %i52, -1
+  %i69 = add i128 0, %i68
+  %i70 = and i128 %i67, %i69
+  %i71 = icmp eq i128 %i70, 0
+  %i74 = select i1 %i62, i64 0, i64 72057594037927936
+  %i75 = select i1 %i71, i64 144115188075855872, i64 %i74
+  store i64 %i75, i64* undef, align 8
+  ret void
+}

From e89cdf8937bb6017cc99b05823428dd2fd673368 Mon Sep 17 00:00:00 2001
From: Joseph Huber <jhuber6@vols.utk.edu>
Date: Wed, 10 Mar 2021 13:25:33 -0500
Subject: [PATCH 196/318] [OpenMP] Restore backwards compatibility for
 libomptarget

Summary:
The changes introduced in D87946 changed the API for libomptarget
functions. `__kmpc_push_target_tripcount` was a function in Clang 11.x
but was not given a backward-compatible interface. This change will
require people using Clang 13.x or 12.x to recompile their offloading
programs.

Reviewed By: jdoerfert cchen

Differential Revision: https://reviews.llvm.org/D98358

(cherry picked from commit 807466ef28125cf7268c860b09d5563c9c93602a)
---
 clang/lib/CodeGen/CGOpenMPRuntime.cpp                  |  2 +-
 .../target_teams_distribute_parallel_for_codegen.cpp   |  4 ++--
 ...target_teams_distribute_parallel_for_if_codegen.cpp | 10 +++++-----
 ...get_teams_distribute_parallel_for_order_codegen.cpp |  2 +-
 ...rget_teams_distribute_parallel_for_simd_codegen.cpp |  2 +-
 ...t_teams_distribute_parallel_for_simd_if_codegen.cpp | 10 +++++-----
 clang/test/OpenMP/teams_distribute_codegen.cpp         |  2 +-
 .../OpenMP/teams_distribute_parallel_for_codegen.cpp   |  2 +-
 .../teams_distribute_parallel_for_simd_codegen.cpp     |  2 +-
 clang/test/OpenMP/teams_distribute_simd_codegen.cpp    |  2 +-
 llvm/include/llvm/Frontend/OpenMP/OMPKinds.def         |  4 ++--
 llvm/test/Transforms/OpenMP/add_attributes.ll          |  6 +++---
 openmp/libomptarget/include/omptarget.h                |  6 ++++--
 openmp/libomptarget/src/exports                        |  3 ++-
 openmp/libomptarget/src/interface.cpp                  |  7 ++++++-
 openmp/libomptarget/src/omptarget.cpp                  |  4 ++--
 16 files changed, 38 insertions(+), 30 deletions(-)

diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
index 57cc2d60e2af..83dfa0780547 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -9892,7 +9892,7 @@ void CGOpenMPRuntime::emitTargetNumIterationsCall(
       llvm::Value *Args[] = {RTLoc, DeviceID, NumIterations};
       CGF.EmitRuntimeCall(
           OMPBuilder.getOrCreateRuntimeFunction(
-              CGM.getModule(), OMPRTL___kmpc_push_target_tripcount),
+              CGM.getModule(), OMPRTL___kmpc_push_target_tripcount_mapper),
           Args);
     }
   };
diff --git a/clang/test/OpenMP/target_teams_distribute_parallel_for_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_parallel_for_codegen.cpp
index 0229ace911f8..c0f53239aa13 100644
--- a/clang/test/OpenMP/target_teams_distribute_parallel_for_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_distribute_parallel_for_codegen.cpp
@@ -39,7 +39,7 @@
 
 #ifdef CK1
 
-// HCK_NO_TGT-NOT: @__kmpc_push_target_tripcount
+// HCK_NO_TGT-NOT: @__kmpc_push_target_tripcount_mapper
 
 // HCK1: define{{.*}} i32 @{{.+}}target_teams_fun{{.*}}(
 int target_teams_fun(int *g){
@@ -60,7 +60,7 @@ int target_teams_fun(int *g){
   // HCK1: [[N_PAR:%.+]] = load{{.+}}, {{.+}} [[N_CAST]],
   // HCK1: [[TE_PAR:%.+]] = load{{.+}}, {{.+}} [[TE_CAST]],
   // HCK1: [[TH_PAR:%.+]] = load{{.+}}, {{.+}} [[TH_CAST]],
-  // HCK1: call void @__kmpc_push_target_tripcount(%struct.ident_t* @{{.+}}, i64 -1, i64 %{{.+}})
+  // HCK1: call void @__kmpc_push_target_tripcount_mapper(%struct.ident_t* @{{.+}}, i64 -1, i64 %{{.+}})
   // HCK1: call i32 @__tgt_target_teams_mapper(%struct.ident_t* @{{.+}}, i64 -1, i8* @{{[^,]+}}, i32 4, i8** %{{[^,]+}}, i8** %{{[^,]+}},
 
   // HCK1: call void @[[OFFL1:.+]](i{{32|64}} [[N_PAR]], {{.+}}, i{{32|64}} [[TE_PAR]], i{{32|64}} [[TH_PAR]])
diff --git a/clang/test/OpenMP/target_teams_distribute_parallel_for_if_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_parallel_for_if_codegen.cpp
index 6650e0557511..efe7df819fb6 100644
--- a/clang/test/OpenMP/target_teams_distribute_parallel_for_if_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_distribute_parallel_for_if_codegen.cpp
@@ -49,10 +49,10 @@ int Arg;
 
 // CHECK-LABEL: define {{.*}}void @{{.+}}gtid_test
 void gtid_test() {
-// CHECK: call void @__kmpc_push_target_tripcount(%struct.ident_t* @{{.+}}, i64 -1, i64 100)
+// CHECK: call void @__kmpc_push_target_tripcount_mapper(%struct.ident_t* @{{.+}}, i64 -1, i64 100)
 // CHECK: call i{{[0-9]+}} @__tgt_target_teams_mapper(%struct.ident_t* @{{.+}},
 // CHECK: call void [[OFFLOADING_FUN_0:@.+]](
-// CHECK: call void @__kmpc_push_target_tripcount(%struct.ident_t* @{{.+}}, i64 -1, i64 100)
+// CHECK: call void @__kmpc_push_target_tripcount_mapper(%struct.ident_t* @{{.+}}, i64 -1, i64 100)
 // CHECK: call i{{[0-9]+}} @__tgt_target_teams_mapper(%struct.ident_t* @{{.+}},
 // CHECK: call void [[OFFLOADING_FUN_1:@.+]](
 #pragma omp target teams distribute parallel for
@@ -107,12 +107,12 @@ int tmain(T Arg) {
 
 // CHECK-LABEL: define {{.*}}i{{[0-9]+}} @main()
 int main() {
-// CHECK: call void @__kmpc_push_target_tripcount(%struct.ident_t* @{{.+}}, i64 -1, i64 100)
+// CHECK: call void @__kmpc_push_target_tripcount_mapper(%struct.ident_t* @{{.+}}, i64 -1, i64 100)
 // CHECK: call i{{[0-9]+}} @__tgt_target_teams_mapper(%struct.ident_t* @{{.+}},
 // CHECK: call void [[OFFLOADING_FUN_0:@.+]](
-// CHECK-NOT: call void @__kmpc_push_target_tripcount(%struct.ident_t* @{{.+}}, i64 -1, i64 100)
+// CHECK-NOT: call void @__kmpc_push_target_tripcount_mapper(%struct.ident_t* @{{.+}}, i64 -1, i64 100)
 // CHECK: call void [[OFFLOADING_FUN_1:@.+]](
-// CHECK: call void @__kmpc_push_target_tripcount(%struct.ident_t* @{{.+}}, i64 -1, i64 100)
+// CHECK: call void @__kmpc_push_target_tripcount_mapper(%struct.ident_t* @{{.+}}, i64 -1, i64 100)
 // CHECK: call i{{[0-9]+}} @__tgt_target_teams_mapper(%struct.ident_t* @{{.+}},
 // CHECK: call void [[OFFLOADING_FUN_2:@.+]](
 // CHECK: = call {{.*}}i{{.+}} @{{.+}}tmain
diff --git a/clang/test/OpenMP/target_teams_distribute_parallel_for_order_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_parallel_for_order_codegen.cpp
index b2ab37f22ec3..b99ba9d38a43 100644
--- a/clang/test/OpenMP/target_teams_distribute_parallel_for_order_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_distribute_parallel_for_order_codegen.cpp
@@ -14,7 +14,7 @@
 
 // CHECK-LABEL: define {{.*}}void @{{.+}}gtid_test
 void gtid_test() {
-// CHECK: call void @__kmpc_push_target_tripcount(%struct.ident_t* @{{.+}}, i64 -1, i64 100)
+// CHECK: call void @__kmpc_push_target_tripcount_mapper(%struct.ident_t* @{{.+}}, i64 -1, i64 100)
 // CHECK: %0 = call i32 @__tgt_target_teams_mapper(%struct.ident_t* @{{.+}}, i64 -1, i8* @{{.+}}, i32 0, i8** null, i8** null, i64* null, i64* null, i8** null, i8** null, i32 0, i32 0)
 // CHECK: call void [[TARGET_OUTLINE:@.+]]()
 // CHECK: ret void
diff --git a/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_codegen.cpp
index e6049145702b..39ccb87462c0 100644
--- a/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_codegen.cpp
@@ -60,7 +60,7 @@ int target_teams_fun(int *g){
 // HCK1: [[N_PAR:%.+]] = load{{.+}}, {{.+}} [[N_CAST]],
 // HCK1: [[TE_PAR:%.+]] = load{{.+}}, {{.+}} [[TE_CAST]],
 // HCK1: [[TH_PAR:%.+]] = load{{.+}}, {{.+}} [[TH_CAST]],
-// HCK1: call void @__kmpc_push_target_tripcount(%struct.ident_t* @{{.+}}, i64 -1, i64 %{{.+}})
+// HCK1: call void @__kmpc_push_target_tripcount_mapper(%struct.ident_t* @{{.+}}, i64 -1, i64 %{{.+}})
 // HCK1: call i32 @__tgt_target_teams_mapper(%struct.ident_t* @{{.+}}, i64 -1, i8* @{{[^,]+}}, i32 5, i8** %{{[^,]+}}, i8** %{{[^,]+}},
 
 // HCK1: call void @[[OFFL1:.+]](i{{32|64}} [[I_PAR]], i{{32|64}} [[N_PAR]], {{.+}}, i{{32|64}} [[TE_PAR]], i{{32|64}} [[TH_PAR]])
diff --git a/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_if_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_if_codegen.cpp
index 8b0eaba07f1c..19dc15b94f64 100644
--- a/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_if_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_if_codegen.cpp
@@ -43,10 +43,10 @@ int Arg;
 
 // CHECK-LABEL: define {{.*}}void @{{.+}}gtid_test
 void gtid_test() {
-// CHECK: call void @__kmpc_push_target_tripcount(%struct.ident_t* @{{.+}}, i64 -1, i64 100)
+// CHECK: call void @__kmpc_push_target_tripcount_mapper(%struct.ident_t* @{{.+}}, i64 -1, i64 100)
 // CHECK: call i{{[0-9]+}} @__tgt_target_teams_mapper(%struct.ident_t* @{{.+}},
 // CHECK: call void [[OFFLOADING_FUN_0:@.+]](
-// CHECK: call void @__kmpc_push_target_tripcount(%struct.ident_t* @{{.+}}, i64 -1, i64 100)
+// CHECK: call void @__kmpc_push_target_tripcount_mapper(%struct.ident_t* @{{.+}}, i64 -1, i64 100)
 // CHECK: call i{{[0-9]+}} @__tgt_target_teams_mapper(%struct.ident_t* @{{.+}},
 // CHECK: call void [[OFFLOADING_FUN_1:@.+]](
 #ifdef OMP5
@@ -110,12 +110,12 @@ int tmain(T Arg) {
 
 // CHECK-LABEL: define {{.*}}i{{[0-9]+}} @main()
 int main() {
-// CHECK: call void @__kmpc_push_target_tripcount(%struct.ident_t* @{{.+}}, i64 -1, i64 100)
+// CHECK: call void @__kmpc_push_target_tripcount_mapper(%struct.ident_t* @{{.+}}, i64 -1, i64 100)
 // CHECK: call i{{[0-9]+}} @__tgt_target_teams_mapper(%struct.ident_t* @{{.+}},
 // CHECK: call void [[OFFLOADING_FUN_0:@.+]](
-// CHECK-NOT: call void @__kmpc_push_target_tripcount(%struct.ident_t* @{{.+}}, i64 -1, i64 100)
+// CHECK-NOT: call void @__kmpc_push_target_tripcount_mapper(%struct.ident_t* @{{.+}}, i64 -1, i64 100)
 // CHECK: call void [[OFFLOADING_FUN_1:@.+]](
-// CHECK: call void @__kmpc_push_target_tripcount(%struct.ident_t* @{{.+}}, i64 -1, i64 100)
+// CHECK: call void @__kmpc_push_target_tripcount_mapper(%struct.ident_t* @{{.+}}, i64 -1, i64 100)
 // CHECK: call i{{[0-9]+}} @__tgt_target_teams_mapper(%struct.ident_t* @{{.+}},
 // CHECK: call void [[OFFLOADING_FUN_2:@.+]](
 // CHECK: = call {{.*}}i{{.+}} @{{.+}}tmain
diff --git a/clang/test/OpenMP/teams_distribute_codegen.cpp b/clang/test/OpenMP/teams_distribute_codegen.cpp
index 5bbb100e669e..aab5cced4c70 100644
--- a/clang/test/OpenMP/teams_distribute_codegen.cpp
+++ b/clang/test/OpenMP/teams_distribute_codegen.cpp
@@ -33,7 +33,7 @@ int teams_argument_global(int n) {
   // CK1: [[TE_PAR:%.+]] = load{{.+}}, {{.+}} [[TE_CAST]],
   // CK1: [[TH_PAR:%.+]] = load{{.+}}, {{.+}} [[TH_CAST]],
 
-  // CK1: call void @__kmpc_push_target_tripcount(%struct.ident_t* @{{.+}}, i64 -1, i64 %{{.+}})
+  // CK1: call void @__kmpc_push_target_tripcount_mapper(%struct.ident_t* @{{.+}}, i64 -1, i64 %{{.+}})
   // CK1: call i32 @__tgt_target_teams_mapper(%struct.ident_t* @{{.+}}, i64 -1, i8* @{{[^,]+}}, i32 4, i8** %{{[^,]+}}, i8** %{{[^,]+}}, i{{64|32}}* {{.+}}@{{[^,]+}}, i32 0, i32 0), i64* {{.+}}@{{[^,]+}}, i32 0, i32 0), i8** null, i8** null, i32 {{.+}}, i32 {{.+}})
 
   // CK1: call void @[[OFFL1:.+]](i{{32|64}} [[TE_PAR]], i{{32|64}} [[TH_PAR]],
diff --git a/clang/test/OpenMP/teams_distribute_parallel_for_codegen.cpp b/clang/test/OpenMP/teams_distribute_parallel_for_codegen.cpp
index b63e5aeddb7a..8fa73e76009b 100644
--- a/clang/test/OpenMP/teams_distribute_parallel_for_codegen.cpp
+++ b/clang/test/OpenMP/teams_distribute_parallel_for_codegen.cpp
@@ -32,7 +32,7 @@ int teams_argument_global(int n){
   // CK1: [[TH_CAST:%.+]] = alloca i{{32|64}},
   // CK1: [[TE_PAR:%.+]] = load{{.+}}, {{.+}} [[TE_CAST]],
   // CK1: [[TH_PAR:%.+]] = load{{.+}}, {{.+}} [[TH_CAST]],
-  // CK1: call void @__kmpc_push_target_tripcount(%struct.ident_t* @{{.+}}, i64 -1, i64 %{{.+}})
+  // CK1: call void @__kmpc_push_target_tripcount_mapper(%struct.ident_t* @{{.+}}, i64 -1, i64 %{{.+}})
   // CK1: call i32 @__tgt_target_teams_mapper(%struct.ident_t* @{{.+}}, i64 -1, i8* @{{[^,]+}}, i32 4, i8** %{{[^,]+}}, i8** %{{[^,]+}}, i{{64|32}}* {{.+}}@{{[^,]+}}, i32 0, i32 0), i64* {{.+}}@{{[^,]+}}, i32 0, i32 0), i8** null, i8** null, i32 {{.+}}, i32 {{.+}})
 
   // CK1: call void @[[OFFL1:.+]](i{{32|64}} [[TE_PAR]], i{{32|64}} [[TH_PAR]],
diff --git a/clang/test/OpenMP/teams_distribute_parallel_for_simd_codegen.cpp b/clang/test/OpenMP/teams_distribute_parallel_for_simd_codegen.cpp
index 3d479c4cd29d..9b3855c61759 100644
--- a/clang/test/OpenMP/teams_distribute_parallel_for_simd_codegen.cpp
+++ b/clang/test/OpenMP/teams_distribute_parallel_for_simd_codegen.cpp
@@ -33,7 +33,7 @@ int teams_argument_global(int n){
   // CK1: [[TE_PAR:%.+]] = load{{.+}}, {{.+}} [[TE_CAST]],
   // CK1: [[TH_PAR:%.+]] = load{{.+}}, {{.+}} [[TH_CAST]],
 
-  // CK1: call void @__kmpc_push_target_tripcount(%struct.ident_t* @{{.+}}, i64 -1, i64 %{{.+}})
+  // CK1: call void @__kmpc_push_target_tripcount_mapper(%struct.ident_t* @{{.+}}, i64 -1, i64 %{{.+}})
   // CK1: call i32 @__tgt_target_teams_mapper(%struct.ident_t* @{{.+}}, i64 -1, i8* @{{[^,]+}}, i32 4, i8** %{{[^,]+}}, i8** %{{[^,]+}}, i{{64|32}}* {{.+}}@{{[^,]+}}, i32 0, i32 0), i64* {{.+}}@{{[^,]+}}, i32 0, i32 0), i8** null
 
   // CK1: call void @[[OFFL1:.+]](i{{32|64}} [[TE_PAR]], i{{32|64}} [[TH_PAR]],
diff --git a/clang/test/OpenMP/teams_distribute_simd_codegen.cpp b/clang/test/OpenMP/teams_distribute_simd_codegen.cpp
index fd1214d22ce9..6e5d06b0c568 100644
--- a/clang/test/OpenMP/teams_distribute_simd_codegen.cpp
+++ b/clang/test/OpenMP/teams_distribute_simd_codegen.cpp
@@ -35,7 +35,7 @@ int teams_argument_global(int n) {
   // CK1: [[TE_PAR:%.+]] = load{{.+}}, {{.+}} [[TE_CAST]],
   // CK1: [[TH_PAR:%.+]] = load{{.+}}, {{.+}} [[TH_CAST]],
 
-  // CK1: call void @__kmpc_push_target_tripcount(%struct.ident_t* @{{.+}}, i64 -1, i64 %{{.+}})
+  // CK1: call void @__kmpc_push_target_tripcount_mapper(%struct.ident_t* @{{.+}}, i64 -1, i64 %{{.+}})
   // CK1: call i32 @__tgt_target_teams_mapper(%struct.ident_t* @{{.+}}, i64 -1, i8* @{{[^,]+}}, i32 5, i8** %{{[^,]+}}, i8** %{{[^,]+}}, i{{64|32}}* {{.+}}@{{[^,]+}}, i32 0, i32 0), i64* {{.+}}@{{[^,]+}}, i32 0, i32 0), i8** null, i8** null, i32 {{.+}}, i32 1)
 
   // CK1: call void @[[OFFL1:.+]](i{{32|64}} [[TE_PAR]], i{{32|64}} [[TH_PAR]],
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
index 844046167975..75d360bf4237 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
@@ -375,7 +375,7 @@ __OMP_RTL(__kmpc_init_allocator, false, /* omp_allocator_handle_t */ VoidPtr,
 __OMP_RTL(__kmpc_destroy_allocator, false, Void, /* Int */ Int32,
           /* omp_allocator_handle_t */ VoidPtr)
 
-__OMP_RTL(__kmpc_push_target_tripcount, false, Void, IdentPtr, Int64, Int64)
+__OMP_RTL(__kmpc_push_target_tripcount_mapper, false, Void, IdentPtr, Int64, Int64)
 __OMP_RTL(__tgt_target_mapper, false, Int32, IdentPtr, Int64, VoidPtr, Int32, VoidPtrPtr,
           VoidPtrPtr, Int64Ptr, Int64Ptr, VoidPtrPtr, VoidPtrPtr)
 __OMP_RTL(__tgt_target_nowait_mapper, false, Int32, IdentPtr, Int64, VoidPtr, Int32,
@@ -844,7 +844,7 @@ __OMP_RTL_ATTRS(__kmpc_free, AllocAttrs, AttributeSet(), {})
 __OMP_RTL_ATTRS(__kmpc_init_allocator, DefaultAttrs, ReturnPtrAttrs, {})
 __OMP_RTL_ATTRS(__kmpc_destroy_allocator, AllocAttrs, AttributeSet(), {})
 
-__OMP_RTL_ATTRS(__kmpc_push_target_tripcount, SetterAttrs, AttributeSet(), {})
+__OMP_RTL_ATTRS(__kmpc_push_target_tripcount_mapper, SetterAttrs, AttributeSet(), {})
 __OMP_RTL_ATTRS(__tgt_target_mapper, ForkAttrs, AttributeSet(), {})
 __OMP_RTL_ATTRS(__tgt_target_nowait_mapper, ForkAttrs, AttributeSet(), {})
 __OMP_RTL_ATTRS(__tgt_target_teams_mapper, ForkAttrs, AttributeSet(), {})
diff --git a/llvm/test/Transforms/OpenMP/add_attributes.ll b/llvm/test/Transforms/OpenMP/add_attributes.ll
index b294542667bd..8476f42dd529 100644
--- a/llvm/test/Transforms/OpenMP/add_attributes.ll
+++ b/llvm/test/Transforms/OpenMP/add_attributes.ll
@@ -627,7 +627,7 @@ declare i8* @__kmpc_init_allocator(i32, i8*, i32, i8*)
 
 declare void @__kmpc_destroy_allocator(i32, i8*)
 
-declare void @__kmpc_push_target_tripcount(%struct.ident_t*, i64, i64)
+declare void @__kmpc_push_target_tripcount_mapper(%struct.ident_t*, i64, i64)
 
 declare i32 @__kmpc_warp_active_thread_mask()
 
@@ -1144,7 +1144,7 @@ declare void @__kmpc_proxy_task_completed_ooo(i8*)
 ; CHECK-NEXT: declare void @__kmpc_destroy_allocator(i32, i8*)
 
 ; CHECK: ; Function Attrs: nounwind
-; CHECK-NEXT: declare void @__kmpc_push_target_tripcount(%struct.ident_t*, i64, i64)
+; CHECK-NEXT: declare void @__kmpc_push_target_tripcount_mapper(%struct.ident_t*, i64, i64)
 
 ; CHECK: ; Function Attrs: convergent nounwind
 ; CHECK-NEXT: declare i32 @__kmpc_warp_active_thread_mask()
@@ -1669,7 +1669,7 @@ declare void @__kmpc_proxy_task_completed_ooo(i8*)
 ; OPTIMISTIC-NEXT: declare void @__kmpc_destroy_allocator(i32, i8*)
 
 ; OPTIMISTIC: ; Function Attrs: inaccessiblememonly nofree nosync nounwind willreturn writeonly
-; OPTIMISTIC-NEXT: declare void @__kmpc_push_target_tripcount(%struct.ident_t*, i64, i64)
+; OPTIMISTIC-NEXT: declare void @__kmpc_push_target_tripcount_mapper(%struct.ident_t*, i64, i64)
 
 ; OPTIMISTIC: ; Function Attrs: convergent nounwind
 ; OPTIMISTIC-NEXT: declare i32 @__kmpc_warp_active_thread_mask()
diff --git a/openmp/libomptarget/include/omptarget.h b/openmp/libomptarget/include/omptarget.h
index 46bb8206efa1..36c25c33798a 100644
--- a/openmp/libomptarget/include/omptarget.h
+++ b/openmp/libomptarget/include/omptarget.h
@@ -283,8 +283,10 @@ int __tgt_target_teams_nowait_mapper(
     int32_t thread_limit, int32_t depNum, void *depList, int32_t noAliasDepNum,
     void *noAliasDepList);
 
-void __kmpc_push_target_tripcount(ident_t *loc, int64_t device_id,
-                                  uint64_t loop_tripcount);
+void __kmpc_push_target_tripcount(int64_t device_id, uint64_t loop_tripcount);
+
+void __kmpc_push_target_tripcount_mapper(ident_t *loc, int64_t device_id,
+                                         uint64_t loop_tripcount);
 
 #ifdef __cplusplus
 }
diff --git a/openmp/libomptarget/src/exports b/openmp/libomptarget/src/exports
index 5e09a088533d..b7fc1c8c3c86 100644
--- a/openmp/libomptarget/src/exports
+++ b/openmp/libomptarget/src/exports
@@ -25,6 +25,8 @@ VERS1.0 {
     __tgt_target_teams_nowait_mapper;
     __tgt_mapper_num_components;
     __tgt_push_mapper_component;
+    __kmpc_push_target_tripcount;
+    __kmpc_push_target_tripcount_mapper;
     omp_get_num_devices;
     omp_get_initial_device;
     omp_target_alloc;
@@ -34,7 +36,6 @@ VERS1.0 {
     omp_target_memcpy_rect;
     omp_target_associate_ptr;
     omp_target_disassociate_ptr;
-    __kmpc_push_target_tripcount;
   local:
     *;
 };
diff --git a/openmp/libomptarget/src/interface.cpp b/openmp/libomptarget/src/interface.cpp
index 01f3715d6bcc..b97676a6981b 100644
--- a/openmp/libomptarget/src/interface.cpp
+++ b/openmp/libomptarget/src/interface.cpp
@@ -514,8 +514,13 @@ EXTERN void __tgt_push_mapper_component(void *rt_mapper_handle, void *base,
       MapComponentInfoTy(base, begin, size, type, name));
 }
 
-EXTERN void __kmpc_push_target_tripcount(ident_t *loc, int64_t device_id,
+EXTERN void __kmpc_push_target_tripcount(int64_t device_id,
                                          uint64_t loop_tripcount) {
+  __kmpc_push_target_tripcount_mapper(nullptr, device_id, loop_tripcount);
+}
+
+EXTERN void __kmpc_push_target_tripcount_mapper(ident_t *loc, int64_t device_id,
+                                                uint64_t loop_tripcount) {
   TIMESCOPE_WITH_IDENT(loc);
   if (IsOffloadDisabled())
     return;
diff --git a/openmp/libomptarget/src/omptarget.cpp b/openmp/libomptarget/src/omptarget.cpp
index 37150aae2fe6..af6f7d09a4a2 100644
--- a/openmp/libomptarget/src/omptarget.cpp
+++ b/openmp/libomptarget/src/omptarget.cpp
@@ -900,8 +900,8 @@ TableMap *getTableMap(void *HostPtr) {
 
 /// Get loop trip count
 /// FIXME: This function will not work right if calling
-/// __kmpc_push_target_tripcount in one thread but doing offloading in another
-/// thread, which might occur when we call task yield.
+/// __kmpc_push_target_tripcount_mapper in one thread but doing offloading in
+/// another thread, which might occur when we call task yield.
 uint64_t getLoopTripCount(int64_t DeviceId) {
   DeviceTy &Device = PM->Devices[DeviceId];
   uint64_t LoopTripCount = 0;

From f05b649610564b11c481a20598dbb3f532c4602a Mon Sep 17 00:00:00 2001
From: Nikita Popov <nikita.ppv@gmail.com>
Date: Sun, 7 Mar 2021 17:27:22 +0100
Subject: [PATCH 197/318] [FastISel] Don't trivially kill extractvalues
 (PR49467)

All extractvalues of the same value at the same index will map to
the same register, so even if one specific extractvalue only has
one use, we should not mark it as a trivial kill, as there may be
more extractvalues later.

Fixes https://bugs.llvm.org/show_bug.cgi?id=49467.

Differential Revision: https://reviews.llvm.org/D98145

(cherry picked from commit 55ae279ba7a5905f39ce3ae79eac7834a4a134cc)
---
 llvm/include/llvm/CodeGen/FastISel.h       |  5 +++-
 llvm/lib/CodeGen/SelectionDAG/FastISel.cpp | 10 +++++---
 llvm/test/CodeGen/X86/pr49467.ll           | 27 ++++++++++++++++++++++
 3 files changed, 38 insertions(+), 4 deletions(-)
 create mode 100644 llvm/test/CodeGen/X86/pr49467.ll

diff --git a/llvm/include/llvm/CodeGen/FastISel.h b/llvm/include/llvm/CodeGen/FastISel.h
index 81c1d6aad49a..26bf4ab2618c 100644
--- a/llvm/include/llvm/CodeGen/FastISel.h
+++ b/llvm/include/llvm/CodeGen/FastISel.h
@@ -490,7 +490,10 @@ class FastISel {
   /// - \c Add has a constant operand.
   bool canFoldAddIntoGEP(const User *GEP, const Value *Add);
 
-  /// Test whether the given value has exactly one use.
+  /// Test whether the register associated with this value has exactly one use,
+  /// in which case that single use is killing. Note that multiple IR values
+  /// may map onto the same register, in which case this is not the same as
+  /// checking that an IR value has one use.
   bool hasTrivialKill(const Value *V);
 
   /// Create a machine mem operand from the given instruction.
diff --git a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp
index 62f7f3d98ba6..0ff77d4ba1ab 100644
--- a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp
@@ -261,12 +261,16 @@ bool FastISel::hasTrivialKill(const Value *V) {
     if (GEP->hasAllZeroIndices() && !hasTrivialKill(GEP->getOperand(0)))
       return false;
 
+  // Casts and extractvalues may be trivially coalesced by fast-isel.
+  if (I->getOpcode() == Instruction::BitCast ||
+      I->getOpcode() == Instruction::PtrToInt ||
+      I->getOpcode() == Instruction::IntToPtr ||
+      I->getOpcode() == Instruction::ExtractValue)
+    return false;
+
   // Only instructions with a single use in the same basic block are considered
   // to have trivial kills.
   return I->hasOneUse() &&
-         !(I->getOpcode() == Instruction::BitCast ||
-           I->getOpcode() == Instruction::PtrToInt ||
-           I->getOpcode() == Instruction::IntToPtr) &&
          cast<Instruction>(*I->user_begin())->getParent() == I->getParent();
 }
 
diff --git a/llvm/test/CodeGen/X86/pr49467.ll b/llvm/test/CodeGen/X86/pr49467.ll
new file mode 100644
index 000000000000..9b3502552066
--- /dev/null
+++ b/llvm/test/CodeGen/X86/pr49467.ll
@@ -0,0 +1,27 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -O0 -fast-isel -verify-machineinstrs -mtriple=x86_64 < %s | FileCheck %s
+
+declare { i8*, i64 } @get()
+
+declare void @use(i8*, i64)
+
+define void @test(i64* %p) nounwind {
+; CHECK-LABEL: test:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    movq %rdi, (%rsp) # 8-byte Spill
+; CHECK-NEXT:    callq get@PLT
+; CHECK-NEXT:    movq (%rsp), %rdi # 8-byte Reload
+; CHECK-NEXT:    movq %rdx, %rsi
+; CHECK-NEXT:    movq %rsi, (%rdi)
+; CHECK-NEXT:    # implicit-def: $rdi
+; CHECK-NEXT:    callq use@PLT
+; CHECK-NEXT:    popq %rax
+; CHECK-NEXT:    retq
+  %struct = call { i8*, i64 } @get()
+  %struct.1 = extractvalue { i8*, i64 } %struct, 1
+  store i64 %struct.1, i64* %p, align 8
+  %struct.2 = extractvalue { i8*, i64 } %struct, 1
+  call void @use(i8* undef, i64 %struct.2)
+  ret void
+}

From 79a79d1d01c4c206d8de3569c72747587d929770 Mon Sep 17 00:00:00 2001
From: Nikita Popov <nikita.ppv@gmail.com>
Date: Sun, 14 Mar 2021 16:39:03 +0100
Subject: [PATCH 198/318] [X86] Add test for PR49587 (NFC)

Shows a miscompile with FastISel.

(cherry picked from commit 0d814ca0f02733d6581bf209fadbebf3035380e0)
---
 llvm/test/CodeGen/X86/pr49587.ll | 32 ++++++++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)
 create mode 100644 llvm/test/CodeGen/X86/pr49587.ll

diff --git a/llvm/test/CodeGen/X86/pr49587.ll b/llvm/test/CodeGen/X86/pr49587.ll
new file mode 100644
index 000000000000..343f1a0149c0
--- /dev/null
+++ b/llvm/test/CodeGen/X86/pr49587.ll
@@ -0,0 +1,32 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -O0 -fast-isel -mtriple=x86_64-- < %s | FileCheck %s
+
+define i32 @test(i64 %arg) nounwind {
+; CHECK-LABEL: test:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $1, %rdi
+; CHECK-NEXT:    setb %al
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    jb .LBB0_2
+; CHECK-NEXT:  # %bb.1: # %no_overflow
+; CHECK-NEXT:    movl $1, %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    jmp .LBB0_2
+; CHECK-NEXT:  .LBB0_2: # %merge
+; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
+; CHECK-NEXT:    retq
+entry:
+  %usubo = tail call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %arg, i64 1)
+  %overflow = extractvalue { i64, i1 } %usubo, 1
+  br i1 %overflow, label %merge, label %no_overflow
+
+no_overflow:
+  br label %merge
+
+merge:
+  %phi = phi i32 [ 1, %no_overflow ], [ 0, %entry ]
+  ret i32 %phi
+}
+
+declare { i64, i1 } @llvm.usub.with.overflow.i64(i64, i64)

From 38dd45b00431e2c065e172751492e0ded59e49e6 Mon Sep 17 00:00:00 2001
From: Nikita Popov <nikita.ppv@gmail.com>
Date: Sun, 14 Mar 2021 16:47:41 +0100
Subject: [PATCH 199/318] [X86][FastISel] Fix with.overflow eflags clobber
 (PR49587)

If the successor block has a phi node, then additional moves may
be inserted into predecessors, which may clobber eflags. Don't try
to fold the with.overflow result into the branch in that case.

This is done by explicitly checking for any phis in successor
blocks, not sure if there's some more principled way to address
this. Other fused compare and branch patterns avoid the issue by
emitting the comparison when handling the branch, so that no
instructions may be inserted in between. In this case, the
with.overflow call is emitted separately (and I don't think this
is avoidable, as it will generally have at least two users).

Fixes https://bugs.llvm.org/show_bug.cgi?id=49587.

Differential Revision: https://reviews.llvm.org/D98600

(cherry picked from commit 7669455df49e6fc8ae7d9f4bd4ee95bb20e7eb6e)
---
 llvm/lib/Target/X86/X86FastISel.cpp | 8 ++++++++
 llvm/test/CodeGen/X86/pr49587.ll    | 5 +++--
 2 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/X86/X86FastISel.cpp b/llvm/lib/Target/X86/X86FastISel.cpp
index caf158102230..a1a16a19f5e5 100644
--- a/llvm/lib/Target/X86/X86FastISel.cpp
+++ b/llvm/lib/Target/X86/X86FastISel.cpp
@@ -284,6 +284,14 @@ bool X86FastISel::foldX86XALUIntrinsic(X86::CondCode &CC, const Instruction *I,
       return false;
   }
 
+  // Make sure no potentially eflags clobbering phi moves can be inserted in
+  // between.
+  auto HasPhis = [](const BasicBlock *Succ) {
+    return !llvm::empty(Succ->phis());
+  };
+  if (I->isTerminator() && llvm::any_of(successors(I), HasPhis))
+    return false;
+
   CC = TmpCC;
   return true;
 }
diff --git a/llvm/test/CodeGen/X86/pr49587.ll b/llvm/test/CodeGen/X86/pr49587.ll
index 343f1a0149c0..7dc54a526608 100644
--- a/llvm/test/CodeGen/X86/pr49587.ll
+++ b/llvm/test/CodeGen/X86/pr49587.ll
@@ -5,10 +5,11 @@ define i32 @test(i64 %arg) nounwind {
 ; CHECK-LABEL: test:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    subq $1, %rdi
-; CHECK-NEXT:    setb %al
+; CHECK-NEXT:    setb %cl
 ; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    testb $1, %cl
 ; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    jb .LBB0_2
+; CHECK-NEXT:    jne .LBB0_2
 ; CHECK-NEXT:  # %bb.1: # %no_overflow
 ; CHECK-NEXT:    movl $1, %eax
 ; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill

From 5b3480610383ba281ef0c7918a6c097058a408d4 Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Fri, 12 Mar 2021 14:15:27 -0500
Subject: [PATCH 200/318] [InstCombine] add test for zext-of-icmps; NFC

PR49475 shows an infinite loop outcome, but this
tries to show the root cause with a minimal test.

(cherry picked from commit 579b8fc2e97c489308f97b01d13d894c03c0a16c)
---
 .../Transforms/InstCombine/zext-or-icmp.ll    | 20 +++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/llvm/test/Transforms/InstCombine/zext-or-icmp.ll b/llvm/test/Transforms/InstCombine/zext-or-icmp.ll
index a77aa7ac7ebd..54ae0858aa67 100644
--- a/llvm/test/Transforms/InstCombine/zext-or-icmp.ll
+++ b/llvm/test/Transforms/InstCombine/zext-or-icmp.ll
@@ -106,3 +106,23 @@ block2:
   %conv2 = zext i1 %cmp1 to i32
   ret i32 %conv2
 }
+
+; FIXME: This should not end with more instructions than it started from.
+
+define i32 @PR49475(i32 %x, i16 %y) {
+; CHECK-LABEL: @PR49475(
+; CHECK-NEXT:    [[M:%.*]] = and i16 [[Y:%.*]], 1
+; CHECK-NEXT:    [[B1:%.*]] = icmp eq i32 [[X:%.*]], 0
+; CHECK-NEXT:    [[B11:%.*]] = zext i1 [[B1]] to i32
+; CHECK-NEXT:    [[TMP1:%.*]] = xor i16 [[M]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = zext i16 [[TMP1]] to i32
+; CHECK-NEXT:    [[Z3:%.*]] = or i32 [[B11]], [[TMP2]]
+; CHECK-NEXT:    ret i32 [[Z3]]
+;
+  %m = and i16 %y, 1
+  %b1 = icmp eq i32 %x, 0
+  %b2 = icmp eq i16 %m, 0
+  %t1 = or i1 %b1, %b2
+  %z = zext i1 %t1 to i32
+  ret i32 %z
+}

From ff2cf8fafa5ad9a76e59fa086d969d4e2ecc3a39 Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Sat, 13 Mar 2021 08:26:27 -0500
Subject: [PATCH 201/318] [InstCombine] avoid creating an extra instruction in
 zext fold and possible inf-loop

The structure of this fold is suspect vs. most of instcombine
because it creates instructions and tries to delete them
immediately after.

If we don't have the operand types for the icmps, then we are
not behaving as assumed. And as shown in PR49475, we can inf-loop.

(cherry picked from commit 4224a36957420744756d6a6450eb6502a1bfadc3)
---
 .../InstCombine/InstCombineCasts.cpp          |  1 +
 .../Transforms/InstCombine/zext-or-icmp.ll    | 58 +++++++++++++++++--
 2 files changed, 53 insertions(+), 6 deletions(-)

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
index 0b53007bb6dc..07e68c44416d 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -1270,6 +1270,7 @@ Instruction *InstCombinerImpl::visitZExt(ZExtInst &CI) {
     ICmpInst *LHS = dyn_cast<ICmpInst>(SrcI->getOperand(0));
     ICmpInst *RHS = dyn_cast<ICmpInst>(SrcI->getOperand(1));
     if (LHS && RHS && LHS->hasOneUse() && RHS->hasOneUse() &&
+        LHS->getOperand(0)->getType() == RHS->getOperand(0)->getType() &&
         (transformZExtICmp(LHS, CI, false) ||
          transformZExtICmp(RHS, CI, false))) {
       // zext (or icmp, icmp) -> or (zext icmp), (zext icmp)
diff --git a/llvm/test/Transforms/InstCombine/zext-or-icmp.ll b/llvm/test/Transforms/InstCombine/zext-or-icmp.ll
index 54ae0858aa67..5ae3d8ea0dba 100644
--- a/llvm/test/Transforms/InstCombine/zext-or-icmp.ll
+++ b/llvm/test/Transforms/InstCombine/zext-or-icmp.ll
@@ -107,17 +107,16 @@ block2:
   ret i32 %conv2
 }
 
-; FIXME: This should not end with more instructions than it started from.
+; This should not end with more instructions than it started from.
 
 define i32 @PR49475(i32 %x, i16 %y) {
 ; CHECK-LABEL: @PR49475(
 ; CHECK-NEXT:    [[M:%.*]] = and i16 [[Y:%.*]], 1
 ; CHECK-NEXT:    [[B1:%.*]] = icmp eq i32 [[X:%.*]], 0
-; CHECK-NEXT:    [[B11:%.*]] = zext i1 [[B1]] to i32
-; CHECK-NEXT:    [[TMP1:%.*]] = xor i16 [[M]], 1
-; CHECK-NEXT:    [[TMP2:%.*]] = zext i16 [[TMP1]] to i32
-; CHECK-NEXT:    [[Z3:%.*]] = or i32 [[B11]], [[TMP2]]
-; CHECK-NEXT:    ret i32 [[Z3]]
+; CHECK-NEXT:    [[B2:%.*]] = icmp eq i16 [[M]], 0
+; CHECK-NEXT:    [[T1:%.*]] = or i1 [[B1]], [[B2]]
+; CHECK-NEXT:    [[Z:%.*]] = zext i1 [[T1]] to i32
+; CHECK-NEXT:    ret i32 [[Z]]
 ;
   %m = and i16 %y, 1
   %b1 = icmp eq i32 %x, 0
@@ -126,3 +125,50 @@ define i32 @PR49475(i32 %x, i16 %y) {
   %z = zext i1 %t1 to i32
   ret i32 %z
 }
+
+; This would infinite-loop.
+
+define i8 @PR49475_infloop(i32 %t0, i16 %insert, i64 %e, i8 %i162) {
+; CHECK-LABEL: @PR49475_infloop(
+; CHECK-NEXT:    [[B:%.*]] = icmp eq i32 [[T0:%.*]], 0
+; CHECK-NEXT:    [[B2:%.*]] = icmp eq i16 [[INSERT:%.*]], 0
+; CHECK-NEXT:    [[T1:%.*]] = or i1 [[B]], [[B2]]
+; CHECK-NEXT:    [[EXT:%.*]] = zext i1 [[T1]] to i32
+; CHECK-NEXT:    [[AND:%.*]] = and i32 [[EXT]], [[T0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = or i32 [[AND]], 140
+; CHECK-NEXT:    [[XOR1:%.*]] = zext i32 [[TMP1]] to i64
+; CHECK-NEXT:    [[CONV16:%.*]] = sext i8 [[I162:%.*]] to i64
+; CHECK-NEXT:    [[SUB17:%.*]] = sub i64 [[CONV16]], [[E:%.*]]
+; CHECK-NEXT:    [[SEXT:%.*]] = shl i64 [[SUB17]], 32
+; CHECK-NEXT:    [[CONV18:%.*]] = ashr exact i64 [[SEXT]], 32
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sle i64 [[CONV18]], [[XOR1]]
+; CHECK-NEXT:    [[CONV19:%.*]] = zext i1 [[CMP]] to i16
+; CHECK-NEXT:    [[OR21:%.*]] = or i16 [[CONV19]], [[INSERT]]
+; CHECK-NEXT:    [[TRUNC44:%.*]] = trunc i16 [[OR21]] to i8
+; CHECK-NEXT:    [[INC:%.*]] = or i8 [[TRUNC44]], [[I162]]
+; CHECK-NEXT:    [[TOBOOL23_NOT:%.*]] = icmp eq i16 [[OR21]], 0
+; CHECK-NEXT:    call void @llvm.assume(i1 [[TOBOOL23_NOT]])
+; CHECK-NEXT:    ret i8 [[INC]]
+;
+  %b = icmp eq i32 %t0, 0
+  %b2 = icmp eq i16 %insert, 0
+  %t1 = or i1 %b, %b2
+  %ext = zext i1 %t1 to i32
+  %and = and i32 %t0, %ext
+  %conv13 = zext i32 %and to i64
+  %xor = xor i64 %conv13, 140
+  %conv16 = sext i8 %i162 to i64
+  %sub17 = sub i64 %conv16, %e
+  %sext = shl i64 %sub17, 32
+  %conv18 = ashr exact i64 %sext, 32
+  %cmp = icmp sge i64 %xor, %conv18
+  %conv19 = zext i1 %cmp to i16
+  %or21 = or i16 %insert, %conv19
+  %trunc44 = trunc i16 %or21 to i8
+  %inc = add i8 %i162, %trunc44
+  %tobool23.not = icmp eq i16 %or21, 0
+  call void @llvm.assume(i1 %tobool23.not)
+  ret i8 %inc
+}
+
+declare void @llvm.assume(i1 noundef)

From 9ae9ab1ca34384e07b751c16645e22a0b953b08b Mon Sep 17 00:00:00 2001
From: ShihPo Hung <shihpo.hung@sifive.com>
Date: Tue, 30 Mar 2021 14:30:15 -0700
Subject: [PATCH 202/318] [RISCV][MC] Fix nf encoding for vector ld/st whole
 register

The three bit nf is one less than the number of NFIELDS,
so we manually decrement 1 for VS1/2/4/8R & VL1/2/4/8R.

Differential revision: https://reviews.llvm.org/D98185

(cherry picked from commit rG5cdb2e98608bf57c216ee7067e8a12d070c9e2bd)
---
 llvm/lib/Target/RISCV/RISCVInstrInfoV.td | 16 +++---
 llvm/test/MC/RISCV/rvv/aliases.s         | 16 +++---
 llvm/test/MC/RISCV/rvv/load.s            | 64 ++++++++++++------------
 llvm/test/MC/RISCV/rvv/store.s           | 16 +++---
 4 files changed, 56 insertions(+), 56 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoV.td b/llvm/lib/Target/RISCV/RISCVInstrInfoV.td
index 86fbc73d81d5..b3fc76aee161 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoV.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoV.td
@@ -504,19 +504,19 @@ def VSOXEI16_V : VIndexedStore<MOPSTIndexedOrder, LSWidth16, "vsoxei16.v">;
 def VSOXEI32_V : VIndexedStore<MOPSTIndexedOrder, LSWidth32, "vsoxei32.v">;
 def VSOXEI64_V : VIndexedStore<MOPSTIndexedOrder, LSWidth64, "vsoxei64.v">;
 
-defm VL1R : VWholeLoad<1, "vl1r">;
-defm VL2R : VWholeLoad<2, "vl2r">;
-defm VL4R : VWholeLoad<4, "vl4r">;
-defm VL8R : VWholeLoad<8, "vl8r">;
+defm VL1R : VWholeLoad<0, "vl1r">;
+defm VL2R : VWholeLoad<1, "vl2r">;
+defm VL4R : VWholeLoad<3, "vl4r">;
+defm VL8R : VWholeLoad<7, "vl8r">;
 def : InstAlias<"vl1r.v $vd, (${rs1})", (VL1RE8_V VR:$vd, GPR:$rs1)>;
 def : InstAlias<"vl2r.v $vd, (${rs1})", (VL2RE8_V VR:$vd, GPR:$rs1)>;
 def : InstAlias<"vl4r.v $vd, (${rs1})", (VL4RE8_V VR:$vd, GPR:$rs1)>;
 def : InstAlias<"vl8r.v $vd, (${rs1})", (VL8RE8_V VR:$vd, GPR:$rs1)>;
 
-def VS1R_V : VWholeStore<1, "vs1r.v">;
-def VS2R_V : VWholeStore<2, "vs2r.v">;
-def VS4R_V : VWholeStore<4, "vs4r.v">;
-def VS8R_V : VWholeStore<8, "vs8r.v">;
+def VS1R_V : VWholeStore<0, "vs1r.v">;
+def VS2R_V : VWholeStore<1, "vs2r.v">;
+def VS4R_V : VWholeStore<3, "vs4r.v">;
+def VS8R_V : VWholeStore<7, "vs8r.v">;
 
 // Vector Single-Width Integer Add and Subtract
 defm VADD_V : VALU_IV_V_X_I<"vadd", 0b000000>;
diff --git a/llvm/test/MC/RISCV/rvv/aliases.s b/llvm/test/MC/RISCV/rvv/aliases.s
index 2e5120c91e45..ebe9e79399a6 100644
--- a/llvm/test/MC/RISCV/rvv/aliases.s
+++ b/llvm/test/MC/RISCV/rvv/aliases.s
@@ -54,17 +54,17 @@ vmset.m v0
 # ALIAS:    vmnot.m v0, v1                  # encoding: [0x57,0xa0,0x10,0x76]
 # NO-ALIAS: vmnand.mm       v0, v1, v1      # encoding: [0x57,0xa0,0x10,0x76]
 vmnot.m v0, v1
-# ALIAS:    vl1r.v          v0, (a0)        # encoding: [0x07,0x00,0x85,0x22]
-# NO-ALIAS: vl1re8.v        v0, (a0)        # encoding: [0x07,0x00,0x85,0x22]
+# ALIAS:    vl1r.v          v0, (a0)        # encoding: [0x07,0x00,0x85,0x02]
+# NO-ALIAS: vl1re8.v        v0, (a0)        # encoding: [0x07,0x00,0x85,0x02]
 vl1r.v v0, (a0) 
-# ALIAS:    vl2r.v          v0, (a0)        # encoding: [0x07,0x00,0x85,0x42]
-# NO-ALIAS: vl2re8.v        v0, (a0)        # encoding: [0x07,0x00,0x85,0x42]
+# ALIAS:    vl2r.v          v0, (a0)        # encoding: [0x07,0x00,0x85,0x22]
+# NO-ALIAS: vl2re8.v        v0, (a0)        # encoding: [0x07,0x00,0x85,0x22]
 vl2r.v v0, (a0) 
-# ALIAS:    vl4r.v          v0, (a0)        # encoding: [0x07,0x00,0x85,0x82]
-# NO-ALIAS: vl4re8.v        v0, (a0)        # encoding: [0x07,0x00,0x85,0x82]
+# ALIAS:    vl4r.v          v0, (a0)        # encoding: [0x07,0x00,0x85,0x62]
+# NO-ALIAS: vl4re8.v        v0, (a0)        # encoding: [0x07,0x00,0x85,0x62]
 vl4r.v v0, (a0) 
-# ALIAS:    vl8r.v          v0, (a0)        # encoding: [0x07,0x00,0x85,0x02]
-# NO-ALIAS: vl8re8.v        v0, (a0)        # encoding: [0x07,0x00,0x85,0x02]
+# ALIAS:    vl8r.v          v0, (a0)        # encoding: [0x07,0x00,0x85,0xe2]
+# NO-ALIAS: vl8re8.v        v0, (a0)        # encoding: [0x07,0x00,0x85,0xe2]
 vl8r.v v0, (a0) 
 # ALIAS:    vneg.v          v2, v1, v0.t    # encoding: [0x57,0x41,0x10,0x0c]
 # NO-ALIAS: vrsub.vx        v2, v1, zero, v0.t # encoding: [0x57,0x41,0x10,0x0c]
diff --git a/llvm/test/MC/RISCV/rvv/load.s b/llvm/test/MC/RISCV/rvv/load.s
index 3d0dbb15c36e..45a3881cb60d 100644
--- a/llvm/test/MC/RISCV/rvv/load.s
+++ b/llvm/test/MC/RISCV/rvv/load.s
@@ -256,96 +256,96 @@ vloxei64.v v8, (a0), v4
 
 vl1re8.v v8, (a0)
 # CHECK-INST: vl1re8.v v8, (a0)
-# CHECK-ENCODING: [0x07,0x04,0x85,0x22]
+# CHECK-ENCODING: [0x07,0x04,0x85,0x02]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Instructions)
-# CHECK-UNKNOWN: 07 04 85 22 <unknown>
+# CHECK-UNKNOWN: 07 04 85 02 <unknown>
 
 vl1re16.v v8, (a0)
 # CHECK-INST: vl1re16.v v8, (a0)
-# CHECK-ENCODING: [0x07,0x54,0x85,0x22]
+# CHECK-ENCODING: [0x07,0x54,0x85,0x02]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Instructions)
-# CHECK-UNKNOWN: 07 54 85 22 <unknown>
+# CHECK-UNKNOWN: 07 54 85 02 <unknown>
 
 vl1re32.v v8, (a0)
 # CHECK-INST: vl1re32.v v8, (a0)
-# CHECK-ENCODING: [0x07,0x64,0x85,0x22]
+# CHECK-ENCODING: [0x07,0x64,0x85,0x02]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Instructions)
-# CHECK-UNKNOWN: 07 64 85 22 <unknown>
+# CHECK-UNKNOWN: 07 64 85 02 <unknown>
 
 vl1re64.v v8, (a0)
 # CHECK-INST: vl1re64.v v8, (a0)
-# CHECK-ENCODING: [0x07,0x74,0x85,0x22]
+# CHECK-ENCODING: [0x07,0x74,0x85,0x02]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Instructions)
-# CHECK-UNKNOWN: 07 74 85 22 <unknown>
+# CHECK-UNKNOWN: 07 74 85 02 <unknown>
 
 vl2re8.v v8, (a0)
 # CHECK-INST: vl2re8.v v8, (a0)
-# CHECK-ENCODING: [0x07,0x04,0x85,0x42]
+# CHECK-ENCODING: [0x07,0x04,0x85,0x22]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Instructions)
-# CHECK-UNKNOWN: 07 04 85 42 <unknown>
+# CHECK-UNKNOWN: 07 04 85 22 <unknown>
 
 vl2re16.v v8, (a0)
 # CHECK-INST: vl2re16.v v8, (a0)
-# CHECK-ENCODING: [0x07,0x54,0x85,0x42]
+# CHECK-ENCODING: [0x07,0x54,0x85,0x22]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Instructions)
-# CHECK-UNKNOWN: 07 54 85 42 <unknown>
+# CHECK-UNKNOWN: 07 54 85 22 <unknown>
 
 vl2re32.v v8, (a0)
 # CHECK-INST: vl2re32.v v8, (a0)
-# CHECK-ENCODING: [0x07,0x64,0x85,0x42]
+# CHECK-ENCODING: [0x07,0x64,0x85,0x22]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Instructions)
-# CHECK-UNKNOWN: 07 64 85 42 <unknown>
+# CHECK-UNKNOWN: 07 64 85 22 <unknown>
 
 vl2re64.v v8, (a0)
 # CHECK-INST: vl2re64.v v8, (a0)
-# CHECK-ENCODING: [0x07,0x74,0x85,0x42]
+# CHECK-ENCODING: [0x07,0x74,0x85,0x22]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Instructions)
-# CHECK-UNKNOWN: 07 74 85 42 <unknown>
+# CHECK-UNKNOWN: 07 74 85 22 <unknown>
 
 vl4re8.v v8, (a0)
 # CHECK-INST: vl4re8.v v8, (a0)
-# CHECK-ENCODING: [0x07,0x04,0x85,0x82]
+# CHECK-ENCODING: [0x07,0x04,0x85,0x62]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Instructions)
-# CHECK-UNKNOWN: 07 04 85 82 <unknown>
+# CHECK-UNKNOWN: 07 04 85 62 <unknown>
 
 vl4re16.v v8, (a0)
 # CHECK-INST: vl4re16.v v8, (a0)
-# CHECK-ENCODING: [0x07,0x54,0x85,0x82]
+# CHECK-ENCODING: [0x07,0x54,0x85,0x62]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Instructions)
-# CHECK-UNKNOWN: 07 54 85 82 <unknown>
+# CHECK-UNKNOWN: 07 54 85 62 <unknown>
 
 vl4re32.v v8, (a0)
 # CHECK-INST: vl4re32.v v8, (a0)
-# CHECK-ENCODING: [0x07,0x64,0x85,0x82]
+# CHECK-ENCODING: [0x07,0x64,0x85,0x62]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Instructions)
-# CHECK-UNKNOWN: 07 64 85 82 <unknown>
+# CHECK-UNKNOWN: 07 64 85 62 <unknown>
 
 vl4re64.v v8, (a0)
 # CHECK-INST: vl4re64.v v8, (a0)
-# CHECK-ENCODING: [0x07,0x74,0x85,0x82]
+# CHECK-ENCODING: [0x07,0x74,0x85,0x62]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Instructions)
-# CHECK-UNKNOWN: 07 74 85 82 <unknown>
+# CHECK-UNKNOWN: 07 74 85 62 <unknown>
 
 vl8re8.v v8, (a0)
 # CHECK-INST: vl8re8.v v8, (a0)
-# CHECK-ENCODING: [0x07,0x04,0x85,0x02]
+# CHECK-ENCODING: [0x07,0x04,0x85,0xe2]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Instructions)
-# CHECK-UNKNOWN: 07 04 85 02 <unknown>
+# CHECK-UNKNOWN: 07 04 85 e2 <unknown>
 
 vl8re16.v v8, (a0)
 # CHECK-INST: vl8re16.v v8, (a0)
-# CHECK-ENCODING: [0x07,0x54,0x85,0x02]
+# CHECK-ENCODING: [0x07,0x54,0x85,0xe2]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Instructions)
-# CHECK-UNKNOWN: 07 54 85 02 <unknown>
+# CHECK-UNKNOWN: 07 54 85 e2 <unknown>
 
 vl8re32.v v8, (a0)
 # CHECK-INST: vl8re32.v v8, (a0)
-# CHECK-ENCODING: [0x07,0x64,0x85,0x02]
+# CHECK-ENCODING: [0x07,0x64,0x85,0xe2]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Instructions)
-# CHECK-UNKNOWN: 07 64 85 02 <unknown>
+# CHECK-UNKNOWN: 07 64 85 e2 <unknown>
 
 vl8re64.v v8, (a0)
 # CHECK-INST: vl8re64.v v8, (a0)
-# CHECK-ENCODING: [0x07,0x74,0x85,0x02]
+# CHECK-ENCODING: [0x07,0x74,0x85,0xe2]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Instructions)
-# CHECK-UNKNOWN: 07 74 85 02 <unknown>
+# CHECK-UNKNOWN: 07 74 85 e2 <unknown>
diff --git a/llvm/test/MC/RISCV/rvv/store.s b/llvm/test/MC/RISCV/rvv/store.s
index e4795aa1c2c9..b5a75ac2d008 100644
--- a/llvm/test/MC/RISCV/rvv/store.s
+++ b/llvm/test/MC/RISCV/rvv/store.s
@@ -208,24 +208,24 @@ vsoxei64.v v24, (a0), v4
 
 vs1r.v v24, (a0)
 # CHECK-INST: vs1r.v v24, (a0)
-# CHECK-ENCODING: [0x27,0x0c,0x85,0x22]
+# CHECK-ENCODING: [0x27,0x0c,0x85,0x02]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Instructions)
-# CHECK-UNKNOWN: 27 0c 85 22 <unknown>
+# CHECK-UNKNOWN: 27 0c 85 02 <unknown>
 
 vs2r.v v24, (a0)
 # CHECK-INST: vs2r.v v24, (a0)
-# CHECK-ENCODING: [0x27,0x0c,0x85,0x42]
+# CHECK-ENCODING: [0x27,0x0c,0x85,0x22]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Instructions)
-# CHECK-UNKNOWN: 27 0c 85 42 <unknown>
+# CHECK-UNKNOWN: 27 0c 85 22 <unknown>
 
 vs4r.v v24, (a0)
 # CHECK-INST: vs4r.v v24, (a0)
-# CHECK-ENCODING: [0x27,0x0c,0x85,0x82]
+# CHECK-ENCODING: [0x27,0x0c,0x85,0x62]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Instructions)
-# CHECK-UNKNOWN: 27 0c 85 82 <unknown>
+# CHECK-UNKNOWN: 27 0c 85 62 <unknown>
 
 vs8r.v v24, (a0)
 # CHECK-INST: vs8r.v v24, (a0)
-# CHECK-ENCODING: [0x27,0x0c,0x85,0x02]
+# CHECK-ENCODING: [0x27,0x0c,0x85,0xe2]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Instructions)
-# CHECK-UNKNOWN: 27 0c 85 02 <unknown>
+# CHECK-UNKNOWN: 27 0c 85 e2 <unknown>

From 31001be371e8f2c74470e727e54503fb2aabec8b Mon Sep 17 00:00:00 2001
From: LemonBoy <thatlemon@gmail.com>
Date: Wed, 17 Mar 2021 16:59:55 +0100
Subject: [PATCH 203/318] [LoopVectorize] Refine hasIrregularType predicate

The `hasIrregularType` predicate checks whether an array of N values of type Ty is "bitcast-compatible" with a <N x Ty> vector.
The previous check returned invalid results in some cases where there's some padding between the array elements: eg. a 4-element array of u7 values is considered as compatible with <4 x u7>, even though the vector is only loading/storing 28 bits instead of 32.

The problem causes LLVM to generate incorrect code for some targets: for AArch64 the vector loads/stores are lowered in terms of ubfx/bfi, effectively losing the top (N * padding bits).

Reviewed By: lebedev.ri

Differential Revision: https://reviews.llvm.org/D97465

(cherry picked from commit 4f024938e4c932feba4d28573ec4522106f8d879)
---
 .../Transforms/Vectorize/LoopVectorize.cpp    | 22 +++++----------
 .../LoopVectorize/irregular_type.ll           | 27 +++++++++++++++++++
 2 files changed, 34 insertions(+), 15 deletions(-)
 create mode 100644 llvm/test/Transforms/LoopVectorize/irregular_type.ll

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index d36e078444bc..b456a97aa4ec 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -372,19 +372,11 @@ static Type *getMemInstValueType(Value *I) {
 
 /// A helper function that returns true if the given type is irregular. The
 /// type is irregular if its allocated size doesn't equal the store size of an
-/// element of the corresponding vector type at the given vectorization factor.
-static bool hasIrregularType(Type *Ty, const DataLayout &DL, ElementCount VF) {
-  // Determine if an array of VF elements of type Ty is "bitcast compatible"
-  // with a <VF x Ty> vector.
-  if (VF.isVector()) {
-    auto *VectorTy = VectorType::get(Ty, VF);
-    return TypeSize::get(VF.getKnownMinValue() *
-                             DL.getTypeAllocSize(Ty).getFixedValue(),
-                         VF.isScalable()) != DL.getTypeStoreSize(VectorTy);
-  }
-
-  // If the vectorization factor is one, we just check if an array of type Ty
-  // requires padding between elements.
+/// element of the corresponding vector type.
+static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
+  // Determine if an array of N elements of type Ty is "bitcast compatible"
+  // with a <N x Ty> vector.
+  // This is only true if there is no padding between the array elements.
   return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
 }
 
@@ -5212,7 +5204,7 @@ bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
   // requires padding and will be scalarized.
   auto &DL = I->getModule()->getDataLayout();
   auto *ScalarTy = getMemInstValueType(I);
-  if (hasIrregularType(ScalarTy, DL, VF))
+  if (hasIrregularType(ScalarTy, DL))
     return false;
 
   // Check if masking is required.
@@ -5259,7 +5251,7 @@ bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
   // requires padding and will be scalarized.
   auto &DL = I->getModule()->getDataLayout();
   auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
-  if (hasIrregularType(ScalarTy, DL, VF))
+  if (hasIrregularType(ScalarTy, DL))
     return false;
 
   return true;
diff --git a/llvm/test/Transforms/LoopVectorize/irregular_type.ll b/llvm/test/Transforms/LoopVectorize/irregular_type.ll
new file mode 100644
index 000000000000..167a1a101e6f
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/irregular_type.ll
@@ -0,0 +1,27 @@
+; RUN: opt %s -loop-vectorize -force-vector-width=4 -S | FileCheck %s
+
+; Ensure the array loads/stores are not optimized into vector operations when
+; the element type has padding bits.
+
+; CHECK: foo
+; CHECK: vector.body
+; CHECK-NOT: load <4 x i7>
+; CHECK-NOT: store <4 x i7>
+; CHECK: for.body
+define void @foo(i7* %a, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i7, i7* %a, i64 %indvars.iv
+  %0 = load i7, i7* %arrayidx, align 1
+  %sub = add nuw nsw i7 %0, 0
+  store i7 %sub, i7* %arrayidx, align 1
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %cmp = icmp eq i64 %indvars.iv.next, %n
+  br i1 %cmp, label %for.exit, label %for.body
+
+for.exit:
+  ret void
+}

From 04ba60cfe598e41084fb848daae47e0ed910fa7d Mon Sep 17 00:00:00 2001
From: Lang Hames <lhames@gmail.com>
Date: Sun, 28 Mar 2021 16:30:47 -0700
Subject: [PATCH 204/318] [ORC][C-bindings] Fix some ORC C bindings function
 names and signatures.

LLVMOrcDisposeObjectLayer and LLVMOrcExecutionSessionGetJITDylibByName did not
have matching signatures between the C-API header and binding implementations.
Fixes http://llvm.org/PR49745.

Patch by Mats Larsen. Thanks Mats!

Reviewed by: lhames

Differential Revision: https://reviews.llvm.org/D99478

(cherry picked from commit 666df2e2cbe9fc252d3b2d6cbb214c2c2f6afc65)
---
 llvm/include/llvm-c/Orc.h                       | 7 ++++---
 llvm/lib/ExecutionEngine/Orc/OrcV2CBindings.cpp | 2 +-
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/llvm/include/llvm-c/Orc.h b/llvm/include/llvm-c/Orc.h
index 183107c148a6..9beef44c89dd 100644
--- a/llvm/include/llvm-c/Orc.h
+++ b/llvm/include/llvm-c/Orc.h
@@ -339,8 +339,7 @@ LLVMErrorRef LLVMOrcResourceTrackerRemove(LLVMOrcResourceTrackerRef RT);
  * ownership has not been passed to a JITDylib (e.g. because some error
  * prevented the client from calling LLVMOrcJITDylibAddGenerator).
  */
-void LLVMOrcDisposeDefinitionGenerator(
-    LLVMOrcDefinitionGeneratorRef DG);
+void LLVMOrcDisposeDefinitionGenerator(LLVMOrcDefinitionGeneratorRef DG);
 
 /**
  * Dispose of a MaterializationUnit.
@@ -388,7 +387,9 @@ LLVMOrcExecutionSessionCreateJITDylib(LLVMOrcExecutionSessionRef ES,
  * Returns the JITDylib with the given name, or NULL if no such JITDylib
  * exists.
  */
-LLVMOrcJITDylibRef LLVMOrcExecutionSessionGetJITDylibByName(const char *Name);
+LLVMOrcJITDylibRef
+LLVMOrcExecutionSessionGetJITDylibByName(LLVMOrcExecutionSessionRef ES,
+                                         const char *Name);
 
 /**
  * Return a reference to a newly created resource tracker associated with JD.
diff --git a/llvm/lib/ExecutionEngine/Orc/OrcV2CBindings.cpp b/llvm/lib/ExecutionEngine/Orc/OrcV2CBindings.cpp
index dfdd2c6c669f..834d4cc8f514 100644
--- a/llvm/lib/ExecutionEngine/Orc/OrcV2CBindings.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/OrcV2CBindings.cpp
@@ -393,7 +393,7 @@ void LLVMOrcDisposeJITTargetMachineBuilder(
   delete unwrap(JTMB);
 }
 
-void lLVMOrcDisposeObjectLayer(LLVMOrcObjectLayerRef ObjLayer) {
+void LLVMOrcDisposeObjectLayer(LLVMOrcObjectLayerRef ObjLayer) {
   delete unwrap(ObjLayer);
 }
 

From d28af7c654d8db0b68c175db5ce212d74fb5e9bc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lubo=C5=A1=20Lu=C5=88=C3=A1k?= <l.lunak@centrum.cz>
Date: Tue, 6 Apr 2021 18:38:18 +0200
Subject: [PATCH 205/318] remove -fpch-codegen and -fpch-debuginfo from Clang
 12.0 release notes

These were new in 11.0. The commit adding the options landed after
11.x branch had already been branched off from master, and only
then backported to 11.x, so the release notes change stayed for 12.0.
---
 clang/docs/ReleaseNotes.rst | 25 -------------------------
 1 file changed, 25 deletions(-)

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index f3499d167361..4cc1b0b9d2cf 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -82,31 +82,6 @@ New Compiler Flags
   in that case. The option's behaviour mirrors GCC, the helpers are implemented
   both in compiler-rt and libgcc.
 
-- -fpch-codegen and -fpch-debuginfo generate shared code and/or debuginfo
-  for contents of a precompiled header in a separate object file. This object
-  file needs to be linked in, but its contents do not need to be generated
-  for other objects using the precompiled header. This should usually save
-  compile time. If not using clang-cl, the separate object file needs to
-  be created explicitly from the precompiled header.
-  Example of use:
-
-  .. code-block:: console
-
-    $ clang++ -x c++-header header.h -o header.pch -fpch-codegen -fpch-debuginfo
-    $ clang++ -c header.pch -o shared.o
-    $ clang++ -c source.cpp -o source.o -include-pch header.pch
-    $ clang++ -o binary source.o shared.o
-
-  - Using -fpch-instantiate-templates when generating the precompiled header
-    usually increases the amount of code/debuginfo that can be shared.
-  - In some cases, especially when building with optimizations enabled, using
-    -fpch-codegen may generate so much code in the shared object that compiling
-    it may be a net loss in build time.
-  - Since headers may bring in private symbols of other libraries, it may be
-    sometimes necessary to discard unused symbols (such as by adding
-    -Wl,--gc-sections on ELF platforms to the linking command, and possibly
-    adding -fdata-sections -ffunction-sections to the command generating
-    the shared object).
 - New option ``-fbinutils-version=`` specifies the targeted binutils version.
   For example, ``-fbinutils-version=2.35`` means compatibility with GNU as/ld
   before 2.35 is not needed: new features can be used and there is no need to

From fa0971b87fb2c9d14d1bba2551e61f02f18f329b Mon Sep 17 00:00:00 2001
From: Tim Northover <t.p.northover@gmail.com>
Date: Mon, 1 Feb 2021 12:43:33 +0000
Subject: [PATCH 206/318] GlobalISel: check type size before getZExtValue()ing
 it.

Otherwise getZExtValue() asserts.

(cherry picked from commit c2b322fc19e829162ed4c7dcd04d9e9b2cd4e66c)
---
 llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp  |  5 ++---
 .../CodeGen/AArch64/GlobalISel/huge-switch.ll | 22 +++++++++++++++++++
 2 files changed, 24 insertions(+), 3 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/GlobalISel/huge-switch.ll

diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
index b97c369b832d..b7883cbc3120 100644
--- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
@@ -840,9 +840,8 @@ void IRTranslator::emitSwitchCase(SwitchCG::CaseBlock &CB,
     // For conditional branch lowering, we might try to do something silly like
     // emit an G_ICMP to compare an existing G_ICMP i1 result with true. If so,
     // just re-use the existing condition vreg.
-    if (CI && CI->getZExtValue() == 1 &&
-        MRI->getType(CondLHS).getSizeInBits() == 1 &&
-        CB.PredInfo.Pred == CmpInst::ICMP_EQ) {
+    if (MRI->getType(CondLHS).getSizeInBits() == 1 && CI &&
+        CI->getZExtValue() == 1 && CB.PredInfo.Pred == CmpInst::ICMP_EQ) {
       Cond = CondLHS;
     } else {
       Register CondRHS = getOrCreateVReg(*CB.CmpRHS);
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/huge-switch.ll b/llvm/test/CodeGen/AArch64/GlobalISel/huge-switch.ll
new file mode 100644
index 000000000000..8742a848c4af
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/huge-switch.ll
@@ -0,0 +1,22 @@
+; RUN: llc -mtriple=arm64-apple-ios %s -o - -O0 -global-isel=1 | FileCheck %s
+define void @foo(i512 %in) {
+; CHECK-LABEL: foo:
+; CHECK: cbz
+  switch i512 %in, label %default [
+    i512 3923188584616675477397368389504791510063972152790021570560, label %l1
+    i512 3923188584616675477397368389504791510063972152790021570561, label %l2
+    i512 3923188584616675477397368389504791510063972152790021570562, label %l3
+  ]
+
+default:
+  ret void
+
+l1:
+  ret void
+
+l2:
+  ret void
+
+l3:
+  ret void
+}

From 757752f568db698e3c0c35065c008489f2319a7b Mon Sep 17 00:00:00 2001
From: Tom Stellard <tstellar@redhat.com>
Date: Thu, 22 Apr 2021 15:03:36 -0700
Subject: [PATCH 207/318] Bump version to 12.0.1

---
 libcxx/CMakeLists.txt                     | 2 +-
 libcxxabi/CMakeLists.txt                  | 2 +-
 libunwind/CMakeLists.txt                  | 2 +-
 llvm/CMakeLists.txt                       | 2 +-
 llvm/utils/gn/secondary/llvm/version.gni  | 2 +-
 llvm/utils/lit/lit/__init__.py            | 2 +-
 llvm/utils/release/build_llvm_package.bat | 4 ++--
 7 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/libcxx/CMakeLists.txt b/libcxx/CMakeLists.txt
index 9bf1a02f0908..cdd5495e36ab 100644
--- a/libcxx/CMakeLists.txt
+++ b/libcxx/CMakeLists.txt
@@ -29,7 +29,7 @@ if (CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR OR LIBCXX_STANDALONE_BUIL
   project(libcxx CXX C)
 
   set(PACKAGE_NAME libcxx)
-  set(PACKAGE_VERSION 12.0.0)
+  set(PACKAGE_VERSION 12.0.1)
   set(PACKAGE_STRING "${PACKAGE_NAME} ${PACKAGE_VERSION}")
   set(PACKAGE_BUGREPORT "llvm-bugs@lists.llvm.org")
 
diff --git a/libcxxabi/CMakeLists.txt b/libcxxabi/CMakeLists.txt
index 426c855288fc..6de2b5a2ed10 100644
--- a/libcxxabi/CMakeLists.txt
+++ b/libcxxabi/CMakeLists.txt
@@ -28,7 +28,7 @@ if (CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR OR LIBCXXABI_STANDALONE_B
   project(libcxxabi CXX C)
 
   set(PACKAGE_NAME libcxxabi)
-  set(PACKAGE_VERSION 11.0.0)
+  set(PACKAGE_VERSION 12.0.1)
   set(PACKAGE_STRING "${PACKAGE_NAME} ${PACKAGE_VERSION}")
   set(PACKAGE_BUGREPORT "llvm-bugs@lists.llvm.org")
 
diff --git a/libunwind/CMakeLists.txt b/libunwind/CMakeLists.txt
index 48cb8e004e08..570b8db90653 100644
--- a/libunwind/CMakeLists.txt
+++ b/libunwind/CMakeLists.txt
@@ -24,7 +24,7 @@ if (CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR OR LIBUNWIND_STANDALONE_B
   project(libunwind LANGUAGES C CXX ASM)
 
   set(PACKAGE_NAME libunwind)
-  set(PACKAGE_VERSION 12.0.0)
+  set(PACKAGE_VERSION 12.0.1)
   set(PACKAGE_STRING "${PACKAGE_NAME} ${PACKAGE_VERSION}")
   set(PACKAGE_BUGREPORT "llvm-bugs@lists.llvm.org")
 
diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt
index 277d0fe54d7b..28ccef34d8fc 100644
--- a/llvm/CMakeLists.txt
+++ b/llvm/CMakeLists.txt
@@ -11,7 +11,7 @@ if(NOT DEFINED LLVM_VERSION_MINOR)
   set(LLVM_VERSION_MINOR 0)
 endif()
 if(NOT DEFINED LLVM_VERSION_PATCH)
-  set(LLVM_VERSION_PATCH 0)
+  set(LLVM_VERSION_PATCH 1)
 endif()
 if(NOT DEFINED LLVM_VERSION_SUFFIX)
   set(LLVM_VERSION_SUFFIX "")
diff --git a/llvm/utils/gn/secondary/llvm/version.gni b/llvm/utils/gn/secondary/llvm/version.gni
index a66a92550a00..942974f7cf91 100644
--- a/llvm/utils/gn/secondary/llvm/version.gni
+++ b/llvm/utils/gn/secondary/llvm/version.gni
@@ -1,4 +1,4 @@
 llvm_version_major = 12
 llvm_version_minor = 0
-llvm_version_patch = 0
+llvm_version_patch = 1
 llvm_version = "$llvm_version_major.$llvm_version_minor.$llvm_version_patch"
diff --git a/llvm/utils/lit/lit/__init__.py b/llvm/utils/lit/lit/__init__.py
index 9c2aa512e179..c281391466b8 100644
--- a/llvm/utils/lit/lit/__init__.py
+++ b/llvm/utils/lit/lit/__init__.py
@@ -2,7 +2,7 @@
 
 __author__ = 'Daniel Dunbar'
 __email__ = 'daniel@minormatter.com'
-__versioninfo__ = (12, 0, 0)
+__versioninfo__ = (12, 0, 1)
 __version__ = '.'.join(str(v) for v in __versioninfo__) + 'dev'
 
 __all__ = []
diff --git a/llvm/utils/release/build_llvm_package.bat b/llvm/utils/release/build_llvm_package.bat
index 35dcd9f613c4..4747c70ca392 100755
--- a/llvm/utils/release/build_llvm_package.bat
+++ b/llvm/utils/release/build_llvm_package.bat
@@ -27,8 +27,8 @@ set python64_dir=C:\Users\%USERNAME%\AppData\Local\Programs\Python\Python36
 for /f "usebackq" %%i in (`PowerShell ^(Get-Date^).ToString^('yyyyMMdd'^)`) do set datestamp=%%i
 
 set revision=%1
-set package_version=12.0.0-%revision:~0,8%
-set clang_format_vs_version=12.0.0.%datestamp%
+set package_version=12.0.1-%revision:~0,8%
+set clang_format_vs_version=12.0.1.%datestamp%
 set build_dir=llvm_package_%revision:~0,8%
 
 echo Revision: %revision%

From eae7f3e3d45077a509a37bb2f2ff36b8196a855e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20Storsj=C3=B6?= <martin@martin.st>
Date: Thu, 18 Mar 2021 14:08:10 +0200
Subject: [PATCH 208/318] [lit] Pass the USERPROFILE variable through on
 Windows

When running in a Windows Container, the Git for Windows Unix tools
(C:\Program Files\Git\usr\bin) just hang if this variable isn't
passed through.

Currently, running the LLVM/clang tests in a Windows Container fails
if that directory is added to the path, but succeeds after this change.
(After this change, the previously used GnuWin tools can be left out
entirely, too, as lit automatically picks up the Git for Windows tools
if necessary.)

Differential Revision: https://reviews.llvm.org/D98858

(cherry picked from commit 9de63b2e051cb3e79645cc20b83b4d33d132cba0)
---
 llvm/utils/lit/lit/TestingConfig.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llvm/utils/lit/lit/TestingConfig.py b/llvm/utils/lit/lit/TestingConfig.py
index 38d05066a2b0..e6c1b937c27a 100644
--- a/llvm/utils/lit/lit/TestingConfig.py
+++ b/llvm/utils/lit/lit/TestingConfig.py
@@ -33,6 +33,7 @@ def fromdefaults(litConfig):
             pass_vars.append('INCLUDE')
             pass_vars.append('LIB')
             pass_vars.append('PATHEXT')
+            pass_vars.append('USERPROFILE')
             environment['PYTHONBUFFERED'] = '1'
 
         for var in pass_vars:

From 072c90a863aac1334a4950b3da262a025516dea0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20Storsj=C3=B6?= <martin@martin.st>
Date: Wed, 24 Mar 2021 23:58:54 +0200
Subject: [PATCH 209/318] [LLD] Fix probing a MSYS based 'tar' in a Windows
 Container

Don't run the 'tar' tool in a cleared environment with only the
LANG variable set, just set LANG on top of the existing environment.

If the 'tar' tool is an MSYS based tool, running it in a Windows
Container hangs if all environment variables are cleared - in
particular, the USERPROFILE variable needs to be kept intact.

This is the same issue fixed as was fixed in other places in
9de63b2e051cb3e79645cc20b83b4d33d132cba0, but contrary to running
the actual tests, running with an as-cleared-as-possible environment
here is less important.

Differential Revision: https://reviews.llvm.org/D99304

(cherry picked from commit a88556733a4dced22416bd3f45255128b9eb4f49)
---
 lld/test/lit.cfg.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/lld/test/lit.cfg.py b/lld/test/lit.cfg.py
index 8e31fd3977f9..670f41f0b631 100644
--- a/lld/test/lit.cfg.py
+++ b/lld/test/lit.cfg.py
@@ -101,11 +101,13 @@
 
 tar_executable = lit.util.which('tar', config.environment['PATH'])
 if tar_executable:
+    env = os.environ
+    env['LANG'] = 'C'
     tar_version = subprocess.Popen(
         [tar_executable, '--version'],
         stdout=subprocess.PIPE,
         stderr=subprocess.PIPE,
-        env={'LANG': 'C'})
+        env=env)
     sout, _ = tar_version.communicate()
     if 'GNU tar' in sout.decode():
         config.available_features.add('gnutar')

From 25dd67ef882c327f9b6a3082cab6c33c9ff52d42 Mon Sep 17 00:00:00 2001
From: Tom Stellard <tstellar@redhat.com>
Date: Thu, 29 Apr 2021 23:26:25 -0700
Subject: [PATCH 210/318] [X86][AVX] foldShuffleOfHorizOp - don't attempt to
 handle 256-bit X86ISD::VBROADCAST (PR49971)

NOTE: This is for the 12.x release branch ONLY

Minimal patch to avoid the issue encountered in PR49971 (it's already been dealt with in trunk through a larger refactor that can't be easily merged).

Bail for non-128-bit vector broadcasts of (F)HADD/SUB ops - the existing logic doesn't correctly deal with the fact that the broadcast will splat across the 128-bit lanes.

Reviewed By: spatel, wristow

Differential Revision: https://reviews.llvm.org/D101104
---
 llvm/lib/Target/X86/X86ISelLowering.cpp       |  2 ++
 llvm/test/CodeGen/X86/horizontal-shuffle-3.ll | 11 +++++++++++
 2 files changed, 13 insertions(+)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 6b816c710f98..1e2407c7e7f6 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -37889,6 +37889,8 @@ static SDValue foldShuffleOfHorizOp(SDNode *N, SelectionDAG &DAG) {
   // replicating low and high halves (and without changing the type/length of
   // the vector), we don't need the shuffle.
   if (Opcode == X86ISD::MOVDDUP || Opcode == X86ISD::VBROADCAST) {
+    if (Opcode == X86ISD::VBROADCAST && !VT.is128BitVector())
+      return SDValue();
     if (HOp.getScalarValueSizeInBits() == 64 && HOp.getValueType() == VT) {
       // movddup (hadd X, X) --> hadd X, X
       // broadcast (extract_vec_elt (hadd X, X), 0) --> hadd X, X
diff --git a/llvm/test/CodeGen/X86/horizontal-shuffle-3.ll b/llvm/test/CodeGen/X86/horizontal-shuffle-3.ll
index 424ecf352e97..297070ad2bb6 100644
--- a/llvm/test/CodeGen/X86/horizontal-shuffle-3.ll
+++ b/llvm/test/CodeGen/X86/horizontal-shuffle-3.ll
@@ -98,6 +98,17 @@ define <8 x i32> @test_unpackhi_hsub_v8i32(<8 x i32> %0, <8 x i32> %1, <8 x i32>
   ret <8 x i32> %7
 }
 
+define <4 x double> @PR49971(<4 x double> %0) {
+; CHECK-LABEL: PR49971:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vbroadcastsd %xmm0, %ymm0
+; CHECK-NEXT:    ret{{[l|q]}}
+  %2 = tail call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %0, <4 x double> %0)
+  %3 = shufflevector <4 x double> %2, <4 x double> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+  ret <4 x double> %3
+}
+
 declare <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float>, <8 x float>)
 declare <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float>, <8 x float>)
 declare <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double>, <4 x double>)

From 0cbbf06b625605fff83d89b17c2187c7ccfcecd5 Mon Sep 17 00:00:00 2001
From: Kadir Cetinkaya <kadircet@google.com>
Date: Thu, 8 Apr 2021 15:37:32 +0200
Subject: [PATCH 211/318] [clangd] Log a message when gRPC support is off, but
 remote-index is configured

Before this change clangd would emit a diagnostic whenever remote-index
was configured but binary didn't have grpc support.

This can be annoying when projects are configuring remote-index through their
configs but developers have a clangd binary without the support.

Differential Revision: https://reviews.llvm.org/D100103

(cherry picked from commit b9b708eef8cb7bcb073361283cd573beb04992a9)
---
 clang-tools-extra/clangd/ConfigCompile.cpp           |  5 +++--
 .../clangd/unittests/ConfigCompileTests.cpp          | 12 ++++++------
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/clang-tools-extra/clangd/ConfigCompile.cpp b/clang-tools-extra/clangd/ConfigCompile.cpp
index dadc578c3a81..b4f0d6186886 100644
--- a/clang-tools-extra/clangd/ConfigCompile.cpp
+++ b/clang-tools-extra/clangd/ConfigCompile.cpp
@@ -321,8 +321,9 @@ struct FragmentCompiler {
                llvm::SMRange BlockRange) {
 #ifndef CLANGD_ENABLE_REMOTE
     if (External.Server) {
-      diag(Error, "Clangd isn't compiled with remote index support, ignoring "
-                  "Server." External.Server->Range);
+      elog("Clangd isn't compiled with remote index support, ignoring Server: "
+           "{0}",
+           *External.Server);
       External.Server.reset();
     }
 #endif
diff --git a/clang-tools-extra/clangd/unittests/ConfigCompileTests.cpp b/clang-tools-extra/clangd/unittests/ConfigCompileTests.cpp
index d9aa171f3102..4961d3474fd9 100644
--- a/clang-tools-extra/clangd/unittests/ConfigCompileTests.cpp
+++ b/clang-tools-extra/clangd/unittests/ConfigCompileTests.cpp
@@ -324,15 +324,15 @@ TEST_F(ConfigCompileTests, ExternalBlockWarnOnMultipleSource) {
   External.Server.emplace("");
   Frag.Index.External = std::move(External);
   compileAndApply();
-  llvm::StringLiteral ExpectedDiag =
 #ifdef CLANGD_ENABLE_REMOTE
-      "Exactly one of File or Server must be set.";
+  EXPECT_THAT(
+      Diags.Diagnostics,
+      Contains(AllOf(DiagMessage("Exactly one of File or Server must be set."),
+                     DiagKind(llvm::SourceMgr::DK_Error))));
 #else
-      "Clangd isn't compiled with remote index support, ignoring Server.";
+  ASSERT_TRUE(Conf.Index.External.hasValue());
+  EXPECT_EQ(Conf.Index.External->Kind, Config::ExternalIndexSpec::File);
 #endif
-  EXPECT_THAT(Diags.Diagnostics,
-              Contains(AllOf(DiagMessage(ExpectedDiag),
-                             DiagKind(llvm::SourceMgr::DK_Error))));
 }
 
 TEST_F(ConfigCompileTests, ExternalBlockErrOnNoSource) {

From 907a751a38fff8d05b288ab52b19ba4e2cc1fc38 Mon Sep 17 00:00:00 2001
From: Roman Lebedev <lebedev.ri@gmail.com>
Date: Sun, 4 Apr 2021 23:15:29 +0300
Subject: [PATCH 212/318] [NFC][InstCombine] Add test for PR49778

(cherry picked from commit 5352490ce613f1bdedaf478765b089b1f0a8be0d)
---
 .../redundant-left-shift-input-masking-pr49778.ll | 15 +++++++++++++++
 1 file changed, 15 insertions(+)
 create mode 100644 llvm/test/Transforms/InstCombine/redundant-left-shift-input-masking-pr49778.ll

diff --git a/llvm/test/Transforms/InstCombine/redundant-left-shift-input-masking-pr49778.ll b/llvm/test/Transforms/InstCombine/redundant-left-shift-input-masking-pr49778.ll
new file mode 100644
index 000000000000..4865afa56a03
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/redundant-left-shift-input-masking-pr49778.ll
@@ -0,0 +1,15 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+; PR49778: this should not be folded to 0.
+define i32 @src(i1 %x2) {
+; CHECK-LABEL: @src(
+; CHECK-NEXT:    ret i32 0
+;
+  %x13 = zext i1 %x2 to i32
+  %_7 = shl i32 4294967295, %x13
+  %mask = xor i32 %_7, 4294967295
+  %_8 = and i32 %mask, %x13
+  %_9 = shl i32 %_8, %x13
+  ret i32 %_9
+}

From 4a4b1c75a1ea3f1ca90ef45470c42debb81ffc90 Mon Sep 17 00:00:00 2001
From: Roman Lebedev <lebedev.ri@gmail.com>
Date: Sun, 4 Apr 2021 23:23:10 +0300
Subject: [PATCH 213/318] [NFC][InstCombine] Extract
 canTryToConstantAddTwoShiftAmounts() as helper

(cherry picked from commit dceb3e599668496420d41b993100d23eeb7c0ada)
---
 .../InstCombine/InstCombineShifts.cpp         | 46 +++++++++++--------
 1 file changed, 26 insertions(+), 20 deletions(-)

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp
index 7295369365c4..52f064e17820 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp
@@ -21,6 +21,30 @@ using namespace PatternMatch;
 
 #define DEBUG_TYPE "instcombine"
 
+bool canTryToConstantAddTwoShiftAmounts(Value *Sh0, Value *ShAmt0, Value *Sh1,
+                                        Value *ShAmt1) {
+  // We have two shift amounts from two different shifts. The types of those
+  // shift amounts may not match. If that's the case let's bailout now..
+  if (ShAmt0->getType() != ShAmt1->getType())
+    return false;
+
+  // As input, we have the following pattern:
+  //   Sh0 (Sh1 X, Q), K
+  // We want to rewrite that as:
+  //   Sh x, (Q+K)  iff (Q+K) u< bitwidth(x)
+  // While we know that originally (Q+K) would not overflow
+  // (because  2 * (N-1) u<= iN -1), we have looked past extensions of
+  // shift amounts. so it may now overflow in smaller bitwidth.
+  // To ensure that does not happen, we need to ensure that the total maximal
+  // shift amount is still representable in that smaller bit width.
+  unsigned MaximalPossibleTotalShiftAmount =
+      (Sh0->getType()->getScalarSizeInBits() - 1) +
+      (Sh1->getType()->getScalarSizeInBits() - 1);
+  APInt MaximalRepresentableShiftAmount =
+      APInt::getAllOnesValue(ShAmt0->getType()->getScalarSizeInBits());
+  return MaximalRepresentableShiftAmount.uge(MaximalPossibleTotalShiftAmount);
+}
+
 // Given pattern:
 //   (x shiftopcode Q) shiftopcode K
 // we should rewrite it as
@@ -57,26 +81,8 @@ Value *InstCombinerImpl::reassociateShiftAmtsOfTwoSameDirectionShifts(
   if (!match(Sh1, m_Shift(m_Value(X), m_ZExtOrSelf(m_Value(ShAmt1)))))
     return nullptr;
 
-  // We have two shift amounts from two different shifts. The types of those
-  // shift amounts may not match. If that's the case let's bailout now..
-  if (ShAmt0->getType() != ShAmt1->getType())
-    return nullptr;
-
-  // As input, we have the following pattern:
-  //   Sh0 (Sh1 X, Q), K
-  // We want to rewrite that as:
-  //   Sh x, (Q+K)  iff (Q+K) u< bitwidth(x)
-  // While we know that originally (Q+K) would not overflow
-  // (because  2 * (N-1) u<= iN -1), we have looked past extensions of
-  // shift amounts. so it may now overflow in smaller bitwidth.
-  // To ensure that does not happen, we need to ensure that the total maximal
-  // shift amount is still representable in that smaller bit width.
-  unsigned MaximalPossibleTotalShiftAmount =
-      (Sh0->getType()->getScalarSizeInBits() - 1) +
-      (Sh1->getType()->getScalarSizeInBits() - 1);
-  APInt MaximalRepresentableShiftAmount =
-      APInt::getAllOnesValue(ShAmt0->getType()->getScalarSizeInBits());
-  if (MaximalRepresentableShiftAmount.ult(MaximalPossibleTotalShiftAmount))
+  // Verify that it would be safe to try to add those two shift amounts.
+  if (!canTryToConstantAddTwoShiftAmounts(Sh0, ShAmt0, Sh1, ShAmt1))
     return nullptr;
 
   // We are only looking for signbit extraction if we have two right shifts.

From c27ad80507bfea35da07681fd4ec9972ca698015 Mon Sep 17 00:00:00 2001
From: Roman Lebedev <lebedev.ri@gmail.com>
Date: Sun, 4 Apr 2021 23:25:29 +0300
Subject: [PATCH 214/318] [InstCombine] dropRedundantMaskingOfLeftShiftInput():
 check that adding shift amounts doesn't overflow (PR49778)

This is identical to 781d077afb0ed9771c513d064c40170c1ccd21c9,
but for the other function.

For certain shift amount bit widths, we must first ensure that adding
shift amounts is safe, that the sum won't have an unsigned overflow.

Fixes https://bugs.llvm.org/show_bug.cgi?id=49778

(cherry picked from commit 2760a808b9916a2839513b7fd7314a464f52481e)
---
 .../lib/Transforms/InstCombine/InstCombineShifts.cpp | 12 ++++++------
 .../redundant-left-shift-input-masking-pr49778.ll    |  7 ++++++-
 2 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp
index 52f064e17820..127bf8080959 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp
@@ -226,9 +226,9 @@ dropRedundantMaskingOfLeftShiftInput(BinaryOperator *OuterShift,
     // Peek through an optional zext of the shift amount.
     match(MaskShAmt, m_ZExtOrSelf(m_Value(MaskShAmt)));
 
-    // We have two shift amounts from two different shifts. The types of those
-    // shift amounts may not match. If that's the case let's bailout now.
-    if (MaskShAmt->getType() != ShiftShAmt->getType())
+    // Verify that it would be safe to try to add those two shift amounts.
+    if (!canTryToConstantAddTwoShiftAmounts(OuterShift, ShiftShAmt, Masked,
+                                            MaskShAmt))
       return nullptr;
 
     // Can we simplify (MaskShAmt+ShiftShAmt) ?
@@ -258,9 +258,9 @@ dropRedundantMaskingOfLeftShiftInput(BinaryOperator *OuterShift,
     // Peek through an optional zext of the shift amount.
     match(MaskShAmt, m_ZExtOrSelf(m_Value(MaskShAmt)));
 
-    // We have two shift amounts from two different shifts. The types of those
-    // shift amounts may not match. If that's the case let's bailout now.
-    if (MaskShAmt->getType() != ShiftShAmt->getType())
+    // Verify that it would be safe to try to add those two shift amounts.
+    if (!canTryToConstantAddTwoShiftAmounts(OuterShift, ShiftShAmt, Masked,
+                                            MaskShAmt))
       return nullptr;
 
     // Can we simplify (ShiftShAmt-MaskShAmt) ?
diff --git a/llvm/test/Transforms/InstCombine/redundant-left-shift-input-masking-pr49778.ll b/llvm/test/Transforms/InstCombine/redundant-left-shift-input-masking-pr49778.ll
index 4865afa56a03..8d70733ed03d 100644
--- a/llvm/test/Transforms/InstCombine/redundant-left-shift-input-masking-pr49778.ll
+++ b/llvm/test/Transforms/InstCombine/redundant-left-shift-input-masking-pr49778.ll
@@ -4,7 +4,12 @@
 ; PR49778: this should not be folded to 0.
 define i32 @src(i1 %x2) {
 ; CHECK-LABEL: @src(
-; CHECK-NEXT:    ret i32 0
+; CHECK-NEXT:    [[X13:%.*]] = zext i1 [[X2:%.*]] to i32
+; CHECK-NEXT:    [[_7:%.*]] = shl i32 -1, [[X13]]
+; CHECK-NEXT:    [[MASK:%.*]] = xor i32 [[_7]], -1
+; CHECK-NEXT:    [[_8:%.*]] = and i32 [[MASK]], [[X13]]
+; CHECK-NEXT:    [[_9:%.*]] = shl i32 [[_8]], [[X13]]
+; CHECK-NEXT:    ret i32 [[_9]]
 ;
   %x13 = zext i1 %x2 to i32
   %_7 = shl i32 4294967295, %x13

From 3568d61f11e2eb0017c7b65707bee7bf4111c8ca Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Wed, 10 Feb 2021 12:17:37 -0800
Subject: [PATCH 215/318] BPF: Implement TTI.IntImmCost() properly

This patch implemented TTI.IntImmCost() properly.
Each BPF insn has 32bit immediate space, so for any immediate
which can be represented as 32bit signed int, the cost
is technically free. If an int cannot be presented as
a 32bit signed int, a ld_imm64 instruction is needed
and a TCC_Basic is returned.

This change is motivated when we observed that
several bpf selftests failed with latest llvm trunk, e.g.,
  #10/16 strobemeta.o:FAIL
  #10/17 strobemeta_nounroll1.o:FAIL
  #10/18 strobemeta_nounroll2.o:FAIL
  #10/19 strobemeta_subprogs.o:FAIL
  #96 snprintf_btf:FAIL

The reason of the failure is due to that
SpeculateAroundPHIsPass did aggressive transformation
which alters control flow for which currently verifer
cannot handle well. In llvm12, SpeculateAroundPHIsPass
is not called.

SpeculateAroundPHIsPass relied on TTI.getIntImmCost()
and TTI.getIntImmCostInst() for profitability
analysis. This patch implemented TTI.getIntImmCost()
properly for BPF backend which also prevented
transformation which caused the above test failures.

Differential Revision: https://reviews.llvm.org/D96448

(cherry picked from commit a260ae716030d5d2644a2af649501277d326bb21)
---
 llvm/lib/Target/BPF/BPFTargetMachine.cpp     |  6 +++
 llvm/lib/Target/BPF/BPFTargetMachine.h       |  2 +
 llvm/lib/Target/BPF/BPFTargetTransformInfo.h | 49 ++++++++++++++++++++
 3 files changed, 57 insertions(+)
 create mode 100644 llvm/lib/Target/BPF/BPFTargetTransformInfo.h

diff --git a/llvm/lib/Target/BPF/BPFTargetMachine.cpp b/llvm/lib/Target/BPF/BPFTargetMachine.cpp
index c0244b9f2c74..a8fef2517b03 100644
--- a/llvm/lib/Target/BPF/BPFTargetMachine.cpp
+++ b/llvm/lib/Target/BPF/BPFTargetMachine.cpp
@@ -12,6 +12,7 @@
 
 #include "BPFTargetMachine.h"
 #include "BPF.h"
+#include "BPFTargetTransformInfo.h"
 #include "MCTargetDesc/BPFMCAsmInfo.h"
 #include "TargetInfo/BPFTargetInfo.h"
 #include "llvm/CodeGen/Passes.h"
@@ -145,6 +146,11 @@ void BPFPassConfig::addIRPasses() {
   TargetPassConfig::addIRPasses();
 }
 
+TargetTransformInfo
+BPFTargetMachine::getTargetTransformInfo(const Function &F) {
+  return TargetTransformInfo(BPFTTIImpl(this, F));
+}
+
 // Install an instruction selector pass using
 // the ISelDag to gen BPF code.
 bool BPFPassConfig::addInstSelector() {
diff --git a/llvm/lib/Target/BPF/BPFTargetMachine.h b/llvm/lib/Target/BPF/BPFTargetMachine.h
index 5243a15eb7b0..61c8a44cc402 100644
--- a/llvm/lib/Target/BPF/BPFTargetMachine.h
+++ b/llvm/lib/Target/BPF/BPFTargetMachine.h
@@ -34,6 +34,8 @@ class BPFTargetMachine : public LLVMTargetMachine {
 
   TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
 
+  TargetTransformInfo getTargetTransformInfo(const Function &F) override;
+
   TargetLoweringObjectFile *getObjFileLowering() const override {
     return TLOF.get();
   }
diff --git a/llvm/lib/Target/BPF/BPFTargetTransformInfo.h b/llvm/lib/Target/BPF/BPFTargetTransformInfo.h
new file mode 100644
index 000000000000..622da9a0a3f7
--- /dev/null
+++ b/llvm/lib/Target/BPF/BPFTargetTransformInfo.h
@@ -0,0 +1,49 @@
+//===------ BPFTargetTransformInfo.h - BPF specific TTI ---------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file uses the target's specific information to
+// provide more precise answers to certain TTI queries, while letting the
+// target independent and default TTI implementations handle the rest.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_BPF_BPFTARGETTRANSFORMINFO_H
+#define LLVM_LIB_TARGET_BPF_BPFTARGETTRANSFORMINFO_H
+
+#include "BPFTargetMachine.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/BasicTTIImpl.h"
+
+namespace llvm {
+class BPFTTIImpl : public BasicTTIImplBase<BPFTTIImpl> {
+  typedef BasicTTIImplBase<BPFTTIImpl> BaseT;
+  typedef TargetTransformInfo TTI;
+  friend BaseT;
+
+  const BPFSubtarget *ST;
+  const BPFTargetLowering *TLI;
+
+  const BPFSubtarget *getST() const { return ST; }
+  const BPFTargetLowering *getTLI() const { return TLI; }
+
+public:
+  explicit BPFTTIImpl(const BPFTargetMachine *TM, const Function &F)
+      : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)),
+        TLI(ST->getTargetLowering()) {}
+
+  int getIntImmCost(const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind) {
+    if (Imm.getBitWidth() <= 64 && isInt<32>(Imm.getSExtValue()))
+      return TTI::TCC_Free;
+
+    return TTI::TCC_Basic;
+  }
+};
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_BPF_BPFTARGETTRANSFORMINFO_H

From f9efff398c1159b15964b166368b232f562e6cfc Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Thu, 11 Feb 2021 10:00:32 -0800
Subject: [PATCH 216/318] BPF: Add LLVMAnalysis in CMakefile LINK_COMPONENTS

buildbot reported a build error like below:
  BPFTargetMachine.cpp:(.text._ZN4llvm19TargetTransformInfo5ModelINS_10BPFTTIImplEED2Ev
    [_ZN4llvm19TargetTransformInfo5ModelINS_10BPFTTIImplEED2Ev]+0x14):
    undefined reference to `llvm::TargetTransformInfo::Concept::~Concept()'
  lib/Target/BPF/CMakeFiles/LLVMBPFCodeGen.dir/BPFTargetMachine.cpp.o:
    In function `llvm::TargetTransformInfo::Model<llvm::BPFTTIImpl>::~Model()':

Commit a260ae716030 ("BPF: Implement TTI.IntImmCost() properly")
added TargetTransformInfo to BPF, which requires LLVMAnalysis
dependence. In certain cmake configurations, lacking explicit
LLVMAnalysis dependency may cause compilation error.
Similar to other targets, this patch added LLVMAnalysis
in CMakefile LINK_COMPONENTS explicitly.

(cherry picked from commit 74975d35b47631da0c7911561f16d3ffd1af142a)
---
 llvm/lib/Target/BPF/CMakeLists.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llvm/lib/Target/BPF/CMakeLists.txt b/llvm/lib/Target/BPF/CMakeLists.txt
index 24c6c5e1255e..189a3a84c3df 100644
--- a/llvm/lib/Target/BPF/CMakeLists.txt
+++ b/llvm/lib/Target/BPF/CMakeLists.txt
@@ -35,6 +35,7 @@ add_llvm_target(BPFCodeGen
   BTFDebug.cpp
 
   LINK_COMPONENTS
+  Analysis
   AsmPrinter
   CodeGen
   Core

From 2460947eefc2176693a4aa4d05cd9733e38c7ffe Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Wed, 24 Feb 2021 10:02:48 -0800
Subject: [PATCH 217/318] BPF: Implement TTI.getCmpSelInstrCost() properly

The Select insn in BPF is expensive as BPF backend
needs to resolve with conditionals.  This patch set
the getCmpSelInstrCost() to SCEVCheapExpansionBudget
for Select insn to prevent some Select insn related
optimizations.

This change is motivated during bcc code review for
   https://github.com/iovisor/bcc/pull/3270
where IndVarSimplifyPass eventually caused generating
the following asm code:
  ;       for (i = 0; (i < VIRTIO_MAX_SGS) && (i < num); i++) {
      14:       16 05 40 00 00 00 00 00 if w5 == 0 goto +64 <LBB0_6>
      15:       bc 51 00 00 00 00 00 00 w1 = w5
      16:       04 01 00 00 ff ff ff ff w1 += -1
      17:       67 05 00 00 20 00 00 00 r5 <<= 32
      18:       77 05 00 00 20 00 00 00 r5 >>= 32
      19:       a6 01 01 00 05 00 00 00 if w1 < 5 goto +1 <LBB0_4>
      20:       b7 05 00 00 06 00 00 00 r5 = 6
  00000000000000a8 <LBB0_4>:
      21:       b7 02 00 00 00 00 00 00 r2 = 0
      22:       b7 01 00 00 00 00 00 00 r1 = 0
  ;       for (i = 0; (i < VIRTIO_MAX_SGS) && (i < num); i++) {
      23:       7b 1a e0 ff 00 00 00 00 *(u64 *)(r10 - 32) = r1
      24:       7b 5a c0 ff 00 00 00 00 *(u64 *)(r10 - 64) = r5
Note that insn #15 has w1 = w5 and w1 is refined later but r5(w5) is
eventually saved on stack at insn #24 for later use. This cause
later verifier failures.

With this change, IndVarSimplifyPass won't do the above
transformation any more.

Differential Revision: https://reviews.llvm.org/D97479

(cherry picked from commit 1959ead525b8830cc8a345f45e1c3ef9902d3229)
---
 llvm/lib/Target/BPF/BPFTargetTransformInfo.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/llvm/lib/Target/BPF/BPFTargetTransformInfo.h b/llvm/lib/Target/BPF/BPFTargetTransformInfo.h
index 622da9a0a3f7..62055497e685 100644
--- a/llvm/lib/Target/BPF/BPFTargetTransformInfo.h
+++ b/llvm/lib/Target/BPF/BPFTargetTransformInfo.h
@@ -18,6 +18,7 @@
 #include "BPFTargetMachine.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/CodeGen/BasicTTIImpl.h"
+#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
 
 namespace llvm {
 class BPFTTIImpl : public BasicTTIImplBase<BPFTTIImpl> {
@@ -42,6 +43,17 @@ class BPFTTIImpl : public BasicTTIImplBase<BPFTTIImpl> {
 
     return TTI::TCC_Basic;
   }
+
+  int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
+                         CmpInst::Predicate VecPred,
+                         TTI::TargetCostKind CostKind,
+                         const llvm::Instruction *I = nullptr) {
+    if (Opcode == Instruction::Select)
+      return SCEVCheapExpansionBudget;
+
+    return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
+                                     I);
+  }
 };
 
 } // end namespace llvm

From 6fe7c3728d1e98e05c67ceb03f429cb04a30e151 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Thu, 25 Feb 2021 15:34:48 -0800
Subject: [PATCH 218/318] BPF: Add LLVMTransformUtils in CMakefile
 LINK_COMPONENTS

Commit 1959ead525b8 ("BPF: Implement TTI.getCmpSelInstrCost()
properly") introduced a dependency on LLVMTransformUtils
library. Let us encode this dependency explicitly in
CMakefile to avoid build error.

(cherry picked from commit 6d102f15a3af0a44cf2e26677e260bee425312f3)
---
 llvm/lib/Target/BPF/CMakeLists.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llvm/lib/Target/BPF/CMakeLists.txt b/llvm/lib/Target/BPF/CMakeLists.txt
index 189a3a84c3df..2d804ca8a73e 100644
--- a/llvm/lib/Target/BPF/CMakeLists.txt
+++ b/llvm/lib/Target/BPF/CMakeLists.txt
@@ -47,6 +47,7 @@ add_llvm_target(BPFCodeGen
   SelectionDAG
   Support
   Target
+  TransformUtils
 
   ADD_TO_COMPONENT
   BPF

From b8e4d4eafeded48f3c07797a2d8ccc950394085e Mon Sep 17 00:00:00 2001
From: Michael Kruse <llvm-project@meinersbur.de>
Date: Wed, 21 Apr 2021 00:09:53 -0500
Subject: [PATCH 219/318] [PollyACC] Fix implicit function definitions. NFC.

The isl_id_* have been in used without including the correspodning
isl/id.h header. According to rules in C, a function is defined
implicitly when first used with an assumed int return type (32 bits on
64 bit systems). But the implementation returns a pointer (64 bits on 64
bit systems). Is usually has no consequence because the return value is
stored in a registers that is 64 bits (RAX) and the optimizer does not
truncate its value before using it again as a pointer value. However,
LTO optimizers will be rightfull;y confused.

Fix by including <isl/id.h>

This fixes llvm.org/PR50021

(cherry picked from commit 90e5ce0b0d6b0e72fdc034cbb612f67d67de0fdd)
---
 polly/lib/External/ppcg/print.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/polly/lib/External/ppcg/print.c b/polly/lib/External/ppcg/print.c
index 79aaf2b00d23..dd839e48e51b 100644
--- a/polly/lib/External/ppcg/print.c
+++ b/polly/lib/External/ppcg/print.c
@@ -9,6 +9,7 @@
 
 #include <isl/aff.h>
 #include <isl/ast_build.h>
+#include <isl/id.h>
 
 #include "print.h"
 #include "util.h"

From 8b2c019ace3c2e04108b550c5a2b60fc1c63865f Mon Sep 17 00:00:00 2001
From: Michael Kruse <llvm-project@meinersbur.de>
Date: Wed, 21 Apr 2021 10:12:23 -0500
Subject: [PATCH 220/318] [PollyACC] Fix declaration/stub definition mismatch.
 NFC.

external.c defines stub functions that are never used because of how
Polly uses PPCG. Unfortunately, they are declared as functions without
return values or parameters which does not match their declarations.
Since they are never called, this was usually not a problem, but an LTO
build gets confused with differently declared functions, or in case of
pet_options_args, a global variable declaration that is defined as a
function

Resolve by including the declaring headers in external.c which forces
the declaration and definition to match at compile-time.

This fixes llvm.org/50021

(cherry picked from commit 89b59345ee29d2cc1afa1f60445916ae2e74be6d)
---
 polly/lib/External/ppcg/external.c | 167 +++++++++++++++--------------
 1 file changed, 89 insertions(+), 78 deletions(-)

diff --git a/polly/lib/External/ppcg/external.c b/polly/lib/External/ppcg/external.c
index 3a63ffbd97e5..c5ef6320e64f 100644
--- a/polly/lib/External/ppcg/external.c
+++ b/polly/lib/External/ppcg/external.c
@@ -1,181 +1,192 @@
-#include "assert.h"
-#include "stdio.h"
-#include "stdlib.h"
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <pet.h>
+#include "cpu.h"
+#include "opencl.h"
+
 
 #define die() { \
   fprintf(stderr, "Dummy function %s called\n", __FUNCTION__); \
   abort(); \
 }
 
-void pet_scop_compute_outer_to_any(){
+__isl_give isl_union_map *pet_scop_compute_outer_to_any(
+  __isl_keep pet_scop *scop) {
   die();
 }
-void pet_scop_compute_outer_to_inner(){
+__isl_give isl_union_map *pet_scop_compute_outer_to_inner(
+  __isl_keep pet_scop *scop) {
   die();
 }
-void pet_tree_get_type(){
+enum pet_tree_type pet_tree_get_type(__isl_keep pet_tree *tree) {
   die();
 }
-void pet_tree_foreach_access_expr(){
+int pet_tree_foreach_access_expr(__isl_keep pet_tree *tree,
+  int (*fn)(__isl_keep pet_expr *expr, void *user), void *user) {
   die();
 }
-void pet_expr_get_ctx(){
+isl_ctx *pet_expr_get_ctx(__isl_keep pet_expr *expr) {
   die();
 }
-void pet_expr_access_is_read(){
+isl_bool pet_expr_access_is_read(__isl_keep pet_expr *expr) {
   die();
 }
-void pet_expr_access_is_write(){
+isl_bool pet_expr_access_is_write(__isl_keep pet_expr *expr) {
   die();
 }
-void pet_expr_access_get_tagged_may_read(){
+__isl_give isl_union_map *pet_expr_access_get_tagged_may_read(
+  __isl_keep pet_expr *expr) {
   die();
 }
-void pet_expr_access_get_tagged_may_write(){
+__isl_give isl_union_map *pet_expr_access_get_tagged_may_write(
+  __isl_keep pet_expr *expr) {
   die();
 }
-void pet_expr_access_get_must_write(){
+__isl_give isl_union_map *pet_expr_access_get_must_write(
+  __isl_keep pet_expr *expr) {
   die();
 }
-void pet_expr_access_get_index(){
+__isl_give isl_multi_pw_aff *pet_expr_access_get_index(
+  __isl_keep pet_expr *expr) {
   die();
 }
-void pet_expr_access_get_ref_id(){
+__isl_give isl_id *pet_expr_access_get_ref_id(__isl_keep pet_expr *expr) {
   die();
 }
-void print_cpu(){
+__isl_give isl_printer *print_cpu(__isl_take isl_printer *p,
+  struct ppcg_scop *ps, struct ppcg_options *options) {
   die();
 }
 
-void pet_stmt_print_body(){
-  die();
-}
-void pet_loc_get_start(){
-  die();
-}
-void pet_loc_get_end(){
-  die();
-}
-void pet_scop_collect_tagged_may_reads(){
-  die();
-}
-void pet_scop_collect_may_reads(){
+__isl_give isl_printer *pet_stmt_print_body(struct pet_stmt *stmt,
+  __isl_take isl_printer *p, __isl_keep isl_id_to_ast_expr *ref2expr) {
   die();
 }
-void pet_scop_collect_tagged_may_writes(){
+unsigned pet_loc_get_start(__isl_keep pet_loc *loc) {
   die();
 }
-void pet_scop_collect_may_writes(){
+unsigned pet_loc_get_end(__isl_keep pet_loc *loc) {
   die();
 }
-void pet_scop_collect_tagged_must_writes(){
+int pet_transform_C_source(isl_ctx *ctx, const char *input, FILE *output,
+  __isl_give isl_printer *(*transform)(__isl_take isl_printer *p,
+    __isl_take pet_scop *scop, void *user), void *user) {
   die();
 }
-void pet_scop_collect_must_writes(){
+__isl_give isl_printer *pet_scop_print_original(__isl_keep pet_scop *scop,
+  __isl_take isl_printer *p) {
   die();
 }
-void pet_scop_collect_tagged_must_kills(){
+__isl_null pet_scop *pet_scop_free(__isl_take pet_scop *scop) {
   die();
 }
-void pet_transform_C_source(){
+__isl_give pet_scop *pet_scop_align_params(__isl_take pet_scop *scop) {
   die();
 }
-void pet_scop_print_original(){
+int pet_scop_can_build_ast_exprs(__isl_keep pet_scop *scop) {
   die();
 }
-void pet_scop_free(){
+int pet_scop_has_data_dependent_conditions(__isl_keep pet_scop *scop) {
   die();
 }
-void pet_scop_align_params(){
+int pet_tree_foreach_expr(__isl_keep pet_tree *tree,
+  int (*fn)(__isl_keep pet_expr *expr, void *user), void *user) {
   die();
 }
-void pet_scop_can_build_ast_exprs(){
+int pet_expr_foreach_call_expr(__isl_keep pet_expr *expr,
+  int (*fn)(__isl_keep pet_expr *expr, void *user), void *user) {
   die();
 }
-void pet_scop_has_data_dependent_conditions(){
+int pet_stmt_is_kill(struct pet_stmt *stmt) {
   die();
 }
-void pet_tree_foreach_expr(){
+struct isl_args pet_options_args;
+const char *ppcg_version(void) {
   die();
 }
-void pet_expr_foreach_call_expr(){
+int pet_options_set_encapsulate_dynamic_control(isl_ctx *ctx, int val) {
   die();
 }
-void pet_stmt_is_kill(){
+int generate_opencl(isl_ctx *ctx, struct ppcg_options *options,
+  const char *input, const char *output) {
   die();
 }
-void pet_options_args() {
+int generate_cpu(isl_ctx *ctx, struct ppcg_options *options,
+  const char *input, const char *output) {
   die();
 }
-void ppcg_print_guarded() {
+__isl_give isl_id_to_ast_expr *pet_stmt_build_ast_exprs(struct pet_stmt *stmt,
+  __isl_keep isl_ast_build *build,
+  __isl_give isl_multi_pw_aff *(*fn_index)(
+    __isl_take isl_multi_pw_aff *mpa, __isl_keep isl_id *id,
+    void *user), void *user_index,
+  __isl_give isl_ast_expr *(*fn_expr)(__isl_take isl_ast_expr *expr,
+    __isl_keep isl_id *id, void *user), void *user_expr) {
   die();
 }
-void ppcg_version() {
+__isl_give isl_union_map *pet_scop_get_tagged_may_reads(
+  __isl_keep pet_scop *scop) {
   die();
 }
-void pet_options_set_encapsulate_dynamic_control() {
+__isl_give isl_union_map *pet_scop_get_may_reads(__isl_keep pet_scop *scop) {
   die();
 }
-void generate_opencl() {
+__isl_give isl_union_map *pet_scop_get_may_writes(__isl_keep pet_scop *scop) {
   die();
 }
-void generate_cpu() {
+__isl_give isl_union_map *pet_scop_get_must_writes(__isl_keep pet_scop *scop) {
   die();
 }
-void pet_stmt_build_ast_exprs() {
+__isl_give isl_union_map *pet_scop_get_tagged_may_writes(
+  __isl_keep pet_scop *scop) {
   die();
 }
- void pet_scop_get_tagged_may_reads() {
+__isl_give isl_union_map *pet_scop_get_tagged_must_writes(
+  __isl_keep pet_scop *scop) {
   die();
 }
- void pet_scop_get_may_reads() {
-  die();
-}
-void pet_scop_get_may_writes() {
-  die();
-}
-void pet_scop_get_must_writes() {
-  die();
-}
-void pet_scop_get_tagged_may_writes() {
-  die();
-}
-void pet_scop_get_tagged_must_writes() {
-die();
-}
-void pet_scop_get_must_kills() {
+__isl_give isl_union_map *pet_scop_get_must_kills(__isl_keep pet_scop *scop) {
   die();
 }
-void pet_scop_get_tagged_must_kills() {
+__isl_give isl_union_map *pet_scop_get_tagged_must_kills(
+  __isl_keep pet_scop *scop) {
   die();
 }
-void pet_expr_call_get_name() {
+__isl_keep const char *pet_expr_call_get_name(__isl_keep pet_expr *expr) {
   die();
 }
-void pet_expr_call_set_name() {
+__isl_give pet_expr *pet_expr_call_set_name(__isl_take pet_expr *expr,
+  __isl_keep const char *name) {
   die();
 }
-void pet_expr_get_arg() {
+__isl_give pet_expr *pet_expr_get_arg(__isl_keep pet_expr *expr, int pos) {
   die();
 }
-void pet_expr_new_cast() {
+__isl_give pet_expr *pet_expr_new_cast(const char *type_name,
+  __isl_take pet_expr *arg) {
   die();
 }
-void pet_expr_set_arg() {
+__isl_give pet_expr *pet_expr_set_arg(__isl_take pet_expr *expr, int pos,
+  __isl_take pet_expr *arg) {
   die();
 }
-void pet_tree_copy() {
+__isl_give pet_tree *pet_tree_copy(__isl_keep pet_tree *tree) {
   die();
 }
-void pet_tree_free() {
+__isl_null pet_tree *pet_tree_free(__isl_take pet_tree *tree) {
   die();
 }
-void pet_tree_map_call_expr() {
+__isl_give pet_tree *pet_tree_map_call_expr(__isl_take pet_tree *tree,
+  __isl_give pet_expr *(*fn)(__isl_take pet_expr *expr, void *user),
+  void *user) {
   die();
 }
-void pet_expr_access_get_may_read() {
+__isl_give isl_union_map *pet_expr_access_get_may_read(
+  __isl_keep pet_expr *expr) {
   die();
 }
-void pet_expr_access_get_may_write() {
+__isl_give isl_union_map *pet_expr_access_get_may_write(
+  __isl_keep pet_expr *expr) {
   die();
 }

From 3263c81589eca689341ab5084723bdb7fe4a1286 Mon Sep 17 00:00:00 2001
From: Tom Stellard <tstellar@redhat.com>
Date: Thu, 11 Feb 2021 22:28:19 +0000
Subject: [PATCH 221/318] Partially Revert "scan-view: Remove Reporter.py and
 associated AppleScript files"

This reverts some of commit dbb01536f6f49fa428f170e34466072ef439b3e9.

The Reporter module was still being used by the ScanView.py module and deleting
it caused scan-view to fail.  This commit adds back Reporter.py but removes the
code the references the AppleScript files which were removed in
dbb01536f6f49fa428f170e34466072ef439b3e9.

Reviewed By: NoQ

Differential Revision: https://reviews.llvm.org/D96367

(cherry picked from commit e3cd3a3c91524c957e06bb0170343548f02b6842)
---
 clang/tools/scan-view/CMakeLists.txt    |   1 +
 clang/tools/scan-view/share/Reporter.py | 183 ++++++++++++++++++++++++
 2 files changed, 184 insertions(+)
 create mode 100644 clang/tools/scan-view/share/Reporter.py

diff --git a/clang/tools/scan-view/CMakeLists.txt b/clang/tools/scan-view/CMakeLists.txt
index dd3d33439299..eccc6b83195b 100644
--- a/clang/tools/scan-view/CMakeLists.txt
+++ b/clang/tools/scan-view/CMakeLists.txt
@@ -5,6 +5,7 @@ set(BinFiles
 
 set(ShareFiles
       ScanView.py
+      Reporter.py
       startfile.py
       bugcatcher.ico)
 
diff --git a/clang/tools/scan-view/share/Reporter.py b/clang/tools/scan-view/share/Reporter.py
new file mode 100644
index 000000000000..31a14fb0cf74
--- /dev/null
+++ b/clang/tools/scan-view/share/Reporter.py
@@ -0,0 +1,183 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+"""Methods for reporting bugs."""
+
+import subprocess, sys, os
+
+__all__ = ['ReportFailure', 'BugReport', 'getReporters']
+
+#
+
+class ReportFailure(Exception):
+    """Generic exception for failures in bug reporting."""
+    def __init__(self, value):        
+        self.value = value
+
+# Collect information about a bug.
+
+class BugReport(object):
+    def __init__(self, title, description, files):
+        self.title = title
+        self.description = description
+        self.files = files
+
+# Reporter interfaces.
+
+import os
+
+import email, mimetypes, smtplib
+from email import encoders
+from email.message import Message
+from email.mime.base import MIMEBase
+from email.mime.multipart import MIMEMultipart
+from email.mime.text import MIMEText
+
+#===------------------------------------------------------------------------===#
+# ReporterParameter
+#===------------------------------------------------------------------------===#
+
+class ReporterParameter(object):
+  def __init__(self, n):
+    self.name = n
+  def getName(self):
+    return self.name
+  def getValue(self,r,bugtype,getConfigOption):
+     return getConfigOption(r.getName(),self.getName())
+  def saveConfigValue(self):
+    return True
+
+class TextParameter (ReporterParameter):
+  def getHTML(self,r,bugtype,getConfigOption):
+    return """\
+<tr>
+<td class="form_clabel">%s:</td>
+<td class="form_value"><input type="text" name="%s_%s" value="%s"></td>
+</tr>"""%(self.getName(),r.getName(),self.getName(),self.getValue(r,bugtype,getConfigOption))
+
+class SelectionParameter (ReporterParameter):
+  def __init__(self, n, values):
+    ReporterParameter.__init__(self,n)
+    self.values = values
+    
+  def getHTML(self,r,bugtype,getConfigOption):
+    default = self.getValue(r,bugtype,getConfigOption)
+    return """\
+<tr>
+<td class="form_clabel">%s:</td><td class="form_value"><select name="%s_%s">
+%s
+</select></td>"""%(self.getName(),r.getName(),self.getName(),'\n'.join(["""\
+<option value="%s"%s>%s</option>"""%(o[0],
+                                     o[0] == default and ' selected="selected"' or '',
+                                     o[1]) for o in self.values]))
+
+#===------------------------------------------------------------------------===#
+# Reporters
+#===------------------------------------------------------------------------===#
+
+class EmailReporter(object):
+    def getName(self):
+        return 'Email'
+
+    def getParameters(self):
+        return [TextParameter(x) for x in ['To', 'From', 'SMTP Server', 'SMTP Port']]
+
+    # Lifted from python email module examples.
+    def attachFile(self, outer, path):
+        # Guess the content type based on the file's extension.  Encoding
+        # will be ignored, although we should check for simple things like
+        # gzip'd or compressed files.
+        ctype, encoding = mimetypes.guess_type(path)
+        if ctype is None or encoding is not None:
+            # No guess could be made, or the file is encoded (compressed), so
+            # use a generic bag-of-bits type.
+            ctype = 'application/octet-stream'
+        maintype, subtype = ctype.split('/', 1)
+        if maintype == 'text':
+            fp = open(path)
+            # Note: we should handle calculating the charset
+            msg = MIMEText(fp.read(), _subtype=subtype)
+            fp.close()
+        else:
+            fp = open(path, 'rb')
+            msg = MIMEBase(maintype, subtype)
+            msg.set_payload(fp.read())
+            fp.close()
+            # Encode the payload using Base64
+            encoders.encode_base64(msg)
+        # Set the filename parameter
+        msg.add_header('Content-Disposition', 'attachment', filename=os.path.basename(path))
+        outer.attach(msg)
+
+    def fileReport(self, report, parameters):
+        mainMsg = """\
+BUG REPORT
+---
+Title: %s
+Description: %s
+"""%(report.title, report.description)
+
+        if not parameters.get('To'):
+            raise ReportFailure('No "To" address specified.')
+        if not parameters.get('From'):
+            raise ReportFailure('No "From" address specified.')
+
+        msg = MIMEMultipart()
+        msg['Subject'] = 'BUG REPORT: %s'%(report.title)
+        # FIXME: Get config parameters
+        msg['To'] = parameters.get('To')
+        msg['From'] = parameters.get('From')
+        msg.preamble = mainMsg
+
+        msg.attach(MIMEText(mainMsg, _subtype='text/plain'))
+        for file in report.files:
+            self.attachFile(msg, file)
+
+        try:
+            s = smtplib.SMTP(host=parameters.get('SMTP Server'),
+                             port=parameters.get('SMTP Port'))
+            s.sendmail(msg['From'], msg['To'], msg.as_string())
+            s.close()
+        except:
+            raise ReportFailure('Unable to send message via SMTP.')
+
+        return "Message sent!"
+
+class BugzillaReporter(object):
+    def getName(self):
+        return 'Bugzilla'
+    
+    def getParameters(self):
+        return [TextParameter(x) for x in ['URL','Product']]
+
+    def fileReport(self, report, parameters):
+        raise NotImplementedError
+ 
+
+class RadarClassificationParameter(SelectionParameter):
+  def __init__(self):
+    SelectionParameter.__init__(self,"Classification",
+            [['1', 'Security'], ['2', 'Crash/Hang/Data Loss'],
+             ['3', 'Performance'], ['4', 'UI/Usability'], 
+             ['6', 'Serious Bug'], ['7', 'Other']])
+
+  def saveConfigValue(self):
+    return False
+    
+  def getValue(self,r,bugtype,getConfigOption):
+    if bugtype.find("leak") != -1:
+      return '3'
+    elif bugtype.find("dereference") != -1:
+      return '2'
+    elif bugtype.find("missing ivar release") != -1:
+      return '3'
+    else:
+      return '7'
+
+###
+
+def getReporters():
+    reporters = []
+    reporters.append(EmailReporter())
+    return reporters
+

From c1831fc655979a0501a792f730d84d68e15a888e Mon Sep 17 00:00:00 2001
From: ShihPo Hung <shihpo.hung@sifive.com>
Date: Wed, 17 Mar 2021 19:08:46 -0700
Subject: [PATCH 222/318] [RISCV] Fix isel pattern of masked vmslt[u]

This patch changes the operand order of masked vmslt[u]
from (mask, rs1, scalar, maskedoff, vl)
to (maskedoff, rs1, scalar, mask, vl).

Reviewed By: craig.topper

Differential Revision: https://reviews.llvm.org/D98839

(cherry picked from commit fca5d63aa8d43a21557874d9bc040e944ab0500d)
---
 .../Target/RISCV/RISCVInstrInfoVPseudos.td    | 20 +++---
 llvm/test/CodeGen/RISCV/rvv/vmslt-rv32.ll     | 60 ++++++++++++----
 llvm/test/CodeGen/RISCV/rvv/vmslt-rv64.ll     | 72 ++++++++++++++-----
 llvm/test/CodeGen/RISCV/rvv/vmsltu-rv32.ll    | 60 ++++++++++++----
 llvm/test/CodeGen/RISCV/rvv/vmsltu-rv64.ll    | 72 ++++++++++++++-----
 5 files changed, 208 insertions(+), 76 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
index 60bd1b24cab8..5c228820f0cc 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
@@ -3909,10 +3909,10 @@ foreach vti = AllIntegerVectors in {
                                                                (DecImm simm5_plus1:$rs2),
                                                                GPR:$vl,
                                                                vti.SEW)>;
-  def : Pat<(vti.Mask (int_riscv_vmslt_mask (vti.Mask V0),
+  def : Pat<(vti.Mask (int_riscv_vmslt_mask (vti.Mask VR:$merge),
                                             (vti.Vector vti.RegClass:$rs1),
                                             (vti.Scalar simm5_plus1:$rs2),
-                                            (vti.Mask VR:$merge),
+                                            (vti.Mask V0),
                                             (XLenVT (VLOp GPR:$vl)))),
             (!cast<Instruction>("PseudoVMSLE_VI_"#vti.LMul.MX#"_MASK")
                                                       VR:$merge,
@@ -3922,17 +3922,17 @@ foreach vti = AllIntegerVectors in {
                                                       GPR:$vl,
                                                       vti.SEW)>;
 
- def : Pat<(vti.Mask (int_riscv_vmsltu (vti.Vector vti.RegClass:$rs1),
+  def : Pat<(vti.Mask (int_riscv_vmsltu (vti.Vector vti.RegClass:$rs1),
                                         (vti.Scalar simm5_plus1:$rs2),
                                         (XLenVT (VLOp GPR:$vl)))),
             (!cast<Instruction>("PseudoVMSLEU_VI_"#vti.LMul.MX) vti.RegClass:$rs1,
                                                                 (DecImm simm5_plus1:$rs2),
                                                                 GPR:$vl,
                                                                 vti.SEW)>;
-  def : Pat<(vti.Mask (int_riscv_vmsltu_mask (vti.Mask V0),
+  def : Pat<(vti.Mask (int_riscv_vmsltu_mask (vti.Mask VR:$merge),
                                              (vti.Vector vti.RegClass:$rs1),
                                              (vti.Scalar simm5_plus1:$rs2),
-                                             (vti.Mask VR:$merge),
+                                             (vti.Mask V0),
                                              (XLenVT (VLOp GPR:$vl)))),
             (!cast<Instruction>("PseudoVMSLEU_VI_"#vti.LMul.MX#"_MASK")
                                                       VR:$merge,
@@ -3950,11 +3950,11 @@ foreach vti = AllIntegerVectors in {
                                                                vti.RegClass:$rs1,
                                                                GPR:$vl,
                                                                vti.SEW)>;
-  def : Pat<(vti.Mask (int_riscv_vmsltu_mask (vti.Mask V0),
-                                            (vti.Vector vti.RegClass:$rs1),
-                                            (vti.Scalar 0),
-                                            (vti.Mask VR:$merge),
-                                            (XLenVT (VLOp GPR:$vl)))),
+  def : Pat<(vti.Mask (int_riscv_vmsltu_mask (vti.Mask VR:$merge),
+                                             (vti.Vector vti.RegClass:$rs1),
+                                             (vti.Scalar 0),
+                                             (vti.Mask V0),
+                                             (XLenVT (VLOp GPR:$vl)))),
             (!cast<Instruction>("PseudoVMSNE_VV_"#vti.LMul.MX#"_MASK")
                                                      VR:$merge,
                                                      vti.RegClass:$rs1,
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmslt-rv32.ll b/llvm/test/CodeGen/RISCV/rvv/vmslt-rv32.ll
index 894a232a167d..a51949573c97 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmslt-rv32.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmslt-rv32.ll
@@ -1504,9 +1504,11 @@ entry:
 define <vscale x 1 x i1> @intrinsic_vmslt_mask_vi_nxv1i8_i8(<vscale x 1 x i1> %0, <vscale x 1 x i8> %1, <vscale x 1 x i1> %2, i32 %3) nounwind {
 ; CHECK-LABEL: intrinsic_vmslt_mask_vi_nxv1i8_i8:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v25, v0
 ; CHECK-NEXT:    vsetvli a0, a0, e8,mf8,tu,mu
-; CHECK-NEXT:    vmsle.vi v9, v8, -15, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v9
+; CHECK-NEXT:    vmsle.vi v25, v8, -15, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    jalr zero, 0(ra)
 entry:
   %a = call <vscale x 1 x i1> @llvm.riscv.vmslt.mask.nxv1i8.i8(
@@ -1537,9 +1539,11 @@ entry:
 define <vscale x 2 x i1> @intrinsic_vmslt_mask_vi_nxv2i8_i8(<vscale x 2 x i1> %0, <vscale x 2 x i8> %1, <vscale x 2 x i1> %2, i32 %3) nounwind {
 ; CHECK-LABEL: intrinsic_vmslt_mask_vi_nxv2i8_i8:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v25, v0
 ; CHECK-NEXT:    vsetvli a0, a0, e8,mf4,tu,mu
-; CHECK-NEXT:    vmsle.vi v9, v8, -13, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v9
+; CHECK-NEXT:    vmsle.vi v25, v8, -13, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    jalr zero, 0(ra)
 entry:
   %a = call <vscale x 2 x i1> @llvm.riscv.vmslt.mask.nxv2i8.i8(
@@ -1570,9 +1574,11 @@ entry:
 define <vscale x 4 x i1> @intrinsic_vmslt_mask_vi_nxv4i8_i8(<vscale x 4 x i1> %0, <vscale x 4 x i8> %1, <vscale x 4 x i1> %2, i32 %3) nounwind {
 ; CHECK-LABEL: intrinsic_vmslt_mask_vi_nxv4i8_i8:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v25, v0
 ; CHECK-NEXT:    vsetvli a0, a0, e8,mf2,tu,mu
-; CHECK-NEXT:    vmsle.vi v9, v8, -11, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v9
+; CHECK-NEXT:    vmsle.vi v25, v8, -11, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    jalr zero, 0(ra)
 entry:
   %a = call <vscale x 4 x i1> @llvm.riscv.vmslt.mask.nxv4i8.i8(
@@ -1603,9 +1609,11 @@ entry:
 define <vscale x 8 x i1> @intrinsic_vmslt_mask_vi_nxv8i8_i8(<vscale x 8 x i1> %0, <vscale x 8 x i8> %1, <vscale x 8 x i1> %2, i32 %3) nounwind {
 ; CHECK-LABEL: intrinsic_vmslt_mask_vi_nxv8i8_i8:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v25, v0
 ; CHECK-NEXT:    vsetvli a0, a0, e8,m1,tu,mu
-; CHECK-NEXT:    vmsle.vi v9, v8, -9, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v9
+; CHECK-NEXT:    vmsle.vi v25, v8, -9, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    jalr zero, 0(ra)
 entry:
   %a = call <vscale x 8 x i1> @llvm.riscv.vmslt.mask.nxv8i8.i8(
@@ -1636,9 +1644,11 @@ entry:
 define <vscale x 16 x i1> @intrinsic_vmslt_mask_vi_nxv16i8_i8(<vscale x 16 x i1> %0, <vscale x 16 x i8> %1, <vscale x 16 x i1> %2, i32 %3) nounwind {
 ; CHECK-LABEL: intrinsic_vmslt_mask_vi_nxv16i8_i8:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v25, v0
 ; CHECK-NEXT:    vsetvli a0, a0, e8,m2,tu,mu
-; CHECK-NEXT:    vmsle.vi v10, v8, -7, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmsle.vi v25, v8, -7, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    jalr zero, 0(ra)
 entry:
   %a = call <vscale x 16 x i1> @llvm.riscv.vmslt.mask.nxv16i8.i8(
@@ -1669,9 +1679,11 @@ entry:
 define <vscale x 32 x i1> @intrinsic_vmslt_mask_vi_nxv32i8_i8(<vscale x 32 x i1> %0, <vscale x 32 x i8> %1, <vscale x 32 x i1> %2, i32 %3) nounwind {
 ; CHECK-LABEL: intrinsic_vmslt_mask_vi_nxv32i8_i8:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v25, v0
 ; CHECK-NEXT:    vsetvli a0, a0, e8,m4,tu,mu
-; CHECK-NEXT:    vmsle.vi v12, v8, -5, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmsle.vi v25, v8, -5, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    jalr zero, 0(ra)
 entry:
   %a = call <vscale x 32 x i1> @llvm.riscv.vmslt.mask.nxv32i8.i8(
@@ -1702,9 +1714,11 @@ entry:
 define <vscale x 1 x i1> @intrinsic_vmslt_mask_vi_nxv1i16_i16(<vscale x 1 x i1> %0, <vscale x 1 x i16> %1, <vscale x 1 x i1> %2, i32 %3) nounwind {
 ; CHECK-LABEL: intrinsic_vmslt_mask_vi_nxv1i16_i16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v25, v0
 ; CHECK-NEXT:    vsetvli a0, a0, e16,mf4,tu,mu
-; CHECK-NEXT:    vmsle.vi v9, v8, -3, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v9
+; CHECK-NEXT:    vmsle.vi v25, v8, -3, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    jalr zero, 0(ra)
 entry:
   %a = call <vscale x 1 x i1> @llvm.riscv.vmslt.mask.nxv1i16.i16(
@@ -1735,9 +1749,11 @@ entry:
 define <vscale x 2 x i1> @intrinsic_vmslt_mask_vi_nxv2i16_i16(<vscale x 2 x i1> %0, <vscale x 2 x i16> %1, <vscale x 2 x i1> %2, i32 %3) nounwind {
 ; CHECK-LABEL: intrinsic_vmslt_mask_vi_nxv2i16_i16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v25, v0
 ; CHECK-NEXT:    vsetvli a0, a0, e16,mf2,tu,mu
-; CHECK-NEXT:    vmsle.vi v9, v8, -1, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v9
+; CHECK-NEXT:    vmsle.vi v25, v8, -1, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    jalr zero, 0(ra)
 entry:
   %a = call <vscale x 2 x i1> @llvm.riscv.vmslt.mask.nxv2i16.i16(
@@ -1768,9 +1784,11 @@ entry:
 define <vscale x 4 x i1> @intrinsic_vmslt_mask_vi_nxv4i16_i16(<vscale x 4 x i1> %0, <vscale x 4 x i16> %1, <vscale x 4 x i1> %2, i32 %3) nounwind {
 ; CHECK-LABEL: intrinsic_vmslt_mask_vi_nxv4i16_i16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v25, v0
 ; CHECK-NEXT:    vsetvli a0, a0, e16,m1,tu,mu
-; CHECK-NEXT:    vmsle.vi v9, v8, 0, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v9
+; CHECK-NEXT:    vmsle.vi v25, v8, 0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    jalr zero, 0(ra)
 entry:
   %a = call <vscale x 4 x i1> @llvm.riscv.vmslt.mask.nxv4i16.i16(
@@ -1801,9 +1819,11 @@ entry:
 define <vscale x 8 x i1> @intrinsic_vmslt_mask_vi_nxv8i16_i16(<vscale x 8 x i1> %0, <vscale x 8 x i16> %1, <vscale x 8 x i1> %2, i32 %3) nounwind {
 ; CHECK-LABEL: intrinsic_vmslt_mask_vi_nxv8i16_i16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v25, v0
 ; CHECK-NEXT:    vsetvli a0, a0, e16,m2,tu,mu
-; CHECK-NEXT:    vmsle.vi v10, v8, 2, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmsle.vi v25, v8, 2, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    jalr zero, 0(ra)
 entry:
   %a = call <vscale x 8 x i1> @llvm.riscv.vmslt.mask.nxv8i16.i16(
@@ -1834,9 +1854,11 @@ entry:
 define <vscale x 16 x i1> @intrinsic_vmslt_mask_vi_nxv16i16_i16(<vscale x 16 x i1> %0, <vscale x 16 x i16> %1, <vscale x 16 x i1> %2, i32 %3) nounwind {
 ; CHECK-LABEL: intrinsic_vmslt_mask_vi_nxv16i16_i16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v25, v0
 ; CHECK-NEXT:    vsetvli a0, a0, e16,m4,tu,mu
-; CHECK-NEXT:    vmsle.vi v12, v8, 4, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmsle.vi v25, v8, 4, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    jalr zero, 0(ra)
 entry:
   %a = call <vscale x 16 x i1> @llvm.riscv.vmslt.mask.nxv16i16.i16(
@@ -1867,9 +1889,11 @@ entry:
 define <vscale x 1 x i1> @intrinsic_vmslt_mask_vi_nxv1i32_i32(<vscale x 1 x i1> %0, <vscale x 1 x i32> %1, <vscale x 1 x i1> %2, i32 %3) nounwind {
 ; CHECK-LABEL: intrinsic_vmslt_mask_vi_nxv1i32_i32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v25, v0
 ; CHECK-NEXT:    vsetvli a0, a0, e32,mf2,tu,mu
-; CHECK-NEXT:    vmsle.vi v9, v8, 6, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v9
+; CHECK-NEXT:    vmsle.vi v25, v8, 6, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    jalr zero, 0(ra)
 entry:
   %a = call <vscale x 1 x i1> @llvm.riscv.vmslt.mask.nxv1i32.i32(
@@ -1900,9 +1924,11 @@ entry:
 define <vscale x 2 x i1> @intrinsic_vmslt_mask_vi_nxv2i32_i32(<vscale x 2 x i1> %0, <vscale x 2 x i32> %1, <vscale x 2 x i1> %2, i32 %3) nounwind {
 ; CHECK-LABEL: intrinsic_vmslt_mask_vi_nxv2i32_i32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v25, v0
 ; CHECK-NEXT:    vsetvli a0, a0, e32,m1,tu,mu
-; CHECK-NEXT:    vmsle.vi v9, v8, 8, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v9
+; CHECK-NEXT:    vmsle.vi v25, v8, 8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    jalr zero, 0(ra)
 entry:
   %a = call <vscale x 2 x i1> @llvm.riscv.vmslt.mask.nxv2i32.i32(
@@ -1933,9 +1959,11 @@ entry:
 define <vscale x 4 x i1> @intrinsic_vmslt_mask_vi_nxv4i32_i32(<vscale x 4 x i1> %0, <vscale x 4 x i32> %1, <vscale x 4 x i1> %2, i32 %3) nounwind {
 ; CHECK-LABEL: intrinsic_vmslt_mask_vi_nxv4i32_i32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v25, v0
 ; CHECK-NEXT:    vsetvli a0, a0, e32,m2,tu,mu
-; CHECK-NEXT:    vmsle.vi v10, v8, 10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmsle.vi v25, v8, 10, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    jalr zero, 0(ra)
 entry:
   %a = call <vscale x 4 x i1> @llvm.riscv.vmslt.mask.nxv4i32.i32(
@@ -1966,9 +1994,11 @@ entry:
 define <vscale x 8 x i1> @intrinsic_vmslt_mask_vi_nxv8i32_i32(<vscale x 8 x i1> %0, <vscale x 8 x i32> %1, <vscale x 8 x i1> %2, i32 %3) nounwind {
 ; CHECK-LABEL: intrinsic_vmslt_mask_vi_nxv8i32_i32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v25, v0
 ; CHECK-NEXT:    vsetvli a0, a0, e32,m4,tu,mu
-; CHECK-NEXT:    vmsle.vi v12, v8, 12, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmsle.vi v25, v8, 12, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    jalr zero, 0(ra)
 entry:
   %a = call <vscale x 8 x i1> @llvm.riscv.vmslt.mask.nxv8i32.i32(
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmslt-rv64.ll b/llvm/test/CodeGen/RISCV/rvv/vmslt-rv64.ll
index 7802c2a84ff8..c75f37c440d5 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmslt-rv64.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmslt-rv64.ll
@@ -1801,9 +1801,11 @@ entry:
 define <vscale x 1 x i1> @intrinsic_vmslt_mask_vi_nxv1i8_i8(<vscale x 1 x i1> %0, <vscale x 1 x i8> %1, <vscale x 1 x i1> %2, i64 %3) nounwind {
 ; CHECK-LABEL: intrinsic_vmslt_mask_vi_nxv1i8_i8:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v25, v0
 ; CHECK-NEXT:    vsetvli a0, a0, e8,mf8,tu,mu
-; CHECK-NEXT:    vmsle.vi v9, v8, -15, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v9
+; CHECK-NEXT:    vmsle.vi v25, v8, -15, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    jalr zero, 0(ra)
 entry:
   %a = call <vscale x 1 x i1> @llvm.riscv.vmslt.mask.nxv1i8.i8(
@@ -1834,9 +1836,11 @@ entry:
 define <vscale x 2 x i1> @intrinsic_vmslt_mask_vi_nxv2i8_i8(<vscale x 2 x i1> %0, <vscale x 2 x i8> %1, <vscale x 2 x i1> %2, i64 %3) nounwind {
 ; CHECK-LABEL: intrinsic_vmslt_mask_vi_nxv2i8_i8:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v25, v0
 ; CHECK-NEXT:    vsetvli a0, a0, e8,mf4,tu,mu
-; CHECK-NEXT:    vmsle.vi v9, v8, -13, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v9
+; CHECK-NEXT:    vmsle.vi v25, v8, -13, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    jalr zero, 0(ra)
 entry:
   %a = call <vscale x 2 x i1> @llvm.riscv.vmslt.mask.nxv2i8.i8(
@@ -1867,9 +1871,11 @@ entry:
 define <vscale x 4 x i1> @intrinsic_vmslt_mask_vi_nxv4i8_i8(<vscale x 4 x i1> %0, <vscale x 4 x i8> %1, <vscale x 4 x i1> %2, i64 %3) nounwind {
 ; CHECK-LABEL: intrinsic_vmslt_mask_vi_nxv4i8_i8:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v25, v0
 ; CHECK-NEXT:    vsetvli a0, a0, e8,mf2,tu,mu
-; CHECK-NEXT:    vmsle.vi v9, v8, -11, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v9
+; CHECK-NEXT:    vmsle.vi v25, v8, -11, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    jalr zero, 0(ra)
 entry:
   %a = call <vscale x 4 x i1> @llvm.riscv.vmslt.mask.nxv4i8.i8(
@@ -1900,9 +1906,11 @@ entry:
 define <vscale x 8 x i1> @intrinsic_vmslt_mask_vi_nxv8i8_i8(<vscale x 8 x i1> %0, <vscale x 8 x i8> %1, <vscale x 8 x i1> %2, i64 %3) nounwind {
 ; CHECK-LABEL: intrinsic_vmslt_mask_vi_nxv8i8_i8:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v25, v0
 ; CHECK-NEXT:    vsetvli a0, a0, e8,m1,tu,mu
-; CHECK-NEXT:    vmsle.vi v9, v8, -9, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v9
+; CHECK-NEXT:    vmsle.vi v25, v8, -9, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    jalr zero, 0(ra)
 entry:
   %a = call <vscale x 8 x i1> @llvm.riscv.vmslt.mask.nxv8i8.i8(
@@ -1933,9 +1941,11 @@ entry:
 define <vscale x 16 x i1> @intrinsic_vmslt_mask_vi_nxv16i8_i8(<vscale x 16 x i1> %0, <vscale x 16 x i8> %1, <vscale x 16 x i1> %2, i64 %3) nounwind {
 ; CHECK-LABEL: intrinsic_vmslt_mask_vi_nxv16i8_i8:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v25, v0
 ; CHECK-NEXT:    vsetvli a0, a0, e8,m2,tu,mu
-; CHECK-NEXT:    vmsle.vi v10, v8, -7, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmsle.vi v25, v8, -7, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    jalr zero, 0(ra)
 entry:
   %a = call <vscale x 16 x i1> @llvm.riscv.vmslt.mask.nxv16i8.i8(
@@ -1966,9 +1976,11 @@ entry:
 define <vscale x 32 x i1> @intrinsic_vmslt_mask_vi_nxv32i8_i8(<vscale x 32 x i1> %0, <vscale x 32 x i8> %1, <vscale x 32 x i1> %2, i64 %3) nounwind {
 ; CHECK-LABEL: intrinsic_vmslt_mask_vi_nxv32i8_i8:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v25, v0
 ; CHECK-NEXT:    vsetvli a0, a0, e8,m4,tu,mu
-; CHECK-NEXT:    vmsle.vi v12, v8, -5, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmsle.vi v25, v8, -5, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    jalr zero, 0(ra)
 entry:
   %a = call <vscale x 32 x i1> @llvm.riscv.vmslt.mask.nxv32i8.i8(
@@ -1999,9 +2011,11 @@ entry:
 define <vscale x 1 x i1> @intrinsic_vmslt_mask_vi_nxv1i16_i16(<vscale x 1 x i1> %0, <vscale x 1 x i16> %1, <vscale x 1 x i1> %2, i64 %3) nounwind {
 ; CHECK-LABEL: intrinsic_vmslt_mask_vi_nxv1i16_i16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v25, v0
 ; CHECK-NEXT:    vsetvli a0, a0, e16,mf4,tu,mu
-; CHECK-NEXT:    vmsle.vi v9, v8, -3, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v9
+; CHECK-NEXT:    vmsle.vi v25, v8, -3, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    jalr zero, 0(ra)
 entry:
   %a = call <vscale x 1 x i1> @llvm.riscv.vmslt.mask.nxv1i16.i16(
@@ -2032,9 +2046,11 @@ entry:
 define <vscale x 2 x i1> @intrinsic_vmslt_mask_vi_nxv2i16_i16(<vscale x 2 x i1> %0, <vscale x 2 x i16> %1, <vscale x 2 x i1> %2, i64 %3) nounwind {
 ; CHECK-LABEL: intrinsic_vmslt_mask_vi_nxv2i16_i16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v25, v0
 ; CHECK-NEXT:    vsetvli a0, a0, e16,mf2,tu,mu
-; CHECK-NEXT:    vmsle.vi v9, v8, -1, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v9
+; CHECK-NEXT:    vmsle.vi v25, v8, -1, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    jalr zero, 0(ra)
 entry:
   %a = call <vscale x 2 x i1> @llvm.riscv.vmslt.mask.nxv2i16.i16(
@@ -2065,9 +2081,11 @@ entry:
 define <vscale x 4 x i1> @intrinsic_vmslt_mask_vi_nxv4i16_i16(<vscale x 4 x i1> %0, <vscale x 4 x i16> %1, <vscale x 4 x i1> %2, i64 %3) nounwind {
 ; CHECK-LABEL: intrinsic_vmslt_mask_vi_nxv4i16_i16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v25, v0
 ; CHECK-NEXT:    vsetvli a0, a0, e16,m1,tu,mu
-; CHECK-NEXT:    vmsle.vi v9, v8, 0, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v9
+; CHECK-NEXT:    vmsle.vi v25, v8, 0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    jalr zero, 0(ra)
 entry:
   %a = call <vscale x 4 x i1> @llvm.riscv.vmslt.mask.nxv4i16.i16(
@@ -2098,9 +2116,11 @@ entry:
 define <vscale x 8 x i1> @intrinsic_vmslt_mask_vi_nxv8i16_i16(<vscale x 8 x i1> %0, <vscale x 8 x i16> %1, <vscale x 8 x i1> %2, i64 %3) nounwind {
 ; CHECK-LABEL: intrinsic_vmslt_mask_vi_nxv8i16_i16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v25, v0
 ; CHECK-NEXT:    vsetvli a0, a0, e16,m2,tu,mu
-; CHECK-NEXT:    vmsle.vi v10, v8, 2, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmsle.vi v25, v8, 2, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    jalr zero, 0(ra)
 entry:
   %a = call <vscale x 8 x i1> @llvm.riscv.vmslt.mask.nxv8i16.i16(
@@ -2131,9 +2151,11 @@ entry:
 define <vscale x 16 x i1> @intrinsic_vmslt_mask_vi_nxv16i16_i16(<vscale x 16 x i1> %0, <vscale x 16 x i16> %1, <vscale x 16 x i1> %2, i64 %3) nounwind {
 ; CHECK-LABEL: intrinsic_vmslt_mask_vi_nxv16i16_i16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v25, v0
 ; CHECK-NEXT:    vsetvli a0, a0, e16,m4,tu,mu
-; CHECK-NEXT:    vmsle.vi v12, v8, 4, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmsle.vi v25, v8, 4, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    jalr zero, 0(ra)
 entry:
   %a = call <vscale x 16 x i1> @llvm.riscv.vmslt.mask.nxv16i16.i16(
@@ -2164,9 +2186,11 @@ entry:
 define <vscale x 1 x i1> @intrinsic_vmslt_mask_vi_nxv1i32_i32(<vscale x 1 x i1> %0, <vscale x 1 x i32> %1, <vscale x 1 x i1> %2, i64 %3) nounwind {
 ; CHECK-LABEL: intrinsic_vmslt_mask_vi_nxv1i32_i32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v25, v0
 ; CHECK-NEXT:    vsetvli a0, a0, e32,mf2,tu,mu
-; CHECK-NEXT:    vmsle.vi v9, v8, 6, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v9
+; CHECK-NEXT:    vmsle.vi v25, v8, 6, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    jalr zero, 0(ra)
 entry:
   %a = call <vscale x 1 x i1> @llvm.riscv.vmslt.mask.nxv1i32.i32(
@@ -2197,9 +2221,11 @@ entry:
 define <vscale x 2 x i1> @intrinsic_vmslt_mask_vi_nxv2i32_i32(<vscale x 2 x i1> %0, <vscale x 2 x i32> %1, <vscale x 2 x i1> %2, i64 %3) nounwind {
 ; CHECK-LABEL: intrinsic_vmslt_mask_vi_nxv2i32_i32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v25, v0
 ; CHECK-NEXT:    vsetvli a0, a0, e32,m1,tu,mu
-; CHECK-NEXT:    vmsle.vi v9, v8, 8, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v9
+; CHECK-NEXT:    vmsle.vi v25, v8, 8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    jalr zero, 0(ra)
 entry:
   %a = call <vscale x 2 x i1> @llvm.riscv.vmslt.mask.nxv2i32.i32(
@@ -2230,9 +2256,11 @@ entry:
 define <vscale x 4 x i1> @intrinsic_vmslt_mask_vi_nxv4i32_i32(<vscale x 4 x i1> %0, <vscale x 4 x i32> %1, <vscale x 4 x i1> %2, i64 %3) nounwind {
 ; CHECK-LABEL: intrinsic_vmslt_mask_vi_nxv4i32_i32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v25, v0
 ; CHECK-NEXT:    vsetvli a0, a0, e32,m2,tu,mu
-; CHECK-NEXT:    vmsle.vi v10, v8, 10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmsle.vi v25, v8, 10, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    jalr zero, 0(ra)
 entry:
   %a = call <vscale x 4 x i1> @llvm.riscv.vmslt.mask.nxv4i32.i32(
@@ -2263,9 +2291,11 @@ entry:
 define <vscale x 8 x i1> @intrinsic_vmslt_mask_vi_nxv8i32_i32(<vscale x 8 x i1> %0, <vscale x 8 x i32> %1, <vscale x 8 x i1> %2, i64 %3) nounwind {
 ; CHECK-LABEL: intrinsic_vmslt_mask_vi_nxv8i32_i32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v25, v0
 ; CHECK-NEXT:    vsetvli a0, a0, e32,m4,tu,mu
-; CHECK-NEXT:    vmsle.vi v12, v8, 12, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmsle.vi v25, v8, 12, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    jalr zero, 0(ra)
 entry:
   %a = call <vscale x 8 x i1> @llvm.riscv.vmslt.mask.nxv8i32.i32(
@@ -2296,9 +2326,11 @@ entry:
 define <vscale x 1 x i1> @intrinsic_vmslt_mask_vi_nxv1i64_i64(<vscale x 1 x i1> %0, <vscale x 1 x i64> %1, <vscale x 1 x i1> %2, i64 %3) nounwind {
 ; CHECK-LABEL: intrinsic_vmslt_mask_vi_nxv1i64_i64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v25, v0
 ; CHECK-NEXT:    vsetvli a0, a0, e64,m1,tu,mu
-; CHECK-NEXT:    vmsle.vi v9, v8, 14, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v9
+; CHECK-NEXT:    vmsle.vi v25, v8, 14, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    jalr zero, 0(ra)
 entry:
   %a = call <vscale x 1 x i1> @llvm.riscv.vmslt.mask.nxv1i64.i64(
@@ -2329,9 +2361,11 @@ entry:
 define <vscale x 2 x i1> @intrinsic_vmslt_mask_vi_nxv2i64_i64(<vscale x 2 x i1> %0, <vscale x 2 x i64> %1, <vscale x 2 x i1> %2, i64 %3) nounwind {
 ; CHECK-LABEL: intrinsic_vmslt_mask_vi_nxv2i64_i64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v25, v0
 ; CHECK-NEXT:    vsetvli a0, a0, e64,m2,tu,mu
-; CHECK-NEXT:    vmsle.vi v10, v8, -16, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmsle.vi v25, v8, -16, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    jalr zero, 0(ra)
 entry:
   %a = call <vscale x 2 x i1> @llvm.riscv.vmslt.mask.nxv2i64.i64(
@@ -2362,9 +2396,11 @@ entry:
 define <vscale x 4 x i1> @intrinsic_vmslt_mask_vi_nxv4i64_i64(<vscale x 4 x i1> %0, <vscale x 4 x i64> %1, <vscale x 4 x i1> %2, i64 %3) nounwind {
 ; CHECK-LABEL: intrinsic_vmslt_mask_vi_nxv4i64_i64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v25, v0
 ; CHECK-NEXT:    vsetvli a0, a0, e64,m4,tu,mu
-; CHECK-NEXT:    vmsle.vi v12, v8, -14, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmsle.vi v25, v8, -14, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    jalr zero, 0(ra)
 entry:
   %a = call <vscale x 4 x i1> @llvm.riscv.vmslt.mask.nxv4i64.i64(
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmsltu-rv32.ll b/llvm/test/CodeGen/RISCV/rvv/vmsltu-rv32.ll
index 8cd17ead0234..ca78411b24c2 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmsltu-rv32.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmsltu-rv32.ll
@@ -1504,9 +1504,11 @@ entry:
 define <vscale x 1 x i1> @intrinsic_vmsltu_mask_vi_nxv1i8_i8(<vscale x 1 x i1> %0, <vscale x 1 x i8> %1, <vscale x 1 x i1> %2, i32 %3) nounwind {
 ; CHECK-LABEL: intrinsic_vmsltu_mask_vi_nxv1i8_i8:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v25, v0
 ; CHECK-NEXT:    vsetvli a0, a0, e8,mf8,tu,mu
-; CHECK-NEXT:    vmsleu.vi v9, v8, -15, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v9
+; CHECK-NEXT:    vmsleu.vi v25, v8, -15, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    jalr zero, 0(ra)
 entry:
   %a = call <vscale x 1 x i1> @llvm.riscv.vmsltu.mask.nxv1i8.i8(
@@ -1537,9 +1539,11 @@ entry:
 define <vscale x 2 x i1> @intrinsic_vmsltu_mask_vi_nxv2i8_i8(<vscale x 2 x i1> %0, <vscale x 2 x i8> %1, <vscale x 2 x i1> %2, i32 %3) nounwind {
 ; CHECK-LABEL: intrinsic_vmsltu_mask_vi_nxv2i8_i8:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v25, v0
 ; CHECK-NEXT:    vsetvli a0, a0, e8,mf4,tu,mu
-; CHECK-NEXT:    vmsleu.vi v9, v8, -13, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v9
+; CHECK-NEXT:    vmsleu.vi v25, v8, -13, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    jalr zero, 0(ra)
 entry:
   %a = call <vscale x 2 x i1> @llvm.riscv.vmsltu.mask.nxv2i8.i8(
@@ -1570,9 +1574,11 @@ entry:
 define <vscale x 4 x i1> @intrinsic_vmsltu_mask_vi_nxv4i8_i8(<vscale x 4 x i1> %0, <vscale x 4 x i8> %1, <vscale x 4 x i1> %2, i32 %3) nounwind {
 ; CHECK-LABEL: intrinsic_vmsltu_mask_vi_nxv4i8_i8:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v25, v0
 ; CHECK-NEXT:    vsetvli a0, a0, e8,mf2,tu,mu
-; CHECK-NEXT:    vmsleu.vi v9, v8, -11, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v9
+; CHECK-NEXT:    vmsleu.vi v25, v8, -11, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    jalr zero, 0(ra)
 entry:
   %a = call <vscale x 4 x i1> @llvm.riscv.vmsltu.mask.nxv4i8.i8(
@@ -1603,9 +1609,11 @@ entry:
 define <vscale x 8 x i1> @intrinsic_vmsltu_mask_vi_nxv8i8_i8(<vscale x 8 x i1> %0, <vscale x 8 x i8> %1, <vscale x 8 x i1> %2, i32 %3) nounwind {
 ; CHECK-LABEL: intrinsic_vmsltu_mask_vi_nxv8i8_i8:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v25, v0
 ; CHECK-NEXT:    vsetvli a0, a0, e8,m1,tu,mu
-; CHECK-NEXT:    vmsleu.vi v9, v8, -9, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v9
+; CHECK-NEXT:    vmsleu.vi v25, v8, -9, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    jalr zero, 0(ra)
 entry:
   %a = call <vscale x 8 x i1> @llvm.riscv.vmsltu.mask.nxv8i8.i8(
@@ -1636,9 +1644,11 @@ entry:
 define <vscale x 16 x i1> @intrinsic_vmsltu_mask_vi_nxv16i8_i8(<vscale x 16 x i1> %0, <vscale x 16 x i8> %1, <vscale x 16 x i1> %2, i32 %3) nounwind {
 ; CHECK-LABEL: intrinsic_vmsltu_mask_vi_nxv16i8_i8:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v25, v0
 ; CHECK-NEXT:    vsetvli a0, a0, e8,m2,tu,mu
-; CHECK-NEXT:    vmsleu.vi v10, v8, -7, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmsleu.vi v25, v8, -7, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    jalr zero, 0(ra)
 entry:
   %a = call <vscale x 16 x i1> @llvm.riscv.vmsltu.mask.nxv16i8.i8(
@@ -1669,9 +1679,11 @@ entry:
 define <vscale x 32 x i1> @intrinsic_vmsltu_mask_vi_nxv32i8_i8(<vscale x 32 x i1> %0, <vscale x 32 x i8> %1, <vscale x 32 x i1> %2, i32 %3) nounwind {
 ; CHECK-LABEL: intrinsic_vmsltu_mask_vi_nxv32i8_i8:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v25, v0
 ; CHECK-NEXT:    vsetvli a0, a0, e8,m4,tu,mu
-; CHECK-NEXT:    vmsleu.vi v12, v8, -5, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmsleu.vi v25, v8, -5, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    jalr zero, 0(ra)
 entry:
   %a = call <vscale x 32 x i1> @llvm.riscv.vmsltu.mask.nxv32i8.i8(
@@ -1702,9 +1714,11 @@ entry:
 define <vscale x 1 x i1> @intrinsic_vmsltu_mask_vi_nxv1i16_i16(<vscale x 1 x i1> %0, <vscale x 1 x i16> %1, <vscale x 1 x i1> %2, i32 %3) nounwind {
 ; CHECK-LABEL: intrinsic_vmsltu_mask_vi_nxv1i16_i16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v25, v0
 ; CHECK-NEXT:    vsetvli a0, a0, e16,mf4,tu,mu
-; CHECK-NEXT:    vmsleu.vi v9, v8, -3, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v9
+; CHECK-NEXT:    vmsleu.vi v25, v8, -3, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    jalr zero, 0(ra)
 entry:
   %a = call <vscale x 1 x i1> @llvm.riscv.vmsltu.mask.nxv1i16.i16(
@@ -1735,9 +1749,11 @@ entry:
 define <vscale x 2 x i1> @intrinsic_vmsltu_mask_vi_nxv2i16_i16(<vscale x 2 x i1> %0, <vscale x 2 x i16> %1, <vscale x 2 x i1> %2, i32 %3) nounwind {
 ; CHECK-LABEL: intrinsic_vmsltu_mask_vi_nxv2i16_i16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v25, v0
 ; CHECK-NEXT:    vsetvli a0, a0, e16,mf2,tu,mu
-; CHECK-NEXT:    vmsne.vv v9, v8, v8, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v9
+; CHECK-NEXT:    vmsne.vv v25, v8, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    jalr zero, 0(ra)
 entry:
   %a = call <vscale x 2 x i1> @llvm.riscv.vmsltu.mask.nxv2i16.i16(
@@ -1768,9 +1784,11 @@ entry:
 define <vscale x 4 x i1> @intrinsic_vmsltu_mask_vi_nxv4i16_i16(<vscale x 4 x i1> %0, <vscale x 4 x i16> %1, <vscale x 4 x i1> %2, i32 %3) nounwind {
 ; CHECK-LABEL: intrinsic_vmsltu_mask_vi_nxv4i16_i16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v25, v0
 ; CHECK-NEXT:    vsetvli a0, a0, e16,m1,tu,mu
-; CHECK-NEXT:    vmsleu.vi v9, v8, 0, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v9
+; CHECK-NEXT:    vmsleu.vi v25, v8, 0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    jalr zero, 0(ra)
 entry:
   %a = call <vscale x 4 x i1> @llvm.riscv.vmsltu.mask.nxv4i16.i16(
@@ -1801,9 +1819,11 @@ entry:
 define <vscale x 8 x i1> @intrinsic_vmsltu_mask_vi_nxv8i16_i16(<vscale x 8 x i1> %0, <vscale x 8 x i16> %1, <vscale x 8 x i1> %2, i32 %3) nounwind {
 ; CHECK-LABEL: intrinsic_vmsltu_mask_vi_nxv8i16_i16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v25, v0
 ; CHECK-NEXT:    vsetvli a0, a0, e16,m2,tu,mu
-; CHECK-NEXT:    vmsleu.vi v10, v8, 2, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmsleu.vi v25, v8, 2, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    jalr zero, 0(ra)
 entry:
   %a = call <vscale x 8 x i1> @llvm.riscv.vmsltu.mask.nxv8i16.i16(
@@ -1834,9 +1854,11 @@ entry:
 define <vscale x 16 x i1> @intrinsic_vmsltu_mask_vi_nxv16i16_i16(<vscale x 16 x i1> %0, <vscale x 16 x i16> %1, <vscale x 16 x i1> %2, i32 %3) nounwind {
 ; CHECK-LABEL: intrinsic_vmsltu_mask_vi_nxv16i16_i16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v25, v0
 ; CHECK-NEXT:    vsetvli a0, a0, e16,m4,tu,mu
-; CHECK-NEXT:    vmsleu.vi v12, v8, 4, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmsleu.vi v25, v8, 4, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    jalr zero, 0(ra)
 entry:
   %a = call <vscale x 16 x i1> @llvm.riscv.vmsltu.mask.nxv16i16.i16(
@@ -1867,9 +1889,11 @@ entry:
 define <vscale x 1 x i1> @intrinsic_vmsltu_mask_vi_nxv1i32_i32(<vscale x 1 x i1> %0, <vscale x 1 x i32> %1, <vscale x 1 x i1> %2, i32 %3) nounwind {
 ; CHECK-LABEL: intrinsic_vmsltu_mask_vi_nxv1i32_i32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v25, v0
 ; CHECK-NEXT:    vsetvli a0, a0, e32,mf2,tu,mu
-; CHECK-NEXT:    vmsleu.vi v9, v8, 6, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v9
+; CHECK-NEXT:    vmsleu.vi v25, v8, 6, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    jalr zero, 0(ra)
 entry:
   %a = call <vscale x 1 x i1> @llvm.riscv.vmsltu.mask.nxv1i32.i32(
@@ -1900,9 +1924,11 @@ entry:
 define <vscale x 2 x i1> @intrinsic_vmsltu_mask_vi_nxv2i32_i32(<vscale x 2 x i1> %0, <vscale x 2 x i32> %1, <vscale x 2 x i1> %2, i32 %3) nounwind {
 ; CHECK-LABEL: intrinsic_vmsltu_mask_vi_nxv2i32_i32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v25, v0
 ; CHECK-NEXT:    vsetvli a0, a0, e32,m1,tu,mu
-; CHECK-NEXT:    vmsleu.vi v9, v8, 8, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v9
+; CHECK-NEXT:    vmsleu.vi v25, v8, 8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    jalr zero, 0(ra)
 entry:
   %a = call <vscale x 2 x i1> @llvm.riscv.vmsltu.mask.nxv2i32.i32(
@@ -1933,9 +1959,11 @@ entry:
 define <vscale x 4 x i1> @intrinsic_vmsltu_mask_vi_nxv4i32_i32(<vscale x 4 x i1> %0, <vscale x 4 x i32> %1, <vscale x 4 x i1> %2, i32 %3) nounwind {
 ; CHECK-LABEL: intrinsic_vmsltu_mask_vi_nxv4i32_i32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v25, v0
 ; CHECK-NEXT:    vsetvli a0, a0, e32,m2,tu,mu
-; CHECK-NEXT:    vmsleu.vi v10, v8, 10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmsleu.vi v25, v8, 10, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    jalr zero, 0(ra)
 entry:
   %a = call <vscale x 4 x i1> @llvm.riscv.vmsltu.mask.nxv4i32.i32(
@@ -1966,9 +1994,11 @@ entry:
 define <vscale x 8 x i1> @intrinsic_vmsltu_mask_vi_nxv8i32_i32(<vscale x 8 x i1> %0, <vscale x 8 x i32> %1, <vscale x 8 x i1> %2, i32 %3) nounwind {
 ; CHECK-LABEL: intrinsic_vmsltu_mask_vi_nxv8i32_i32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v25, v0
 ; CHECK-NEXT:    vsetvli a0, a0, e32,m4,tu,mu
-; CHECK-NEXT:    vmsleu.vi v12, v8, 12, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmsleu.vi v25, v8, 12, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    jalr zero, 0(ra)
 entry:
   %a = call <vscale x 8 x i1> @llvm.riscv.vmsltu.mask.nxv8i32.i32(
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmsltu-rv64.ll b/llvm/test/CodeGen/RISCV/rvv/vmsltu-rv64.ll
index 83dc531750ee..a145dd684bac 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmsltu-rv64.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmsltu-rv64.ll
@@ -1801,9 +1801,11 @@ entry:
 define <vscale x 1 x i1> @intrinsic_vmsltu_mask_vi_nxv1i8_i8(<vscale x 1 x i1> %0, <vscale x 1 x i8> %1, <vscale x 1 x i1> %2, i64 %3) nounwind {
 ; CHECK-LABEL: intrinsic_vmsltu_mask_vi_nxv1i8_i8:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v25, v0
 ; CHECK-NEXT:    vsetvli a0, a0, e8,mf8,tu,mu
-; CHECK-NEXT:    vmsleu.vi v9, v8, -15, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v9
+; CHECK-NEXT:    vmsleu.vi v25, v8, -15, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    jalr zero, 0(ra)
 entry:
   %a = call <vscale x 1 x i1> @llvm.riscv.vmsltu.mask.nxv1i8.i8(
@@ -1834,9 +1836,11 @@ entry:
 define <vscale x 2 x i1> @intrinsic_vmsltu_mask_vi_nxv2i8_i8(<vscale x 2 x i1> %0, <vscale x 2 x i8> %1, <vscale x 2 x i1> %2, i64 %3) nounwind {
 ; CHECK-LABEL: intrinsic_vmsltu_mask_vi_nxv2i8_i8:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v25, v0
 ; CHECK-NEXT:    vsetvli a0, a0, e8,mf4,tu,mu
-; CHECK-NEXT:    vmsleu.vi v9, v8, -13, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v9
+; CHECK-NEXT:    vmsleu.vi v25, v8, -13, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    jalr zero, 0(ra)
 entry:
   %a = call <vscale x 2 x i1> @llvm.riscv.vmsltu.mask.nxv2i8.i8(
@@ -1867,9 +1871,11 @@ entry:
 define <vscale x 4 x i1> @intrinsic_vmsltu_mask_vi_nxv4i8_i8(<vscale x 4 x i1> %0, <vscale x 4 x i8> %1, <vscale x 4 x i1> %2, i64 %3) nounwind {
 ; CHECK-LABEL: intrinsic_vmsltu_mask_vi_nxv4i8_i8:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v25, v0
 ; CHECK-NEXT:    vsetvli a0, a0, e8,mf2,tu,mu
-; CHECK-NEXT:    vmsleu.vi v9, v8, -11, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v9
+; CHECK-NEXT:    vmsleu.vi v25, v8, -11, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    jalr zero, 0(ra)
 entry:
   %a = call <vscale x 4 x i1> @llvm.riscv.vmsltu.mask.nxv4i8.i8(
@@ -1900,9 +1906,11 @@ entry:
 define <vscale x 8 x i1> @intrinsic_vmsltu_mask_vi_nxv8i8_i8(<vscale x 8 x i1> %0, <vscale x 8 x i8> %1, <vscale x 8 x i1> %2, i64 %3) nounwind {
 ; CHECK-LABEL: intrinsic_vmsltu_mask_vi_nxv8i8_i8:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v25, v0
 ; CHECK-NEXT:    vsetvli a0, a0, e8,m1,tu,mu
-; CHECK-NEXT:    vmsleu.vi v9, v8, -9, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v9
+; CHECK-NEXT:    vmsleu.vi v25, v8, -9, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    jalr zero, 0(ra)
 entry:
   %a = call <vscale x 8 x i1> @llvm.riscv.vmsltu.mask.nxv8i8.i8(
@@ -1933,9 +1941,11 @@ entry:
 define <vscale x 16 x i1> @intrinsic_vmsltu_mask_vi_nxv16i8_i8(<vscale x 16 x i1> %0, <vscale x 16 x i8> %1, <vscale x 16 x i1> %2, i64 %3) nounwind {
 ; CHECK-LABEL: intrinsic_vmsltu_mask_vi_nxv16i8_i8:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v25, v0
 ; CHECK-NEXT:    vsetvli a0, a0, e8,m2,tu,mu
-; CHECK-NEXT:    vmsleu.vi v10, v8, -7, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmsleu.vi v25, v8, -7, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    jalr zero, 0(ra)
 entry:
   %a = call <vscale x 16 x i1> @llvm.riscv.vmsltu.mask.nxv16i8.i8(
@@ -1966,9 +1976,11 @@ entry:
 define <vscale x 32 x i1> @intrinsic_vmsltu_mask_vi_nxv32i8_i8(<vscale x 32 x i1> %0, <vscale x 32 x i8> %1, <vscale x 32 x i1> %2, i64 %3) nounwind {
 ; CHECK-LABEL: intrinsic_vmsltu_mask_vi_nxv32i8_i8:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v25, v0
 ; CHECK-NEXT:    vsetvli a0, a0, e8,m4,tu,mu
-; CHECK-NEXT:    vmsleu.vi v12, v8, -5, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmsleu.vi v25, v8, -5, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    jalr zero, 0(ra)
 entry:
   %a = call <vscale x 32 x i1> @llvm.riscv.vmsltu.mask.nxv32i8.i8(
@@ -1999,9 +2011,11 @@ entry:
 define <vscale x 1 x i1> @intrinsic_vmsltu_mask_vi_nxv1i16_i16(<vscale x 1 x i1> %0, <vscale x 1 x i16> %1, <vscale x 1 x i1> %2, i64 %3) nounwind {
 ; CHECK-LABEL: intrinsic_vmsltu_mask_vi_nxv1i16_i16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v25, v0
 ; CHECK-NEXT:    vsetvli a0, a0, e16,mf4,tu,mu
-; CHECK-NEXT:    vmsleu.vi v9, v8, -3, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v9
+; CHECK-NEXT:    vmsleu.vi v25, v8, -3, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    jalr zero, 0(ra)
 entry:
   %a = call <vscale x 1 x i1> @llvm.riscv.vmsltu.mask.nxv1i16.i16(
@@ -2032,9 +2046,11 @@ entry:
 define <vscale x 2 x i1> @intrinsic_vmsltu_mask_vi_nxv2i16_i16(<vscale x 2 x i1> %0, <vscale x 2 x i16> %1, <vscale x 2 x i1> %2, i64 %3) nounwind {
 ; CHECK-LABEL: intrinsic_vmsltu_mask_vi_nxv2i16_i16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v25, v0
 ; CHECK-NEXT:    vsetvli a0, a0, e16,mf2,tu,mu
-; CHECK-NEXT:    vmsne.vv v9, v8, v8, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v9
+; CHECK-NEXT:    vmsne.vv v25, v8, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    jalr zero, 0(ra)
 entry:
   %a = call <vscale x 2 x i1> @llvm.riscv.vmsltu.mask.nxv2i16.i16(
@@ -2065,9 +2081,11 @@ entry:
 define <vscale x 4 x i1> @intrinsic_vmsltu_mask_vi_nxv4i16_i16(<vscale x 4 x i1> %0, <vscale x 4 x i16> %1, <vscale x 4 x i1> %2, i64 %3) nounwind {
 ; CHECK-LABEL: intrinsic_vmsltu_mask_vi_nxv4i16_i16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v25, v0
 ; CHECK-NEXT:    vsetvli a0, a0, e16,m1,tu,mu
-; CHECK-NEXT:    vmsleu.vi v9, v8, 0, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v9
+; CHECK-NEXT:    vmsleu.vi v25, v8, 0, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    jalr zero, 0(ra)
 entry:
   %a = call <vscale x 4 x i1> @llvm.riscv.vmsltu.mask.nxv4i16.i16(
@@ -2098,9 +2116,11 @@ entry:
 define <vscale x 8 x i1> @intrinsic_vmsltu_mask_vi_nxv8i16_i16(<vscale x 8 x i1> %0, <vscale x 8 x i16> %1, <vscale x 8 x i1> %2, i64 %3) nounwind {
 ; CHECK-LABEL: intrinsic_vmsltu_mask_vi_nxv8i16_i16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v25, v0
 ; CHECK-NEXT:    vsetvli a0, a0, e16,m2,tu,mu
-; CHECK-NEXT:    vmsleu.vi v10, v8, 2, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmsleu.vi v25, v8, 2, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    jalr zero, 0(ra)
 entry:
   %a = call <vscale x 8 x i1> @llvm.riscv.vmsltu.mask.nxv8i16.i16(
@@ -2131,9 +2151,11 @@ entry:
 define <vscale x 16 x i1> @intrinsic_vmsltu_mask_vi_nxv16i16_i16(<vscale x 16 x i1> %0, <vscale x 16 x i16> %1, <vscale x 16 x i1> %2, i64 %3) nounwind {
 ; CHECK-LABEL: intrinsic_vmsltu_mask_vi_nxv16i16_i16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v25, v0
 ; CHECK-NEXT:    vsetvli a0, a0, e16,m4,tu,mu
-; CHECK-NEXT:    vmsleu.vi v12, v8, 4, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmsleu.vi v25, v8, 4, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    jalr zero, 0(ra)
 entry:
   %a = call <vscale x 16 x i1> @llvm.riscv.vmsltu.mask.nxv16i16.i16(
@@ -2164,9 +2186,11 @@ entry:
 define <vscale x 1 x i1> @intrinsic_vmsltu_mask_vi_nxv1i32_i32(<vscale x 1 x i1> %0, <vscale x 1 x i32> %1, <vscale x 1 x i1> %2, i64 %3) nounwind {
 ; CHECK-LABEL: intrinsic_vmsltu_mask_vi_nxv1i32_i32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v25, v0
 ; CHECK-NEXT:    vsetvli a0, a0, e32,mf2,tu,mu
-; CHECK-NEXT:    vmsleu.vi v9, v8, 6, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v9
+; CHECK-NEXT:    vmsleu.vi v25, v8, 6, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    jalr zero, 0(ra)
 entry:
   %a = call <vscale x 1 x i1> @llvm.riscv.vmsltu.mask.nxv1i32.i32(
@@ -2197,9 +2221,11 @@ entry:
 define <vscale x 2 x i1> @intrinsic_vmsltu_mask_vi_nxv2i32_i32(<vscale x 2 x i1> %0, <vscale x 2 x i32> %1, <vscale x 2 x i1> %2, i64 %3) nounwind {
 ; CHECK-LABEL: intrinsic_vmsltu_mask_vi_nxv2i32_i32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v25, v0
 ; CHECK-NEXT:    vsetvli a0, a0, e32,m1,tu,mu
-; CHECK-NEXT:    vmsleu.vi v9, v8, 8, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v9
+; CHECK-NEXT:    vmsleu.vi v25, v8, 8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    jalr zero, 0(ra)
 entry:
   %a = call <vscale x 2 x i1> @llvm.riscv.vmsltu.mask.nxv2i32.i32(
@@ -2230,9 +2256,11 @@ entry:
 define <vscale x 4 x i1> @intrinsic_vmsltu_mask_vi_nxv4i32_i32(<vscale x 4 x i1> %0, <vscale x 4 x i32> %1, <vscale x 4 x i1> %2, i64 %3) nounwind {
 ; CHECK-LABEL: intrinsic_vmsltu_mask_vi_nxv4i32_i32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v25, v0
 ; CHECK-NEXT:    vsetvli a0, a0, e32,m2,tu,mu
-; CHECK-NEXT:    vmsleu.vi v10, v8, 10, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmsleu.vi v25, v8, 10, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    jalr zero, 0(ra)
 entry:
   %a = call <vscale x 4 x i1> @llvm.riscv.vmsltu.mask.nxv4i32.i32(
@@ -2263,9 +2291,11 @@ entry:
 define <vscale x 8 x i1> @intrinsic_vmsltu_mask_vi_nxv8i32_i32(<vscale x 8 x i1> %0, <vscale x 8 x i32> %1, <vscale x 8 x i1> %2, i64 %3) nounwind {
 ; CHECK-LABEL: intrinsic_vmsltu_mask_vi_nxv8i32_i32:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v25, v0
 ; CHECK-NEXT:    vsetvli a0, a0, e32,m4,tu,mu
-; CHECK-NEXT:    vmsleu.vi v12, v8, 12, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmsleu.vi v25, v8, 12, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    jalr zero, 0(ra)
 entry:
   %a = call <vscale x 8 x i1> @llvm.riscv.vmsltu.mask.nxv8i32.i32(
@@ -2296,9 +2326,11 @@ entry:
 define <vscale x 1 x i1> @intrinsic_vmsltu_mask_vi_nxv1i64_i64(<vscale x 1 x i1> %0, <vscale x 1 x i64> %1, <vscale x 1 x i1> %2, i64 %3) nounwind {
 ; CHECK-LABEL: intrinsic_vmsltu_mask_vi_nxv1i64_i64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v25, v0
 ; CHECK-NEXT:    vsetvli a0, a0, e64,m1,tu,mu
-; CHECK-NEXT:    vmsleu.vi v9, v8, 14, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v9
+; CHECK-NEXT:    vmsleu.vi v25, v8, 14, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    jalr zero, 0(ra)
 entry:
   %a = call <vscale x 1 x i1> @llvm.riscv.vmsltu.mask.nxv1i64.i64(
@@ -2329,9 +2361,11 @@ entry:
 define <vscale x 2 x i1> @intrinsic_vmsltu_mask_vi_nxv2i64_i64(<vscale x 2 x i1> %0, <vscale x 2 x i64> %1, <vscale x 2 x i1> %2, i64 %3) nounwind {
 ; CHECK-LABEL: intrinsic_vmsltu_mask_vi_nxv2i64_i64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v25, v0
 ; CHECK-NEXT:    vsetvli a0, a0, e64,m2,tu,mu
-; CHECK-NEXT:    vmsleu.vi v10, v8, -16, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmsleu.vi v25, v8, -16, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    jalr zero, 0(ra)
 entry:
   %a = call <vscale x 2 x i1> @llvm.riscv.vmsltu.mask.nxv2i64.i64(
@@ -2362,9 +2396,11 @@ entry:
 define <vscale x 4 x i1> @intrinsic_vmsltu_mask_vi_nxv4i64_i64(<vscale x 4 x i1> %0, <vscale x 4 x i64> %1, <vscale x 4 x i1> %2, i64 %3) nounwind {
 ; CHECK-LABEL: intrinsic_vmsltu_mask_vi_nxv4i64_i64:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmv1r.v v25, v0
 ; CHECK-NEXT:    vsetvli a0, a0, e64,m4,tu,mu
-; CHECK-NEXT:    vmsleu.vi v12, v8, -14, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmsleu.vi v25, v8, -14, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    jalr zero, 0(ra)
 entry:
   %a = call <vscale x 4 x i1> @llvm.riscv.vmsltu.mask.nxv4i64.i64(

From e0fe1c58acfa0bde36afde8354cb31fc1e0b75e2 Mon Sep 17 00:00:00 2001
From: Luke Drummond <luke.drummond@codeplay.com>
Date: Wed, 10 Mar 2021 18:14:42 +0000
Subject: [PATCH 223/318] [OpenCL] Respect calling convention for builtin

`__translate_sampler_initializer` has a calling convention of
`spir_func`, but clang generated calls to it using the default CC.

Instruction Combining was lowering these mismatching calling conventions
to `store i1* undef` which itself was subsequently lowered to a trap
instruction by simplifyCFG resulting in runtime `SIGILL`

There are arguably two bugs here: but whether there's any wisdom in
converting an obviously invalid call into a runtime crash over aborting
with a sensible error message will require further discussion. So for
now it's enough to set the right calling convention on the runtime
helper.

Reviewed By: svenh, bader

Differential Revision: https://reviews.llvm.org/D98411

(cherry picked from commit fcfd3fda71905d7c48f75a531c2265ad3b9876ea)
---
 clang/lib/CodeGen/CodeGenModule.cpp | 12 +++++++-----
 clang/test/CodeGenOpenCL/sampler.cl | 12 ++++++------
 2 files changed, 13 insertions(+), 11 deletions(-)

diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
index 31afbc6b4262..9c9bd4e374af 100644
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -6215,15 +6215,17 @@ llvm::SanitizerStatReport &CodeGenModule::getSanStats() {
 
   return *SanStats;
 }
+
 llvm::Value *
 CodeGenModule::createOpenCLIntToSamplerConversion(const Expr *E,
                                                   CodeGenFunction &CGF) {
   llvm::Constant *C = ConstantEmitter(CGF).emitAbstract(E, E->getType());
-  auto SamplerT = getOpenCLRuntime().getSamplerType(E->getType().getTypePtr());
-  auto FTy = llvm::FunctionType::get(SamplerT, {C->getType()}, false);
-  return CGF.Builder.CreateCall(CreateRuntimeFunction(FTy,
-                                "__translate_sampler_initializer"),
-                                {C});
+  auto *SamplerT = getOpenCLRuntime().getSamplerType(E->getType().getTypePtr());
+  auto *FTy = llvm::FunctionType::get(SamplerT, {C->getType()}, false);
+  auto *Call = CGF.Builder.CreateCall(
+      CreateRuntimeFunction(FTy, "__translate_sampler_initializer"), {C});
+  Call->setCallingConv(Call->getCalledFunction()->getCallingConv());
+  return Call;
 }
 
 CharUnits CodeGenModule::getNaturalPointeeTypeAlignment(
diff --git a/clang/test/CodeGenOpenCL/sampler.cl b/clang/test/CodeGenOpenCL/sampler.cl
index e6bda49f51c8..5ad8d0dbbf37 100644
--- a/clang/test/CodeGenOpenCL/sampler.cl
+++ b/clang/test/CodeGenOpenCL/sampler.cl
@@ -39,7 +39,7 @@ kernel void foo(sampler_t smp_par) {
   // Case 2b
   sampler_t smp = CLK_ADDRESS_CLAMP_TO_EDGE | CLK_NORMALIZED_COORDS_TRUE | CLK_FILTER_NEAREST;
   // CHECK: [[smp_ptr:%[A-Za-z0-9_\.]+]] = alloca %opencl.sampler_t addrspace(2)*
-  // CHECK: [[SAMP:%[0-9]+]] = call %opencl.sampler_t addrspace(2)* @__translate_sampler_initializer(i32 19)
+  // CHECK: [[SAMP:%[0-9]+]] = call spir_func %opencl.sampler_t addrspace(2)* @__translate_sampler_initializer(i32 19)
   // CHECK: store %opencl.sampler_t addrspace(2)* [[SAMP]], %opencl.sampler_t addrspace(2)** [[smp_ptr]]
 
   // Case 1b
@@ -56,12 +56,12 @@ kernel void foo(sampler_t smp_par) {
 
   // Case 1a/2a
   fnc4smp(glb_smp);
-  // CHECK: [[SAMP:%[0-9]+]] = call %opencl.sampler_t addrspace(2)* @__translate_sampler_initializer(i32 35)
+  // CHECK: [[SAMP:%[0-9]+]] = call spir_func %opencl.sampler_t addrspace(2)* @__translate_sampler_initializer(i32 35)
   // CHECK: call spir_func void [[FUNCNAME]](%opencl.sampler_t addrspace(2)* [[SAMP]])
 
   // Case 1a/2c
   fnc4smp(glb_smp_const);
-  // CHECK: [[SAMP:%[0-9]+]] = call %opencl.sampler_t addrspace(2)* @__translate_sampler_initializer(i32 35)
+  // CHECK: [[SAMP:%[0-9]+]] = call spir_func %opencl.sampler_t addrspace(2)* @__translate_sampler_initializer(i32 35)
   // CHECK: call spir_func void [[FUNCNAME]](%opencl.sampler_t addrspace(2)* [[SAMP]])
 
   // Case 1c
@@ -70,12 +70,12 @@ kernel void foo(sampler_t smp_par) {
   // CHECK: call spir_func void [[FUNCNAME]](%opencl.sampler_t addrspace(2)* [[SAMP]])
 
   fnc4smp(5);
-  // CHECK: [[SAMP:%[0-9]+]] = call %opencl.sampler_t addrspace(2)* @__translate_sampler_initializer(i32 5)
+  // CHECK: [[SAMP:%[0-9]+]] = call spir_func %opencl.sampler_t addrspace(2)* @__translate_sampler_initializer(i32 5)
   // CHECK: call spir_func void [[FUNCNAME]](%opencl.sampler_t addrspace(2)* [[SAMP]])
 
   const sampler_t const_smp = CLK_ADDRESS_CLAMP_TO_EDGE | CLK_NORMALIZED_COORDS_TRUE | CLK_FILTER_LINEAR;
   fnc4smp(const_smp);
-   // CHECK: [[CONST_SAMP:%[0-9]+]] = call %opencl.sampler_t addrspace(2)* @__translate_sampler_initializer(i32 35)
+   // CHECK: [[CONST_SAMP:%[0-9]+]] = call spir_func %opencl.sampler_t addrspace(2)* @__translate_sampler_initializer(i32 35)
   // CHECK: store %opencl.sampler_t addrspace(2)* [[CONST_SAMP]], %opencl.sampler_t addrspace(2)** [[CONST_SMP_PTR:%[a-zA-Z0-9]+]]
   fnc4smp(const_smp);
   // CHECK: [[SAMP:%[0-9]+]] = load %opencl.sampler_t addrspace(2)*, %opencl.sampler_t addrspace(2)** [[CONST_SMP_PTR]]
@@ -83,7 +83,7 @@ kernel void foo(sampler_t smp_par) {
 
   constant sampler_t constant_smp = CLK_ADDRESS_CLAMP_TO_EDGE | CLK_NORMALIZED_COORDS_TRUE | CLK_FILTER_LINEAR;
   fnc4smp(constant_smp);
-  // CHECK: [[SAMP:%[0-9]+]] = call %opencl.sampler_t addrspace(2)* @__translate_sampler_initializer(i32 35)
+  // CHECK: [[SAMP:%[0-9]+]] = call spir_func %opencl.sampler_t addrspace(2)* @__translate_sampler_initializer(i32 35)
   // CHECK: call spir_func void [[FUNCNAME]](%opencl.sampler_t addrspace(2)* [[SAMP]])
 
   // TODO: enable sampler initialization with non-constant integer.

From a5a6cfe2f030e81e689ed9af4e95ddf95c4d8675 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Thu, 4 Mar 2021 12:58:22 -0800
Subject: [PATCH 224/318] BPF: permit type modifiers for
 __builtin_btf_type_id() relocation

Lorenz Bauer from Cloudflare tried to use "const struct <name>"
as the type for __builtin_btf_type_id(*(const struct <name>)0, 1)
relocation and hit a llvm BPF fatal error.
   https://lore.kernel.org/bpf/a3782f71-3f6b-1e75-17a9-1827822c2030@fb.com/

   ...
   fatal error: error in backend: Empty type name for BTF_TYPE_ID_REMOTE reloc

Currently, we require the debuginfo type itself must have a name.
In this case, the debuginfo type is "const" which points to "struct <name>".
The "const" type does not have a name, hence the above fatal error
will be triggered.

Let us permit "const" and "volatile" type modifiers. We skip modifiers
in some other cases as well like structure member type tracing.
This can aviod the above fatal error.

Differential Revision: https://reviews.llvm.org/D97986

(cherry picked from commit 9c0274cdeae904089806be6faee72b9126d2cf5b)
---
 llvm/lib/Target/BPF/BPFPreserveDIType.cpp     |  9 +++
 .../CodeGen/BPF/BTF/builtin-btf-type-id-2.ll  | 73 +++++++++++++++++++
 2 files changed, 82 insertions(+)
 create mode 100644 llvm/test/CodeGen/BPF/BTF/builtin-btf-type-id-2.ll

diff --git a/llvm/lib/Target/BPF/BPFPreserveDIType.cpp b/llvm/lib/Target/BPF/BPFPreserveDIType.cpp
index 18a4f60c171a..0348e2200acb 100644
--- a/llvm/lib/Target/BPF/BPFPreserveDIType.cpp
+++ b/llvm/lib/Target/BPF/BPFPreserveDIType.cpp
@@ -85,8 +85,17 @@ static bool BPFPreserveDITypeImpl(Function &F) {
     } else {
       Reloc = BPFCoreSharedInfo::BTF_TYPE_ID_REMOTE;
       DIType *Ty = cast<DIType>(MD);
+      while (auto *DTy = dyn_cast<DIDerivedType>(Ty)) {
+        unsigned Tag = DTy->getTag();
+        if (Tag != dwarf::DW_TAG_const_type &&
+            Tag != dwarf::DW_TAG_volatile_type)
+          break;
+        Ty = DTy->getBaseType();
+      }
+
       if (Ty->getName().empty())
         report_fatal_error("Empty type name for BTF_TYPE_ID_REMOTE reloc");
+      MD = Ty;
     }
 
     BasicBlock *BB = Call->getParent();
diff --git a/llvm/test/CodeGen/BPF/BTF/builtin-btf-type-id-2.ll b/llvm/test/CodeGen/BPF/BTF/builtin-btf-type-id-2.ll
new file mode 100644
index 000000000000..63c56c4dfec5
--- /dev/null
+++ b/llvm/test/CodeGen/BPF/BTF/builtin-btf-type-id-2.ll
@@ -0,0 +1,73 @@
+; RUN: opt -O2 -mtriple=bpf-pc-linux -S -o %t1 %s
+; RUN: llc -filetype=asm -o - %t1 | FileCheck -check-prefixes=CHECK %s
+; RUN: llc -mattr=+alu32 -filetype=asm -o - %t1 | FileCheck -check-prefixes=CHECK %s
+; Source code:
+;   struct s {
+;     int a;
+;   };
+;   int test(void) {
+;     return __builtin_btf_type_id(*(const struct s *)0, 1);
+;   }
+; Compilation flag:
+;   clang -target bpf -O2 -g -S -emit-llvm -Xclang -disable-llvm-passes test.c
+
+; Function Attrs: nounwind
+define dso_local i32 @test() #0 !dbg !7 {
+entry:
+  %0 = call i64 @llvm.bpf.btf.type.id(i32 0, i64 1), !dbg !11, !llvm.preserve.access.index !12
+  %conv = trunc i64 %0 to i32, !dbg !11
+  ret i32 %conv, !dbg !16
+}
+
+; CHECK:             .long   1                               # BTF_KIND_INT(id = 2)
+; CHECK-NEXT:        .long   16777216                        # 0x1000000
+; CHECK-NEXT:        .long   4
+; CHECK-NEXT:        .long   16777248                        # 0x1000020
+
+; CHECK:             .long   16                              # BTF_KIND_STRUCT(id = 4)
+; CHECK-NEXT:        .long   67108865                        # 0x4000001
+; CHECK-NEXT:        .long   4
+; CHECK-NEXT:        .long   18
+; CHECK-NEXT:        .long   2
+
+; CHECK:             .ascii  "int"                           # string offset=1
+; CHECK:             .ascii  ".text"                         # string offset=10
+; CHECK:             .byte   115                             # string offset=16
+; CHECK:             .byte   97                              # string offset=18
+; CHECK:             .byte   48                              # string offset=20
+
+; CHECK:             .long   16                              # FieldReloc
+; CHECK-NEXT:        .long   10                              # Field reloc section string offset=10
+; CHECK-NEXT:        .long   1
+; CHECK-NEXT:        .long   .Ltmp{{[0-9]+}}
+; CHECK-NEXT:        .long   4
+; CHECK-NEXT:        .long   20
+; CHECK-NEXT:        .long   7
+
+; Function Attrs: nounwind readnone
+declare i64 @llvm.bpf.btf.type.id(i32, i64) #1
+
+attributes #0 = { nounwind "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
+attributes #1 = { nounwind readnone }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4, !5}
+!llvm.ident = !{!6}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 13.0.0 (https://github.com/llvm/llvm-project.git 9783e2098800b954c55ae598a1ce5c4b93444fc0)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, splitDebugInlining: false, nameTableKind: None)
+!1 = !DIFile(filename: "test.c", directory: "/tmp/home/yhs/bpf/test")
+!2 = !{}
+!3 = !{i32 7, !"Dwarf Version", i32 4}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = !{i32 1, !"wchar_size", i32 4}
+!6 = !{!"clang version 13.0.0 (https://github.com/llvm/llvm-project.git 9783e2098800b954c55ae598a1ce5c4b93444fc0)"}
+!7 = distinct !DISubprogram(name: "test", scope: !1, file: !1, line: 4, type: !8, scopeLine: 4, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2)
+!8 = !DISubroutineType(types: !9)
+!9 = !{!10}
+!10 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!11 = !DILocation(line: 5, column: 10, scope: !7)
+!12 = !DIDerivedType(tag: DW_TAG_const_type, baseType: !13)
+!13 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "s", file: !1, line: 1, size: 32, elements: !14)
+!14 = !{!15}
+!15 = !DIDerivedType(tag: DW_TAG_member, name: "a", scope: !13, file: !1, line: 2, baseType: !10, size: 32)
+!16 = !DILocation(line: 5, column: 3, scope: !7)

From 6564e0cf7e61518cb15443fca42bc2206a6123e2 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Tue, 2 Mar 2021 09:35:21 -0800
Subject: [PATCH 225/318] BPF: Fix a bug in peephole TRUNC elimination
 optimization

Andrei Matei reported a llvm11 core dump for his bpf program
   https://bugs.llvm.org/show_bug.cgi?id=48578
The core dump happens in LiveVariables analysis phase.
  #4 0x00007fce54356bb0 __restore_rt
  #5 0x00007fce4d51785e llvm::LiveVariables::HandleVirtRegUse(unsigned int,
      llvm::MachineBasicBlock*, llvm::MachineInstr&)
  #6 0x00007fce4d519abe llvm::LiveVariables::runOnInstr(llvm::MachineInstr&,
      llvm::SmallVectorImpl<unsigned int>&)
  #7 0x00007fce4d519ec6 llvm::LiveVariables::runOnBlock(llvm::MachineBasicBlock*, unsigned int)
  #8 0x00007fce4d51a4bf llvm::LiveVariables::runOnMachineFunction(llvm::MachineFunction&)
The bug can be reproduced with llvm12 and latest trunk as well.

Futher analysis shows that there is a bug in BPF peephole
TRUNC elimination optimization, which tries to remove
unnecessary TRUNC operations (a <<= 32; a >>= 32).
Specifically, the compiler did wrong transformation for the
following patterns:
   %1 = LDW ...
   %2 = SLL_ri %1, 32
   %3 = SRL_ri %2, 32
   ... %3 ...
   %4 = SRA_ri %2, 32
   ... %4 ...

The current transformation did not check how many uses of %2
and did transformation like
   %1 = LDW ...
   ... %1 ...
   %4 = SRL_ri %2, 32
   ... %4 ...
and pseudo register %2 is used by not defined and
caused LiveVariables analysis core dump.

To fix the issue, when traversing back from SRL_ri to SLL_ri,
check to ensure SLL_ri has only one use. Otherwise, don't
do transformation.

Differential Revision: https://reviews.llvm.org/D97792

(cherry picked from commit 51cdb780db3b9b46c783efcec672c4da272e9992)
---
 llvm/lib/Target/BPF/BPFMIPeephole.cpp      |  3 ++
 llvm/test/CodeGen/BPF/remove_truncate_8.ll | 41 ++++++++++++++++++++++
 2 files changed, 44 insertions(+)
 create mode 100644 llvm/test/CodeGen/BPF/remove_truncate_8.ll

diff --git a/llvm/lib/Target/BPF/BPFMIPeephole.cpp b/llvm/lib/Target/BPF/BPFMIPeephole.cpp
index df870314fffe..354980e4bf3c 100644
--- a/llvm/lib/Target/BPF/BPFMIPeephole.cpp
+++ b/llvm/lib/Target/BPF/BPFMIPeephole.cpp
@@ -475,6 +475,9 @@ bool BPFMIPeepholeTruncElim::eliminateTruncSeq(void) {
       if (MI.getOpcode() == BPF::SRL_ri &&
           MI.getOperand(2).getImm() == 32) {
         SrcReg = MI.getOperand(1).getReg();
+        if (!MRI->hasOneNonDBGUse(SrcReg))
+          continue;
+
         MI2 = MRI->getVRegDef(SrcReg);
         DstReg = MI.getOperand(0).getReg();
 
diff --git a/llvm/test/CodeGen/BPF/remove_truncate_8.ll b/llvm/test/CodeGen/BPF/remove_truncate_8.ll
new file mode 100644
index 000000000000..fb1eabb0f0fd
--- /dev/null
+++ b/llvm/test/CodeGen/BPF/remove_truncate_8.ll
@@ -0,0 +1,41 @@
+; RUN: llc < %s -march=bpf -verify-machineinstrs | FileCheck %s
+; Source Code:
+;   struct loc_prog {
+;     unsigned int ip;
+;     int len;
+;   };
+;   int exec_prog(struct loc_prog *prog) {
+;     if (prog->ip < prog->len) {
+;       int x = prog->ip;
+;       if (x < 3)
+;         prog->ip += 2;
+;     }
+;     return 3;
+;   }
+; Compilation flag:
+;   clang -target bpf -O2 -S -emit-llvm t.c
+
+%struct.loc_prog = type { i32, i32 }
+
+; Function Attrs: nofree norecurse nounwind willreturn
+define dso_local i32 @exec_prog(%struct.loc_prog* nocapture %prog) local_unnamed_addr {
+entry:
+  %ip = getelementptr inbounds %struct.loc_prog, %struct.loc_prog* %prog, i64 0, i32 0
+  %0 = load i32, i32* %ip, align 4
+  %len = getelementptr inbounds %struct.loc_prog, %struct.loc_prog* %prog, i64 0, i32 1
+  %1 = load i32, i32* %len, align 4
+  %cmp = icmp ult i32 %0, %1
+  %cmp2 = icmp slt i32 %0, 3
+  %or.cond = and i1 %cmp2, %cmp
+; CHECK: r{{[0-9]+}} <<= 32
+; CHECK: r{{[0-9]+}} s>>= 32
+  br i1 %or.cond, label %if.then3, label %if.end5
+
+if.then3:                                         ; preds = %entry
+  %add = add nsw i32 %0, 2
+  store i32 %add, i32* %ip, align 4
+  br label %if.end5
+
+if.end5:                                          ; preds = %if.then3, %entry
+  ret i32 3
+}

From e294ece42d85191875782ed05cb607451f493944 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Sun, 28 Feb 2021 22:46:39 -0800
Subject: [PATCH 226/318] BPF: fix enum value 0 issue for
 __builtin_preserve_enum_value()

Lorenz Bauer reported that the following code will have
compilation error for bpf target:
    enum e { TWO };
    bpf_core_enum_value_exists(enum e, TWO);
The clang emitted the following error message:
    __builtin_preserve_enum_value argument 1 invalid

In SemaChecking, an expression like "*(enum NAME)1" will have
cast kind CK_IntegralToPointer, but "*(enum NAME)0" will have
cast kind CK_NullToPointer. Current implementation only permits
CK_IntegralToPointer, missing enum value 0 case.

This patch permits CK_NullToPointer cast kind and
the above test case can pass now.

Differential Revision: https://reviews.llvm.org/D97659

(cherry picked from commit 283db5f0837d55f91242812003adf6e189ba743e)
---
 clang/lib/Sema/SemaChecking.cpp                      |  5 ++++-
 .../CodeGen/builtins-bpf-preserve-field-info-4.c     | 12 +++++++++++-
 2 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index 2d3d36f4adad..2b55712d44c2 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -2623,7 +2623,10 @@ static bool isValidBPFPreserveEnumValueArg(Expr *Arg) {
     return false;
 
   const auto *CE = dyn_cast<CStyleCastExpr>(UO->getSubExpr());
-  if (!CE || CE->getCastKind() != CK_IntegralToPointer)
+  if (!CE)
+    return false;
+  if (CE->getCastKind() != CK_IntegralToPointer &&
+      CE->getCastKind() != CK_NullToPointer)
     return false;
 
   // The integer must be from an EnumConstantDecl.
diff --git a/clang/test/CodeGen/builtins-bpf-preserve-field-info-4.c b/clang/test/CodeGen/builtins-bpf-preserve-field-info-4.c
index e07c680bb370..b167b776e385 100644
--- a/clang/test/CodeGen/builtins-bpf-preserve-field-info-4.c
+++ b/clang/test/CodeGen/builtins-bpf-preserve-field-info-4.c
@@ -4,10 +4,11 @@
 #define _(x, y) (__builtin_preserve_enum_value((x), (y)))
 
 enum AA {
+  VAL0 = 0,
   VAL1 = 2,
   VAL2 = 0xffffffff80000000UL,
 };
-typedef enum { VAL10 = -2, VAL11 = 0xffff8000, }  __BB;
+typedef enum { VAL00, VAL10 = -2, VAL11 = 0xffff8000, }  __BB;
 
 unsigned unit1() {
   return _(*(enum AA *)VAL1, 0) + _(*(__BB *)VAL10, 1);
@@ -17,10 +18,16 @@ unsigned unit2() {
   return _(*(enum AA *)VAL2, 0) + _(*(__BB *)VAL11, 1);
 }
 
+unsigned unit3() {
+  return _(*(enum AA *)VAL0, 0) + _(*(__BB *)VAL00, 1);
+}
+
 // CHECK: @0 = private unnamed_addr constant [7 x i8] c"VAL1:2\00", align 1
 // CHECK: @1 = private unnamed_addr constant [9 x i8] c"VAL10:-2\00", align 1
 // CHECK: @2 = private unnamed_addr constant [17 x i8] c"VAL2:-2147483648\00", align 1
 // CHECK: @3 = private unnamed_addr constant [17 x i8] c"VAL11:4294934528\00", align 1
+// CHECK: @4 = private unnamed_addr constant [7 x i8] c"VAL0:0\00", align 1
+// CHECK: @5 = private unnamed_addr constant [8 x i8] c"VAL00:0\00", align 1
 
 // CHECK: call i64 @llvm.bpf.preserve.enum.value(i32 0, i8* getelementptr inbounds ([7 x i8], [7 x i8]* @0, i32 0, i32 0), i64 0), !dbg !{{[0-9]+}}, !llvm.preserve.access.index ![[ENUM_AA:[0-9]+]]
 // CHECK: call i64 @llvm.bpf.preserve.enum.value(i32 1, i8* getelementptr inbounds ([9 x i8], [9 x i8]* @1, i32 0, i32 0), i64 1), !dbg !{{[0-9]+}}, !llvm.preserve.access.index ![[TYPEDEF_ENUM:[0-9]+]]
@@ -28,5 +35,8 @@ unsigned unit2() {
 // CHECK: call i64 @llvm.bpf.preserve.enum.value(i32 2, i8* getelementptr inbounds ([17 x i8], [17 x i8]* @2, i32 0, i32 0), i64 0), !dbg !{{[0-9]+}}, !llvm.preserve.access.index ![[ENUM_AA]]
 // CHECK: call i64 @llvm.bpf.preserve.enum.value(i32 3, i8* getelementptr inbounds ([17 x i8], [17 x i8]* @3, i32 0, i32 0), i64 1), !dbg !{{[0-9]+}}, !llvm.preserve.access.index ![[TYPEDEF_ENUM]]
 
+// CHECK: call i64 @llvm.bpf.preserve.enum.value(i32 4, i8* getelementptr inbounds ([7 x i8], [7 x i8]* @4, i32 0, i32 0), i64 0), !dbg !{{[0-9]+}}, !llvm.preserve.access.index ![[ENUM_AA]]
+// CHECK: call i64 @llvm.bpf.preserve.enum.value(i32 5, i8* getelementptr inbounds ([8 x i8], [8 x i8]* @5, i32 0, i32 0), i64 1), !dbg !{{[0-9]+}}, !llvm.preserve.access.index ![[TYPEDEF_ENUM]]
+
 // CHECK: ![[ENUM_AA]] = !DICompositeType(tag: DW_TAG_enumeration_type, name: "AA"
 // CHECK: ![[TYPEDEF_ENUM]] = !DIDerivedType(tag: DW_TAG_typedef, name: "__BB"

From 6baa5ce2e4b70424981ba5632d10681d41f57cfc Mon Sep 17 00:00:00 2001
From: Brad Smith <brad@comstyle.com>
Date: Wed, 5 May 2021 00:55:36 -0400
Subject: [PATCH 227/318] Fix typo, arvm7 -> armv7

(cherry picked from commit 3a62d4fde88544125ce9ceff990db108ee91148a)
---
 lldb/docs/man/lldb.rst       | 4 ++--
 lldb/tools/driver/Driver.cpp | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/lldb/docs/man/lldb.rst b/lldb/docs/man/lldb.rst
index 6dca15fa35dc..b75288db380d 100644
--- a/lldb/docs/man/lldb.rst
+++ b/lldb/docs/man/lldb.rst
@@ -256,11 +256,11 @@ executable. To disambiguate between arguments passed to lldb and arguments
 passed to the debugged executable, arguments starting with a - must be passed
 after --.
 
-  lldb --arch x86_64 /path/to/program program argument -- --arch arvm7
+  lldb --arch x86_64 /path/to/program program argument -- --arch armv7
 
 For convenience, passing the executable after -- is also supported.
 
-  lldb --arch x86_64 -- /path/to/program program argument --arch arvm7
+  lldb --arch x86_64 -- /path/to/program program argument --arch armv7
 
 Passing one of the attach options causes :program:`lldb` to immediately attach
 to the given process.
diff --git a/lldb/tools/driver/Driver.cpp b/lldb/tools/driver/Driver.cpp
index e4a60127b65e..210a712f9741 100644
--- a/lldb/tools/driver/Driver.cpp
+++ b/lldb/tools/driver/Driver.cpp
@@ -751,11 +751,11 @@ static void printHelp(LLDBOptTable &table, llvm::StringRef tool_name) {
   arguments passed to the debugged executable, arguments starting with a - must
   be passed after --.
 
-    lldb --arch x86_64 /path/to/program program argument -- --arch arvm7
+    lldb --arch x86_64 /path/to/program program argument -- --arch armv7
 
   For convenience, passing the executable after -- is also supported.
 
-    lldb --arch x86_64 -- /path/to/program program argument --arch arvm7
+    lldb --arch x86_64 -- /path/to/program program argument --arch armv7
 
   Passing one of the attach options causes lldb to immediately attach to the
   given process.

From 471a386a3d348e933d200e1cc01413aa655d508e Mon Sep 17 00:00:00 2001
From: Tim Northover <t.p.northover@gmail.com>
Date: Wed, 17 Mar 2021 10:48:20 +0000
Subject: [PATCH 228/318] StackProtector: ensure protection does not interfere
 with tail call frame.

The IR stack protector pass must insert stack checks before the call instead of
between it and the return.

Similarly, SDAG one should recognize that ADJCALLFRAME instructions could be
part of the terminal sequence of a tail call. In this case because such call
frames cannot be nested in LLVM the stack protection code must skip over the
whole sequence (or risk clobbering argument registers).

(cherry picked from commit 5e3d9fcc3a8802cea5b850a3ca40c515d916bf82)
---
 .../CodeGen/SelectionDAG/SelectionDAGISel.cpp | 33 ++++++++--
 llvm/lib/CodeGen/StackProtector.cpp           | 24 +++++--
 .../AArch64/stack-protector-musttail.ll       | 66 +++++++++++++++++++
 .../ARM/Windows/stack-protector-musttail.ll   | 56 ++++++++++++++++
 llvm/test/CodeGen/X86/tailcc-ssp.ll           | 26 ++++++++
 5 files changed, 197 insertions(+), 8 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/stack-protector-musttail.ll
 create mode 100644 llvm/test/CodeGen/ARM/Windows/stack-protector-musttail.ll
 create mode 100644 llvm/test/CodeGen/X86/tailcc-ssp.ll

diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
index 7bae5048fc0e..d17dd1c5eccb 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
@@ -1691,9 +1691,9 @@ static bool MIIsInTerminatorSequence(const MachineInstr &MI) {
 /// terminator, but additionally the copies that move the vregs into the
 /// physical registers.
 static MachineBasicBlock::iterator
-FindSplitPointForStackProtector(MachineBasicBlock *BB) {
+FindSplitPointForStackProtector(MachineBasicBlock *BB,
+                                const TargetInstrInfo &TII) {
   MachineBasicBlock::iterator SplitPoint = BB->getFirstTerminator();
-  //
   if (SplitPoint == BB->begin())
     return SplitPoint;
 
@@ -1701,6 +1701,31 @@ FindSplitPointForStackProtector(MachineBasicBlock *BB) {
   MachineBasicBlock::iterator Previous = SplitPoint;
   --Previous;
 
+  if (TII.isTailCall(*SplitPoint) &&
+      Previous->getOpcode() == TII.getCallFrameDestroyOpcode()) {
+    // call itself, then we must insert before the sequence even starts. For
+    // example:
+    //     <split point>
+    //     ADJCALLSTACKDOWN ...
+    //     <Moves>
+    //     ADJCALLSTACKUP ...
+    //     TAILJMP somewhere
+    // On the other hand, it could be an unrelated call in which case this tail call
+    // has to register moves of its own and should be the split point. For example:
+    //     ADJCALLSTACKDOWN
+    //     CALL something_else
+    //     ADJCALLSTACKUP
+    //     <split point>
+    //     TAILJMP somewhere
+    do {
+      --Previous;
+      if (Previous->isCall())
+        return SplitPoint;
+    } while(Previous->getOpcode() != TII.getCallFrameSetupOpcode());
+
+    return Previous;
+  }
+
   while (MIIsInTerminatorSequence(*Previous)) {
     SplitPoint = Previous;
     if (Previous == Start)
@@ -1740,7 +1765,7 @@ SelectionDAGISel::FinishBasicBlock() {
     // Add load and check to the basicblock.
     FuncInfo->MBB = ParentMBB;
     FuncInfo->InsertPt =
-        FindSplitPointForStackProtector(ParentMBB);
+        FindSplitPointForStackProtector(ParentMBB, *TII);
     SDB->visitSPDescriptorParent(SDB->SPDescriptor, ParentMBB);
     CurDAG->setRoot(SDB->getRoot());
     SDB->clear();
@@ -1759,7 +1784,7 @@ SelectionDAGISel::FinishBasicBlock() {
     // register allocation issues caused by us splitting the parent mbb. The
     // register allocator will clean up said virtual copies later on.
     MachineBasicBlock::iterator SplitPoint =
-        FindSplitPointForStackProtector(ParentMBB);
+        FindSplitPointForStackProtector(ParentMBB, *TII);
 
     // Splice the terminator of ParentMBB into SuccessMBB.
     SuccessMBB->splice(SuccessMBB->end(), ParentMBB,
diff --git a/llvm/lib/CodeGen/StackProtector.cpp b/llvm/lib/CodeGen/StackProtector.cpp
index 8d91afb6e99d..10c6dcbdb049 100644
--- a/llvm/lib/CodeGen/StackProtector.cpp
+++ b/llvm/lib/CodeGen/StackProtector.cpp
@@ -470,21 +470,36 @@ bool StackProtector::InsertStackProtectors() {
     // instrumentation has already been generated.
     HasIRCheck = true;
 
+    // If we're instrumenting a block with a musttail call, the check has to be
+    // inserted before the call rather than between it and the return. The
+    // verifier guarantees that a musttail call is either directly before the
+    // return or with a single correct bitcast of the return value in between so
+    // we don't need to worry about many situations here.
+    Instruction *CheckLoc = RI;
+    Instruction *Prev = RI->getPrevNonDebugInstruction();
+    if (Prev && isa<CallInst>(Prev) && cast<CallInst>(Prev)->isMustTailCall())
+      CheckLoc = Prev;
+    else if (Prev) {
+      Prev = Prev->getPrevNonDebugInstruction();
+      if (Prev && isa<CallInst>(Prev) && cast<CallInst>(Prev)->isMustTailCall())
+        CheckLoc = Prev;
+    }
+
     // Generate epilogue instrumentation. The epilogue intrumentation can be
     // function-based or inlined depending on which mechanism the target is
     // providing.
     if (Function *GuardCheck = TLI->getSSPStackGuardCheck(*M)) {
       // Generate the function-based epilogue instrumentation.
       // The target provides a guard check function, generate a call to it.
-      IRBuilder<> B(RI);
+      IRBuilder<> B(CheckLoc);
       LoadInst *Guard = B.CreateLoad(B.getInt8PtrTy(), AI, true, "Guard");
       CallInst *Call = B.CreateCall(GuardCheck, {Guard});
       Call->setAttributes(GuardCheck->getAttributes());
       Call->setCallingConv(GuardCheck->getCallingConv());
     } else {
       // Generate the epilogue with inline instrumentation.
-      // If we do not support SelectionDAG based tail calls, generate IR level
-      // tail calls.
+      // If we do not support SelectionDAG based calls, generate IR level
+      // calls.
       //
       // For each block with a return instruction, convert this:
       //
@@ -514,7 +529,8 @@ bool StackProtector::InsertStackProtectors() {
       BasicBlock *FailBB = CreateFailBB();
 
       // Split the basic block before the return instruction.
-      BasicBlock *NewBB = BB->splitBasicBlock(RI->getIterator(), "SP_return");
+      BasicBlock *NewBB =
+          BB->splitBasicBlock(CheckLoc->getIterator(), "SP_return");
 
       // Update the dominator tree if we need to.
       if (DT && DT->isReachableFromEntry(BB)) {
diff --git a/llvm/test/CodeGen/AArch64/stack-protector-musttail.ll b/llvm/test/CodeGen/AArch64/stack-protector-musttail.ll
new file mode 100644
index 000000000000..8a2e095e6a64
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/stack-protector-musttail.ll
@@ -0,0 +1,66 @@
+; RUN: llc -mtriple=arm64-apple-macosx -fast-isel %s -o - -start-before=stack-protector -stop-after=stack-protector  | FileCheck %s
+
+@var = global [2 x i64]* null
+
+declare void @callee()
+
+define void @caller1() ssp {
+; CHECK-LABEL: define void @caller1()
+; Prologue:
+; CHECK: @llvm.stackguard
+
+; CHECK: [[GUARD:%.*]] = call i8* @llvm.stackguard()
+; CHECK: [[TOKEN:%.*]] = load volatile i8*, i8** {{%.*}}
+; CHECK: [[TST:%.*]] = icmp eq i8* [[GUARD]], [[TOKEN]]
+; CHECK: br i1 [[TST]]
+
+; CHECK: musttail call void @callee()
+; CHECK-NEXT: ret void
+  %var = alloca [2 x i64]
+  store [2 x i64]* %var, [2 x i64]** @var
+  musttail call void @callee()
+  ret void
+}
+
+define void @justret() ssp {
+; CHECK-LABEL: define void @justret()
+; Prologue:
+; CHECK: @llvm.stackguard
+
+; CHECK: [[GUARD:%.*]] = call i8* @llvm.stackguard()
+; CHECK: [[TOKEN:%.*]] = load volatile i8*, i8** {{%.*}}
+; CHECK: [[TST:%.*]] = icmp eq i8* [[GUARD]], [[TOKEN]]
+; CHECK: br i1 [[TST]]
+
+; CHECK: ret void
+  %var = alloca [2 x i64]
+  store [2 x i64]* %var, [2 x i64]** @var
+  br label %retblock
+
+retblock:
+  ret void
+}
+
+
+declare i64* @callee2()
+
+define i8* @caller2() ssp {
+; CHECK-LABEL: define i8* @caller2()
+; Prologue:
+; CHECK: @llvm.stackguard
+
+; CHECK: [[GUARD:%.*]] = call i8* @llvm.stackguard()
+; CHECK: [[TOKEN:%.*]] = load volatile i8*, i8** {{%.*}}
+; CHECK: [[TST:%.*]] = icmp eq i8* [[GUARD]], [[TOKEN]]
+; CHECK: br i1 [[TST]]
+
+; CHECK: [[TMP:%.*]] = musttail call i64* @callee2()
+; CHECK-NEXT: [[RES:%.*]] = bitcast i64* [[TMP]] to i8*
+; CHECK-NEXT: ret i8* [[RES]]
+
+  %var = alloca [2 x i64]
+  store [2 x i64]* %var, [2 x i64]** @var
+  %tmp = musttail call i64* @callee2()
+  %res = bitcast i64* %tmp to i8*
+  ret i8* %res
+}
diff --git a/llvm/test/CodeGen/ARM/Windows/stack-protector-musttail.ll b/llvm/test/CodeGen/ARM/Windows/stack-protector-musttail.ll
new file mode 100644
index 000000000000..93b601c9369f
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/Windows/stack-protector-musttail.ll
@@ -0,0 +1,56 @@
+; RUN: llc -mtriple=thumbv7-windows-msvc -fast-isel %s -o - -start-before=stack-protector -stop-after=stack-protector  | FileCheck %s
+
+@var = global [2 x i64]* null
+
+declare void @callee()
+
+define void @caller1() sspreq {
+; CHECK-LABEL: define void @caller1()
+; Prologue:
+
+; CHECK: call void @__security_check_cookie
+
+; CHECK: musttail call void @callee()
+; CHECK-NEXT: ret void
+  %var = alloca [2 x i64]
+  store [2 x i64]* %var, [2 x i64]** @var
+  musttail call void @callee()
+  ret void
+}
+
+define void @justret() sspreq {
+; CHECK-LABEL: define void @justret()
+; Prologue:
+; CHECK: @llvm.stackguard
+
+; CHECK: call void @__security_check_cookie
+
+; CHECK: ret void
+  %var = alloca [2 x i64]
+  store [2 x i64]* %var, [2 x i64]** @var
+  br label %retblock
+
+retblock:
+  ret void
+}
+
+
+declare i64* @callee2()
+
+define i8* @caller2() sspreq {
+; CHECK-LABEL: define i8* @caller2()
+; Prologue:
+; CHECK: @llvm.stackguard
+
+; CHECK: call void @__security_check_cookie
+
+; CHECK: [[TMP:%.*]] = musttail call i64* @callee2()
+; CHECK-NEXT: [[RES:%.*]] = bitcast i64* [[TMP]] to i8*
+; CHECK-NEXT: ret i8* [[RES]]
+
+  %var = alloca [2 x i64]
+  store [2 x i64]* %var, [2 x i64]** @var
+  %tmp = musttail call i64* @callee2()
+  %res = bitcast i64* %tmp to i8*
+  ret i8* %res
+}
diff --git a/llvm/test/CodeGen/X86/tailcc-ssp.ll b/llvm/test/CodeGen/X86/tailcc-ssp.ll
new file mode 100644
index 000000000000..b85be6a5e790
--- /dev/null
+++ b/llvm/test/CodeGen/X86/tailcc-ssp.ll
@@ -0,0 +1,26 @@
+; RUN: llc -mtriple=x86_64-windows-msvc %s -o - -verify-machineinstrs | FileCheck %s
+
+declare void @h(i8*, i64, i8*)
+
+define tailcc void @tailcall_frame(i8* %0, i64 %1) sspreq {
+; CHECK-LABEL: tailcall_frame:
+; CHECK: callq __security_check_cookie
+; CHECK: xorl %ecx, %ecx
+; CHECK: jmp h
+
+   tail call tailcc void @h(i8* null, i64 0, i8* null)
+   ret void
+}
+
+declare void @bar()
+define void @tailcall_unrelated_frame() sspreq {
+; CHECK-LABEL: tailcall_unrelated_frame:
+; CHECK: subq [[STACK:\$.*]], %rsp
+; CHECK: callq bar
+; CHECK: callq __security_check_cookie
+; CHECK: addq [[STACK]], %rsp
+; CHECK: jmp bar
+  call void @bar()
+  tail call void @bar()
+  ret void
+}

From ac593de16cc5282630ce44dd8378ae5b7b91644c Mon Sep 17 00:00:00 2001
From: KAWASHIMA Takahiro <t-kawashima@fujitsu.com>
Date: Thu, 8 Apr 2021 12:33:25 +0900
Subject: [PATCH 229/318] [LoopReroll] Fix rerolling loop with extra
 instructions

Fixes PR47627

This fix suppresses rerolling a loop which has an unrerollable
instruction.

Sample IR for the explanation below:

```
define void @foo([2 x i32]* nocapture %a) {
entry:
  br label %loop

loop:
  ; base instruction
  %indvar = phi i64 [ 0, %entry ], [ %indvar.next, %loop ]

  ; unrerollable instructions
  %stptrx = getelementptr inbounds [2 x i32], [2 x i32]* %a, i64 %indvar, i64 0
  store i32 999, i32* %stptrx, align 4

  ; extra simple arithmetic operations, used by root instructions
  %plus20 = add nuw nsw i64 %indvar, 20
  %plus10 = add nuw nsw i64 %indvar, 10

  ; root instruction 0
  %ldptr0 = getelementptr inbounds [2 x i32], [2 x i32]* %a, i64 %plus20, i64 0
  %value0 = load i32, i32* %ldptr0, align 4
  %stptr0 = getelementptr inbounds [2 x i32], [2 x i32]* %a, i64 %plus10, i64 0
  store i32 %value0, i32* %stptr0, align 4

  ; root instruction 1
  %ldptr1 = getelementptr inbounds [2 x i32], [2 x i32]* %a, i64 %plus20, i64 1
  %value1 = load i32, i32* %ldptr1, align 4
  %stptr1 = getelementptr inbounds [2 x i32], [2 x i32]* %a, i64 %plus10, i64 1
  store i32 %value1, i32* %stptr1, align 4

  ; loop-increment and latch
  %indvar.next = add nuw nsw i64 %indvar, 1
  %exitcond = icmp eq i64 %indvar.next, 5
  br i1 %exitcond, label %exit, label %loop

exit:
  ret void
}
```

In the loop rerolling pass, `%indvar` and `%indvar.next` are appended
to the `LoopIncs` vector in the `LoopReroll::DAGRootTracker::findRoots`
function.

Before this fix, two instructions with `unrerollable instructions`
comment above are marked as `IL_All` at the end of the
`LoopReroll::DAGRootTracker::collectUsedInstructions` function,
as well as instructions with `extra simple arithmetic operations`
comment and `loop-increment and latch` comment. It is incorrect
because `IL_All` means that the instruction should be executed in all
iterations of the rerolled loop but the `store` instruction should
not.

This fix rejects instructions which may have side effects and don't
belong to def-use chains of any root instructions and reductions.

See https://bugs.llvm.org/show_bug.cgi?id=47627 for more information.

(cherry picked from commit d9a9c992d190dd6645ea911b66cf0cadba0dadc3)
---
 llvm/lib/Transforms/Scalar/LoopRerollPass.cpp |   6 +
 .../test/Transforms/LoopReroll/extra_instr.ll | 268 ++++++++++++++++++
 2 files changed, 274 insertions(+)
 create mode 100644 llvm/test/Transforms/LoopReroll/extra_instr.ll

diff --git a/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp b/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp
index b3bae47e96de..65a6205f0302 100644
--- a/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp
@@ -1081,6 +1081,12 @@ bool LoopReroll::DAGRootTracker::collectUsedInstructions(SmallInstructionSet &Po
   DenseSet<Instruction*> V;
   collectInLoopUserSet(LoopIncs, Exclude, PossibleRedSet, V);
   for (auto *I : V) {
+    if (I->mayHaveSideEffects()) {
+      LLVM_DEBUG(dbgs() << "LRR: Aborting - "
+                        << "An instruction which does not belong to any root "
+                        << "sets must not have side effects: " << *I);
+      return false;
+    }
     Uses[I].set(IL_All);
   }
 
diff --git a/llvm/test/Transforms/LoopReroll/extra_instr.ll b/llvm/test/Transforms/LoopReroll/extra_instr.ll
new file mode 100644
index 000000000000..aae29079ade7
--- /dev/null
+++ b/llvm/test/Transforms/LoopReroll/extra_instr.ll
@@ -0,0 +1,268 @@
+; RUN: opt -S  -loop-reroll   %s | FileCheck %s
+target triple = "aarch64--linux-gnu"
+
+define void @rerollable1([2 x i32]* nocapture %a) {
+entry:
+  br label %loop
+
+loop:
+
+; CHECK-LABEL: loop:
+; CHECK-NEXT:    %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+; CHECK-NEXT:    [[SCEVGEP1:%.*]] = getelementptr [2 x i32], [2 x i32]* %a, i64 20, i64 %iv
+; CHECK-NEXT:    [[SCEVGEP2:%.*]] = getelementptr [2 x i32], [2 x i32]* %a, i64 10, i64 %iv
+; CHECK-NEXT:    [[VALUE:%.*]] = load i32, i32* [[SCEVGEP1]], align 4
+; CHECK-NEXT:    store i32 [[VALUE]], i32* [[SCEVGEP2]], align 4
+
+  ; base instruction
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+
+  ; NO unrerollable instructions
+
+  ; extra simple arithmetic operations, used by root instructions
+  %plus20 = add nuw nsw i64 %iv, 20
+  %plus10 = add nuw nsw i64 %iv, 10
+
+  ; root instruction 0
+  %ldptr0 = getelementptr inbounds [2 x i32], [2 x i32]* %a, i64 %plus20, i64 0
+  %value0 = load i32, i32* %ldptr0, align 4
+  %stptr0 = getelementptr inbounds [2 x i32], [2 x i32]* %a, i64 %plus10, i64 0
+  store i32 %value0, i32* %stptr0, align 4
+
+  ; root instruction 1
+  %ldptr1 = getelementptr inbounds [2 x i32], [2 x i32]* %a, i64 %plus20, i64 1
+  %value1 = load i32, i32* %ldptr1, align 4
+  %stptr1 = getelementptr inbounds [2 x i32], [2 x i32]* %a, i64 %plus10, i64 1
+  store i32 %value1, i32* %stptr1, align 4
+
+  ; loop-increment
+  %iv.next = add nuw nsw i64 %iv, 1
+
+  ; latch
+  %exitcond = icmp eq i64 %iv.next, 5
+  br i1 %exitcond, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+define void @unrerollable1([2 x i32]* nocapture %a) {
+entry:
+  br label %loop
+
+loop:
+
+; CHECK-LABEL: loop:
+; CHECK-NEXT:    %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+; CHECK-NEXT:    %stptrx = getelementptr inbounds [2 x i32], [2 x i32]* %a, i64 %iv, i64 0
+; CHECK-NEXT:    store i32 999, i32* %stptrx, align 4
+
+  ; base instruction
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+
+  ; unrerollable instructions using %iv
+  %stptrx = getelementptr inbounds [2 x i32], [2 x i32]* %a, i64 %iv, i64 0
+  store i32 999, i32* %stptrx, align 4
+
+  ; extra simple arithmetic operations, used by root instructions
+  %plus20 = add nuw nsw i64 %iv, 20
+  %plus10 = add nuw nsw i64 %iv, 10
+
+  ; root instruction 0
+  %ldptr0 = getelementptr inbounds [2 x i32], [2 x i32]* %a, i64 %plus20, i64 0
+  %value0 = load i32, i32* %ldptr0, align 4
+  %stptr0 = getelementptr inbounds [2 x i32], [2 x i32]* %a, i64 %plus10, i64 0
+  store i32 %value0, i32* %stptr0, align 4
+
+  ; root instruction 1
+  %ldptr1 = getelementptr inbounds [2 x i32], [2 x i32]* %a, i64 %plus20, i64 1
+  %value1 = load i32, i32* %ldptr1, align 4
+  %stptr1 = getelementptr inbounds [2 x i32], [2 x i32]* %a, i64 %plus10, i64 1
+  store i32 %value1, i32* %stptr1, align 4
+
+  ; loop-increment
+  %iv.next = add nuw nsw i64 %iv, 1
+
+  ; latch
+  %exitcond = icmp eq i64 %iv.next, 5
+  br i1 %exitcond, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+define void @unrerollable2([2 x i32]* nocapture %a) {
+entry:
+  br label %loop
+
+loop:
+
+; CHECK-LABEL: loop:
+; CHECK-NEXT:    %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+; CHECK-NEXT:    %iv.next = add nuw nsw i64 %iv, 1
+; CHECK-NEXT:    %stptrx = getelementptr inbounds [2 x i32], [2 x i32]* %a, i64 %iv.next, i64 0
+; CHECK-NEXT:    store i32 999, i32* %stptrx, align 4
+
+  ; base instruction
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+
+  ; loop-increment
+  %iv.next = add nuw nsw i64 %iv, 1
+
+  ; unrerollable instructions using %iv.next
+  %stptrx = getelementptr inbounds [2 x i32], [2 x i32]* %a, i64 %iv.next, i64 0
+  store i32 999, i32* %stptrx, align 4
+
+  ; extra simple arithmetic operations, used by root instructions
+  %plus20 = add nuw nsw i64 %iv, 20
+  %plus10 = add nuw nsw i64 %iv, 10
+
+  ; root instruction 0
+  %ldptr0 = getelementptr inbounds [2 x i32], [2 x i32]* %a, i64 %plus20, i64 0
+  %value0 = load i32, i32* %ldptr0, align 4
+  %stptr0 = getelementptr inbounds [2 x i32], [2 x i32]* %a, i64 %plus10, i64 0
+  store i32 %value0, i32* %stptr0, align 4
+
+  ; root instruction 1
+  %ldptr1 = getelementptr inbounds [2 x i32], [2 x i32]* %a, i64 %plus20, i64 1
+  %value1 = load i32, i32* %ldptr1, align 4
+  %stptr1 = getelementptr inbounds [2 x i32], [2 x i32]* %a, i64 %plus10, i64 1
+  store i32 %value1, i32* %stptr1, align 4
+
+  ; latch
+  %exitcond = icmp eq i64 %iv.next, 5
+  br i1 %exitcond, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+define dso_local void @rerollable2() {
+entry:
+  br label %loop
+
+loop:
+
+; CHECK-LABEL: loop:
+; CHECK-NEXT:    %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
+; CHECK-NEXT:    {{%.*}} = add i32 %iv, {{20|24}}
+; CHECK-NEXT:    {{%.*}} = add i32 %iv, {{20|24}}
+
+  ; induction variable
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
+
+  ; scale instruction
+  %iv.mul3 = mul nuw nsw i32 %iv, 3
+
+  ; extra simple arithmetic operations, used by root instructions
+  %iv.scaled = add nuw nsw i32 %iv.mul3, 20
+
+  ; NO unrerollable instructions
+
+  ; root set 1
+
+  ; base instruction
+  %iv.scaled.div5 = udiv i32 %iv.scaled, 5
+  tail call void @bar(i32 %iv.scaled.div5)
+  ; root instruction 0
+  %iv.scaled.add1 = add nuw nsw i32 %iv.scaled, 1
+  %iv.scaled.add1.div5 = udiv i32 %iv.scaled.add1, 5
+  tail call void @bar(i32 %iv.scaled.add1.div5)
+  ; root instruction 2
+  %iv.scaled.add2 = add nuw nsw i32 %iv.scaled, 2
+  %iv.scaled.add2.div5 = udiv i32 %iv.scaled.add2, 5
+  tail call void @bar(i32 %iv.scaled.add2.div5)
+
+  ; root set 2
+
+  ; base instruction
+  %iv.scaled.add4 = add nuw nsw i32 %iv.scaled, 4
+  %iv.scaled.add4.div5 = udiv i32 %iv.scaled.add4, 5
+  tail call void @bar(i32 %iv.scaled.add4.div5)
+  ; root instruction 0
+  %iv.scaled.add5 = add nuw nsw i32 %iv.scaled, 5
+  %iv.scaled.add5.div5 = udiv i32 %iv.scaled.add5, 5
+  tail call void @bar(i32 %iv.scaled.add5.div5)
+  ; root instruction 2
+  %iv.scaled.add6 = add nuw nsw i32 %iv.scaled, 6
+  %iv.scaled.add6.div5 = udiv i32 %iv.scaled.add6, 5
+  tail call void @bar(i32 %iv.scaled.add6.div5)
+
+  ; loop-increment
+  %iv.next = add nuw nsw i32 %iv, 1
+
+  ; latch
+  %cmp = icmp ult i32 %iv.next, 3
+  br i1 %cmp, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+define dso_local void @unrerollable3() {
+entry:
+  br label %loop
+
+loop:
+
+; CHECK-LABEL: loop:
+; CHECK-NEXT:    %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
+; CHECK-NEXT:    %iv.mul3 = mul nuw nsw i32 %iv, 3
+; CHECK-NEXT:    %iv.scaled = add nuw nsw i32 %iv.mul3, 20
+; CHECK-NEXT:    %iv.mul7 = mul nuw nsw i32 %iv, 7
+; CHECK-NEXT:    tail call void @bar(i32 %iv.mul7)
+
+  ; induction variable
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
+
+  ; scale instruction
+  %iv.mul3 = mul nuw nsw i32 %iv, 3
+
+  ; extra simple arithmetic operations, used by root instructions
+  %iv.scaled = add nuw nsw i32 %iv.mul3, 20
+
+  ; unrerollable instructions using %iv
+  %iv.mul7 = mul nuw nsw i32 %iv, 7
+  tail call void @bar(i32 %iv.mul7)
+
+  ; root set 1
+
+  ; base instruction
+  %iv.scaled.div5 = udiv i32 %iv.scaled, 5
+  tail call void @bar(i32 %iv.scaled.div5)
+  ; root instruction 0
+  %iv.scaled.add1 = add nuw nsw i32 %iv.scaled, 1
+  %iv.scaled.add1.div5 = udiv i32 %iv.scaled.add1, 5
+  tail call void @bar(i32 %iv.scaled.add1.div5)
+  ; root instruction 2
+  %iv.scaled.add2 = add nuw nsw i32 %iv.scaled, 2
+  %iv.scaled.add2.div5 = udiv i32 %iv.scaled.add2, 5
+  tail call void @bar(i32 %iv.scaled.add2.div5)
+
+  ; root set 2
+
+  ; base instruction
+  %iv.scaled.add4 = add nuw nsw i32 %iv.scaled, 4
+  %iv.scaled.add4.div5 = udiv i32 %iv.scaled.add4, 5
+  tail call void @bar(i32 %iv.scaled.add4.div5)
+  ; root instruction 0
+  %iv.scaled.add5 = add nuw nsw i32 %iv.scaled, 5
+  %iv.scaled.add5.div5 = udiv i32 %iv.scaled.add5, 5
+  tail call void @bar(i32 %iv.scaled.add5.div5)
+  ; root instruction 2
+  %iv.scaled.add6 = add nuw nsw i32 %iv.scaled, 6
+  %iv.scaled.add6.div5 = udiv i32 %iv.scaled.add6, 5
+  tail call void @bar(i32 %iv.scaled.add6.div5)
+
+  ; loop-increment
+  %iv.next = add nuw nsw i32 %iv, 1
+
+  ; latch
+  %cmp = icmp ult i32 %iv.next, 3
+  br i1 %cmp, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+declare dso_local void @bar(i32)

From 225b775620c69fa87c5fd63b29ef8f08c9922fb4 Mon Sep 17 00:00:00 2001
From: Ahsan Saghir <saghir@ca.ibm.com>
Date: Fri, 23 Apr 2021 08:26:45 -0500
Subject: [PATCH 230/318] [PowerPC] Prevent argument promotion of types with
 size greater than 128 bits

This patch prevents argument promotion of types having
type size greater than 128 bits.

Fixes Bugzilla: https://bugs.llvm.org/show_bug.cgi?id=49952

Reviewed By: #powerpc, nemanjai

Differential Revision: https://reviews.llvm.org/D101188

(cherry picked from commit 670736a904746e92dde141266b6d4881b56d51a2)
---
 .../Target/PowerPC/PPCTargetTransformInfo.cpp |  21 ++++
 .../Target/PowerPC/PPCTargetTransformInfo.h   |   3 +
 llvm/test/CodeGen/PowerPC/arg_promotion.ll    | 108 ++++++++++++++++++
 3 files changed, 132 insertions(+)
 create mode 100644 llvm/test/CodeGen/PowerPC/arg_promotion.ll

diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
index b3d8100fe016..c90ff8b7d59d 100644
--- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -1212,6 +1212,27 @@ unsigned PPCTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
   return BaseT::getIntrinsicInstrCost(ICA, CostKind);
 }
 
+bool PPCTTIImpl::areFunctionArgsABICompatible(
+    const Function *Caller, const Function *Callee,
+    SmallPtrSetImpl<Argument *> &Args) const {
+
+  // We need to ensure that argument promotion does not
+  // attempt to promote pointers to MMA types (__vector_pair
+  // and __vector_quad) since these types explicitly cannot be
+  // passed as arguments. Both of these types are larger than
+  // the 128-bit Altivec vectors and have a scalar size of 1 bit.
+  if (!BaseT::areFunctionArgsABICompatible(Caller, Callee, Args))
+    return false;
+
+  return llvm::none_of(Args, [](Argument *A) {
+    auto *EltTy = cast<PointerType>(A->getType())->getElementType();
+    if (EltTy->isSized())
+      return (EltTy->isIntOrIntVectorTy(1) &&
+              EltTy->getPrimitiveSizeInBits() > 128);
+    return false;
+  });
+}
+
 bool PPCTTIImpl::canSaveCmp(Loop *L, BranchInst **BI, ScalarEvolution *SE,
                             LoopInfo *LI, DominatorTree *DT,
                             AssumptionCache *AC, TargetLibraryInfo *LibInfo) {
diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
index bc946715156f..c38ae90bc7dc 100644
--- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
+++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
@@ -129,6 +129,9 @@ class PPCTTIImpl : public BasicTTIImplBase<PPCTTIImpl> {
   unsigned getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                  TTI::TargetCostKind CostKind);
 
+  bool areFunctionArgsABICompatible(const Function *Caller,
+                                    const Function *Callee,
+                                    SmallPtrSetImpl<Argument *> &Args) const;
   /// @}
 };
 
diff --git a/llvm/test/CodeGen/PowerPC/arg_promotion.ll b/llvm/test/CodeGen/PowerPC/arg_promotion.ll
new file mode 100644
index 000000000000..e52d2e47201f
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/arg_promotion.ll
@@ -0,0 +1,108 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -argpromotion -mtriple=powerpc64le-unknown-linux-gnu < %s | FileCheck %s
+; RUN: opt -S -passes=argpromotion -mtriple=powerpc64le-unknown-linux-gnu < %s | FileCheck %s
+
+; Test to check that we do not promote arguments when the
+; type size is greater than 128 bits.
+
+define internal fastcc void @print_acc(<512 x i1>* nocapture readonly %a) nounwind {
+; CHECK-LABEL: @print_acc(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load <512 x i1>, <512 x i1>* [[A:%.*]], align 64
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.acc(<512 x i1> [[TMP0]])
+; CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP1]], 0
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = load <512 x i1>, <512 x i1>* %a, align 64
+  %1 = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.acc(<512 x i1> %0)
+  %2 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %1, 0
+  ret void
+}
+
+declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.acc(<512 x i1>) nounwind
+
+define dso_local void @test(<512 x i1>* nocapture %a, <16 x i8> %ac) {
+; CHECK-LABEL: @test(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvf32ger(<16 x i8> [[AC:%.*]], <16 x i8> [[AC]])
+; CHECK-NEXT:    store <512 x i1> [[TMP0]], <512 x i1>* [[A:%.*]], align 64
+; CHECK-NEXT:    tail call fastcc void @print_acc(<512 x i1>* nonnull [[A]])
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = tail call <512 x i1> @llvm.ppc.mma.xvf32ger(<16 x i8> %ac, <16 x i8> %ac)
+  store <512 x i1> %0, <512 x i1>* %a, align 64
+  tail call fastcc void @print_acc(<512 x i1>* nonnull %a)
+  ret void
+}
+
+declare <512 x i1> @llvm.ppc.mma.xvf32ger(<16 x i8>, <16 x i8>) nounwind
+
+@.str = private unnamed_addr constant [11 x i8] c"Vector: { \00", align 1
+@.str.1 = private unnamed_addr constant [5 x i8] c"%d, \00", align 1
+@.str.2 = private unnamed_addr constant [6 x i8] c"%d }\0A\00", align 1
+
+define internal fastcc void @printWideVec(<16 x i32> %ptr.val) nounwind {
+; CHECK-LABEL: @printWideVec(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CALL:%.*]] = tail call signext i32 (i8*, ...) @printf(i8* nonnull dereferenceable(1) getelementptr inbounds ([11 x i8], [11 x i8]* @.str, i64 0, i64 0))
+; CHECK-NEXT:    [[VECEXT:%.*]] = extractelement <16 x i32> [[PTR_VAL:%.*]], i32 0
+; CHECK-NEXT:    [[CALL1:%.*]] = tail call signext i32 (i8*, ...) @printf(i8* nonnull dereferenceable(1) getelementptr inbounds ([5 x i8], [5 x i8]* @.str.1, i64 0, i64 0), i32 signext [[VECEXT]])
+; CHECK-NEXT:    [[VECEXT_1:%.*]] = extractelement <16 x i32> [[PTR_VAL]], i32 1
+; CHECK-NEXT:    [[CALL1_1:%.*]] = tail call signext i32 (i8*, ...) @printf(i8* nonnull dereferenceable(1) getelementptr inbounds ([5 x i8], [5 x i8]* @.str.1, i64 0, i64 0), i32 signext [[VECEXT_1]])
+; CHECK-NEXT:    [[VECEXT_2:%.*]] = extractelement <16 x i32> [[PTR_VAL]], i32 2
+; CHECK-NEXT:    [[CALL1_2:%.*]] = tail call signext i32 (i8*, ...) @printf(i8* nonnull dereferenceable(1) getelementptr inbounds ([5 x i8], [5 x i8]* @.str.1, i64 0, i64 0), i32 signext [[VECEXT_2]])
+; CHECK-NEXT:    [[VECEXT_3:%.*]] = extractelement <16 x i32> [[PTR_VAL]], i32 3
+; CHECK-NEXT:    [[CALL1_3:%.*]] = tail call signext i32 (i8*, ...) @printf(i8* nonnull dereferenceable(1) getelementptr inbounds ([5 x i8], [5 x i8]* @.str.1, i64 0, i64 0), i32 signext [[VECEXT_3]])
+; CHECK-NEXT:    [[VECEXT_4:%.*]] = extractelement <16 x i32> [[PTR_VAL]], i32 4
+; CHECK-NEXT:    [[CALL1_4:%.*]] = tail call signext i32 (i8*, ...) @printf(i8* nonnull dereferenceable(1) getelementptr inbounds ([5 x i8], [5 x i8]* @.str.1, i64 0, i64 0), i32 signext [[VECEXT_4]])
+; CHECK-NEXT:    [[VECEXT_5:%.*]] = extractelement <16 x i32> [[PTR_VAL]], i32 5
+; CHECK-NEXT:    [[CALL1_5:%.*]] = tail call signext i32 (i8*, ...) @printf(i8* nonnull dereferenceable(1) getelementptr inbounds ([5 x i8], [5 x i8]* @.str.1, i64 0, i64 0), i32 signext [[VECEXT_5]])
+; CHECK-NEXT:    [[VECEXT_6:%.*]] = extractelement <16 x i32> [[PTR_VAL]], i32 6
+; CHECK-NEXT:    [[CALL1_6:%.*]] = tail call signext i32 (i8*, ...) @printf(i8* nonnull dereferenceable(1) getelementptr inbounds ([5 x i8], [5 x i8]* @.str.1, i64 0, i64 0), i32 signext [[VECEXT_6]])
+; CHECK-NEXT:    [[VECEXT2:%.*]] = extractelement <16 x i32> [[PTR_VAL]], i32 7
+; CHECK-NEXT:    [[CALL3:%.*]] = tail call signext i32 (i8*, ...) @printf(i8* nonnull dereferenceable(1) getelementptr inbounds ([6 x i8], [6 x i8]* @.str.2, i64 0, i64 0), i32 signext [[VECEXT2]])
+; CHECK-NEXT:    ret void
+;
+entry:
+  %call = tail call signext i32 (i8*, ...) @printf(i8* nonnull dereferenceable(1) getelementptr inbounds ([11 x i8], [11 x i8]* @.str, i64 0, i64 0))
+  %vecext = extractelement <16 x i32> %ptr.val, i32 0
+  %call1 = tail call signext i32 (i8*, ...) @printf(i8* nonnull dereferenceable(1) getelementptr inbounds ([5 x i8], [5 x i8]* @.str.1, i64 0, i64 0), i32 signext %vecext)
+  %vecext.1 = extractelement <16 x i32> %ptr.val, i32 1
+  %call1.1 = tail call signext i32 (i8*, ...) @printf(i8* nonnull dereferenceable(1) getelementptr inbounds ([5 x i8], [5 x i8]* @.str.1, i64 0, i64 0), i32 signext %vecext.1)
+  %vecext.2 = extractelement <16 x i32> %ptr.val, i32 2
+  %call1.2 = tail call signext i32 (i8*, ...) @printf(i8* nonnull dereferenceable(1) getelementptr inbounds ([5 x i8], [5 x i8]* @.str.1, i64 0, i64 0), i32 signext %vecext.2)
+  %vecext.3 = extractelement <16 x i32> %ptr.val, i32 3
+  %call1.3 = tail call signext i32 (i8*, ...) @printf(i8* nonnull dereferenceable(1) getelementptr inbounds ([5 x i8], [5 x i8]* @.str.1, i64 0, i64 0), i32 signext %vecext.3)
+  %vecext.4 = extractelement <16 x i32> %ptr.val, i32 4
+  %call1.4 = tail call signext i32 (i8*, ...) @printf(i8* nonnull dereferenceable(1) getelementptr inbounds ([5 x i8], [5 x i8]* @.str.1, i64 0, i64 0), i32 signext %vecext.4)
+  %vecext.5 = extractelement <16 x i32> %ptr.val, i32 5
+  %call1.5 = tail call signext i32 (i8*, ...) @printf(i8* nonnull dereferenceable(1) getelementptr inbounds ([5 x i8], [5 x i8]* @.str.1, i64 0, i64 0), i32 signext %vecext.5)
+  %vecext.6 = extractelement <16 x i32> %ptr.val, i32 6
+  %call1.6 = tail call signext i32 (i8*, ...) @printf(i8* nonnull dereferenceable(1) getelementptr inbounds ([5 x i8], [5 x i8]* @.str.1, i64 0, i64 0), i32 signext %vecext.6)
+  %vecext2 = extractelement <16 x i32> %ptr.val, i32 7
+  %call3 = tail call signext i32 (i8*, ...) @printf(i8* nonnull dereferenceable(1) getelementptr inbounds ([6 x i8], [6 x i8]* @.str.2, i64 0, i64 0), i32 signext %vecext2)
+  ret void
+}
+
+declare noundef signext i32 @printf(i8* nocapture noundef readonly, ...) nounwind
+
+define dso_local void @test1(<4 x i32> %a, <4 x i32> %b) nounwind {
+; CHECK-LABEL: @test1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; CHECK-NEXT:    [[VECINIT22:%.*]] = shufflevector <16 x i32> [[TMP2]], <16 x i32> [[TMP1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    tail call fastcc void @printWideVec(<16 x i32> [[VECINIT22]])
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = shufflevector <4 x i32> %a, <4 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %1 = shufflevector <4 x i32> %b, <4 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %2 = shufflevector <16 x i32> %0, <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  %vecinit22 = shufflevector <16 x i32> %2, <16 x i32> %1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  tail call fastcc void @printWideVec(<16 x i32> %vecinit22)
+  ret void
+}

From c89d50033228953d29e835ea5cb8e7066c0d8583 Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Mon, 5 Apr 2021 12:03:50 -0400
Subject: [PATCH 231/318] [InstCombine] add test for miscompile from select
 value equivalence; NFC

The new test is reduced from:
https://llvm.org/PR49832
...but we already show a potential miscompile in the existing test too.

(cherry picked from commit c0b0da4684908b8e8143c0762fc766c1a2a5849f)
---
 .../Transforms/InstCombine/select-binop-cmp.ll     | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/llvm/test/Transforms/InstCombine/select-binop-cmp.ll b/llvm/test/Transforms/InstCombine/select-binop-cmp.ll
index bbf7456ae811..beea2862a8ef 100644
--- a/llvm/test/Transforms/InstCombine/select-binop-cmp.ll
+++ b/llvm/test/Transforms/InstCombine/select-binop-cmp.ll
@@ -551,6 +551,8 @@ define i32 @select_xor_icmp_bad_6(i32 %x, i32 %y, i32 %z) {
   ret i32 %C
 }
 
+; FIXME: Value equivalence substitution is all-or-nothing, so needs a scalar compare.
+
 define <2 x i8> @select_xor_icmp_vec_bad(<2 x i8> %x, <2 x i8> %y, <2 x i8> %z) {
 ; CHECK-LABEL: @select_xor_icmp_vec_bad(
 ; CHECK-NEXT:    [[A:%.*]] = icmp eq <2 x i8> [[X:%.*]], <i8 5, i8 3>
@@ -564,6 +566,18 @@ define <2 x i8> @select_xor_icmp_vec_bad(<2 x i8> %x, <2 x i8> %y, <2 x i8> %z)
   ret <2 x i8>  %C
 }
 
+; FIXME: Value equivalence substitution is all-or-nothing, so needs a scalar compare.
+
+define <2 x i32> @vec_select_no_equivalence(<2 x i32> %x) {
+; CHECK-LABEL: @vec_select_no_equivalence(
+; CHECK-NEXT:    ret <2 x i32> [[X:%.*]]
+;
+  %x10 = shufflevector <2 x i32> %x, <2 x i32> undef, <2 x i32> <i32 1, i32 0>
+  %cond = icmp eq <2 x i32> %x, zeroinitializer
+  %s = select <2 x i1> %cond, <2 x i32> %x10, <2 x i32> %x
+  ret <2 x i32> %s
+}
+
 ; Folding this would only be legal if we sanitized undef to 0.
 define <2 x i8> @select_xor_icmp_vec_undef(<2 x i8> %x, <2 x i8> %y, <2 x i8> %z) {
 ; CHECK-LABEL: @select_xor_icmp_vec_undef(

From 4a12f51ad0090c3bcfea29c8dd021486ac3aa329 Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Mon, 5 Apr 2021 12:14:49 -0400
Subject: [PATCH 232/318] [InstCombine] fix potential miscompile in select
 value equivalence

As shown in the example based on:
https://llvm.org/PR49832
...and the existing test, we can't substitute
a vector value because the equality compare
replacement that we are attempting requires
that the comparison is true for the entire
value. Vector select can be partly true/false.

(cherry picked from commit c590a9880d7a660a1c911fce07f3d01ea18be2df)
---
 llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp |  5 ++++-
 llvm/test/Transforms/InstCombine/select-binop-cmp.ll  | 11 +++++++----
 2 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
index f26c194d31b9..5f174aae09ec 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
@@ -1095,7 +1095,10 @@ static Instruction *canonicalizeAbsNabs(SelectInst &Sel, ICmpInst &Cmp,
 /// TODO: Wrapping flags could be preserved in some cases with better analysis.
 Instruction *InstCombinerImpl::foldSelectValueEquivalence(SelectInst &Sel,
                                                           ICmpInst &Cmp) {
-  if (!Cmp.isEquality())
+  // Value equivalence substitution requires an all-or-nothing replacement.
+  // It does not make sense for a vector compare where each lane is chosen
+  // independently.
+  if (!Cmp.isEquality() || Cmp.getType()->isVectorTy())
     return nullptr;
 
   // Canonicalize the pattern to ICMP_EQ by swapping the select operands.
diff --git a/llvm/test/Transforms/InstCombine/select-binop-cmp.ll b/llvm/test/Transforms/InstCombine/select-binop-cmp.ll
index beea2862a8ef..7c1cc21b4280 100644
--- a/llvm/test/Transforms/InstCombine/select-binop-cmp.ll
+++ b/llvm/test/Transforms/InstCombine/select-binop-cmp.ll
@@ -551,12 +551,12 @@ define i32 @select_xor_icmp_bad_6(i32 %x, i32 %y, i32 %z) {
   ret i32 %C
 }
 
-; FIXME: Value equivalence substitution is all-or-nothing, so needs a scalar compare.
+; Value equivalence substitution is all-or-nothing, so needs a scalar compare.
 
 define <2 x i8> @select_xor_icmp_vec_bad(<2 x i8> %x, <2 x i8> %y, <2 x i8> %z) {
 ; CHECK-LABEL: @select_xor_icmp_vec_bad(
 ; CHECK-NEXT:    [[A:%.*]] = icmp eq <2 x i8> [[X:%.*]], <i8 5, i8 3>
-; CHECK-NEXT:    [[B:%.*]] = xor <2 x i8> [[Z:%.*]], <i8 5, i8 3>
+; CHECK-NEXT:    [[B:%.*]] = xor <2 x i8> [[X]], [[Z:%.*]]
 ; CHECK-NEXT:    [[C:%.*]] = select <2 x i1> [[A]], <2 x i8> [[B]], <2 x i8> [[Y:%.*]]
 ; CHECK-NEXT:    ret <2 x i8> [[C]]
 ;
@@ -566,11 +566,14 @@ define <2 x i8> @select_xor_icmp_vec_bad(<2 x i8> %x, <2 x i8> %y, <2 x i8> %z)
   ret <2 x i8>  %C
 }
 
-; FIXME: Value equivalence substitution is all-or-nothing, so needs a scalar compare.
+; Value equivalence substitution is all-or-nothing, so needs a scalar compare.
 
 define <2 x i32> @vec_select_no_equivalence(<2 x i32> %x) {
 ; CHECK-LABEL: @vec_select_no_equivalence(
-; CHECK-NEXT:    ret <2 x i32> [[X:%.*]]
+; CHECK-NEXT:    [[X10:%.*]] = shufflevector <2 x i32> [[X:%.*]], <2 x i32> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:    [[COND:%.*]] = icmp eq <2 x i32> [[X]], zeroinitializer
+; CHECK-NEXT:    [[S:%.*]] = select <2 x i1> [[COND]], <2 x i32> [[X10]], <2 x i32> [[X]]
+; CHECK-NEXT:    ret <2 x i32> [[S]]
 ;
   %x10 = shufflevector <2 x i32> %x, <2 x i32> undef, <2 x i32> <i32 1, i32 0>
   %cond = icmp eq <2 x i32> %x, zeroinitializer

From 266c82f94da232d736f413c8d9e08d066c2d7202 Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Mon, 5 Apr 2021 16:11:39 -0400
Subject: [PATCH 233/318] [InstSimplify] add test for vector select with
 operand replacement; NFC

We need a sibling fix to c590a9880d7a
( https://llvm.org/PR49832 ) to avoid miscompiling.

(cherry picked from commit 78e5cf66fec52c8e6e665c3c9e64d38498d94a5d)
---
 llvm/test/Transforms/InstSimplify/select.ll | 22 +++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/llvm/test/Transforms/InstSimplify/select.ll b/llvm/test/Transforms/InstSimplify/select.ll
index 6460b42d63c1..68e7411c8310 100644
--- a/llvm/test/Transforms/InstSimplify/select.ll
+++ b/llvm/test/Transforms/InstSimplify/select.ll
@@ -969,6 +969,28 @@ define <vscale x 2 x i1> @ignore_scalable_undef(<vscale x 2 x i1> %cond) {
   ret <vscale x 2 x i1> %s
 }
 
+define i32 @select_ctpop_zero(i32 %x) {
+; CHECK-LABEL: @select_ctpop_zero(
+; CHECK-NEXT:    [[T1:%.*]] = call i32 @llvm.ctpop.i32(i32 [[X:%.*]])
+; CHECK-NEXT:    ret i32 [[T1]]
+;
+  %t0 = icmp eq i32 %x, 0
+  %t1 = call i32 @llvm.ctpop.i32(i32 %x)
+  %sel = select i1 %t0, i32 0, i32 %t1
+  ret i32 %sel
+}
+declare i32 @llvm.ctpop.i32(i32)
+
+define <2 x i32> @vec_select_no_equivalence(<2 x i32> %x, <2 x i32> %y) {
+; CHECK-LABEL: @vec_select_no_equivalence(
+; CHECK-NEXT:    ret <2 x i32> zeroinitializer
+;
+  %x10 = shufflevector <2 x i32> %x, <2 x i32> undef, <2 x i32> <i32 1, i32 0>
+  %cond = icmp eq <2 x i32> %x, zeroinitializer
+  %s = select <2 x i1> %cond, <2 x i32> %x10, <2 x i32> zeroinitializer
+  ret <2 x i32> %s
+}
+
 ; TODO: these can be optimized more
 
 define i32 @poison(i32 %x, i32 %y) {

From 8e2ff387d30d540195ffef299785d392b0ee17dd Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Mon, 5 Apr 2021 16:47:29 -0400
Subject: [PATCH 234/318] [InstSimplify] fix potential miscompile in select
 value equivalence

This is the sibling fix to c590a9880d7a -
as there, we can't subsitute a vector value the equality
compare replacement that we are trying requires that the
comparison is true for the entire value. Vector select
can be partly true/false.

(cherry picked from commit e2a0f512eacad0699be9660f668726d7deb2cd75)
---
 llvm/lib/Analysis/InstructionSimplify.cpp   | 8 +++++---
 llvm/test/Transforms/InstSimplify/select.ll | 5 ++++-
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp
index c40e5c36cdc7..a12816885c40 100644
--- a/llvm/lib/Analysis/InstructionSimplify.cpp
+++ b/llvm/lib/Analysis/InstructionSimplify.cpp
@@ -4127,10 +4127,12 @@ static Value *simplifySelectWithICmpCond(Value *CondVal, Value *TrueVal,
                                               TrueVal, FalseVal))
     return V;
 
-  // If we have an equality comparison, then we know the value in one of the
-  // arms of the select. See if substituting this value into the arm and
+  // If we have a scalar equality comparison, then we know the value in one of
+  // the arms of the select. See if substituting this value into the arm and
   // simplifying the result yields the same value as the other arm.
-  if (Pred == ICmpInst::ICMP_EQ) {
+  // Note that the equivalence/replacement opportunity does not hold for vectors
+  // because each element of a vector select is chosen independently.
+  if (Pred == ICmpInst::ICMP_EQ && !CondVal->getType()->isVectorTy()) {
     if (SimplifyWithOpReplaced(FalseVal, CmpLHS, CmpRHS, Q,
                                /* AllowRefinement */ false, MaxRecurse) ==
             TrueVal ||
diff --git a/llvm/test/Transforms/InstSimplify/select.ll b/llvm/test/Transforms/InstSimplify/select.ll
index 68e7411c8310..4291fc0a839e 100644
--- a/llvm/test/Transforms/InstSimplify/select.ll
+++ b/llvm/test/Transforms/InstSimplify/select.ll
@@ -983,7 +983,10 @@ declare i32 @llvm.ctpop.i32(i32)
 
 define <2 x i32> @vec_select_no_equivalence(<2 x i32> %x, <2 x i32> %y) {
 ; CHECK-LABEL: @vec_select_no_equivalence(
-; CHECK-NEXT:    ret <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[X10:%.*]] = shufflevector <2 x i32> [[X:%.*]], <2 x i32> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:    [[COND:%.*]] = icmp eq <2 x i32> [[X]], zeroinitializer
+; CHECK-NEXT:    [[S:%.*]] = select <2 x i1> [[COND]], <2 x i32> [[X10]], <2 x i32> zeroinitializer
+; CHECK-NEXT:    ret <2 x i32> [[S]]
 ;
   %x10 = shufflevector <2 x i32> %x, <2 x i32> undef, <2 x i32> <i32 1, i32 0>
   %cond = icmp eq <2 x i32> %x, zeroinitializer

From 372e6fbc8778911d743a1d8bf371bff5f805abea Mon Sep 17 00:00:00 2001
From: Tom Stellard <tstellar@redhat.com>
Date: Mon, 5 Apr 2021 22:22:18 +0000
Subject: [PATCH 235/318] workflows: Use -g1 when compiling libraries for ABI
 checks

This should help reduce memory usage of the abi-dumper tool and avoid
running out of memory and disk space.
---
 .github/workflows/libclang-abi-tests.yml | 2 +-
 .github/workflows/llvm-tests.yml         | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/libclang-abi-tests.yml b/.github/workflows/libclang-abi-tests.yml
index ed54c4a1e54d..af7c8d7db119 100644
--- a/.github/workflows/libclang-abi-tests.yml
+++ b/.github/workflows/libclang-abi-tests.yml
@@ -104,7 +104,7 @@ jobs:
     - name: Configure
       run: |
         mkdir install
-        cmake -B build -S llvm -G Ninja -DLLVM_ENABLE_PROJECTS=clang -DCMAKE_BUILD_TYPE=Debug -DLLVM_TARGETS_TO_BUILD="" -DLLVM_BUILD_LLVM_DYLIB=ON -DLLVM_LINK_LLVM_DYLIB=ON -DCMAKE_C_FLAGS_DEBUG="-g -Og" -DCMAKE_CXX_FLAGS_DEBUG="-g -Og" -DCMAKE_INSTALL_PREFIX=`pwd`/install llvm
+        cmake -B build -S llvm -G Ninja -DLLVM_ENABLE_PROJECTS=clang -DCMAKE_BUILD_TYPE=Debug -DLLVM_TARGETS_TO_BUILD="" -DLLVM_BUILD_LLVM_DYLIB=ON -DLLVM_LINK_LLVM_DYLIB=ON -DCMAKE_C_FLAGS_DEBUG="-g1 -Og" -DCMAKE_CXX_FLAGS_DEBUG="-g1 -Og" -DCMAKE_INSTALL_PREFIX=`pwd`/install llvm
     - name: Build
       run: ninja -C build/ ${{ needs.abi-dump-setup.outputs.ABI_LIBS }} install-clang-headers
     - name: Dump ABI
diff --git a/.github/workflows/llvm-tests.yml b/.github/workflows/llvm-tests.yml
index 9017a014be02..077def0250fb 100644
--- a/.github/workflows/llvm-tests.yml
+++ b/.github/workflows/llvm-tests.yml
@@ -107,7 +107,7 @@ jobs:
     - name: Configure
       run: |
         mkdir install
-        cmake -B build -G Ninja -DCMAKE_BUILD_TYPE=Debug -DLLVM_TARGETS_TO_BUILD="" -DLLVM_BUILD_LLVM_DYLIB=ON -DCMAKE_C_FLAGS_DEBUG="-g -Og" -DCMAKE_CXX_FLAGS_DEBUG="-g -Og" -DCMAKE_INSTALL_PREFIX=`pwd`/install llvm
+        cmake -B build -G Ninja -DCMAKE_BUILD_TYPE=Debug -DLLVM_TARGETS_TO_BUILD="" -DLLVM_BUILD_LLVM_DYLIB=ON -DCMAKE_C_FLAGS_DEBUG="-g1 -Og" -DCMAKE_CXX_FLAGS_DEBUG="-g1 -Og" -DCMAKE_INSTALL_PREFIX=`pwd`/install llvm
     - name: Build
       # Need to run install-LLVM twice to ensure the symlink is installed (this is a bug).
       run: |

From 452500ebcde0443bc421eda407cedc888bfd8fca Mon Sep 17 00:00:00 2001
From: Tom Stellard <tstellar@redhat.com>
Date: Tue, 6 Apr 2021 17:19:56 +0000
Subject: [PATCH 236/318] workflows: Use uncompressed abi dumps

The compressed dumps are no longer readable, but I'm not sure why.
---
 .github/workflows/libclang-abi-tests.yml | 5 ++---
 .github/workflows/llvm-tests.yml         | 8 +++++---
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/libclang-abi-tests.yml b/.github/workflows/libclang-abi-tests.yml
index af7c8d7db119..09ea5cfbfc82 100644
--- a/.github/workflows/libclang-abi-tests.yml
+++ b/.github/workflows/libclang-abi-tests.yml
@@ -113,13 +113,12 @@ jobs:
         for lib in ${{ needs.abi-dump-setup.outputs.ABI_LIBS }}; do
           # Remove symbol versioning from dumps, so we can compare across major versions.
           sed -i 's/LLVM_${{ matrix.llvm_version_major }}/LLVM_NOVERSION/' $lib-${{ matrix.ref }}.abi
-          tar -czf $lib-${{ matrix.ref }}.abi.tar.gz $lib-${{ matrix.ref }}.abi
         done
     - name: Upload ABI file
       uses: actions/upload-artifact@v2
       with:
         name: ${{ matrix.name }}
-        path: "*${{ matrix.ref }}.abi.tar.gz"
+        path: "*${{ matrix.ref }}.abi"
 
   abi-compare:
     runs-on: ubuntu-latest
@@ -141,7 +140,7 @@ jobs:
       - name: Compare ABI
         run: |
           for lib in ${{ needs.abi-dump-setup.outputs.ABI_LIBS }}; do
-            abi-compliance-checker -lib $lib -old build-baseline/$lib*.abi.tar.gz -new build-latest/$lib*.abi.tar.gz
+            abi-compliance-checker -lib $lib -old build-baseline/$lib*.abi -new build-latest/$lib*.abi
           done
       - name: Upload ABI Comparison
         if: always()
diff --git a/.github/workflows/llvm-tests.yml b/.github/workflows/llvm-tests.yml
index 077def0250fb..bb011c0dafbb 100644
--- a/.github/workflows/llvm-tests.yml
+++ b/.github/workflows/llvm-tests.yml
@@ -126,12 +126,11 @@ jobs:
         abi-dumper $EXTRA_ARGS -lver ${{ matrix.ref }} -skip-cxx -public-headers ./install/include/${{ needs.abi-dump-setup.outputs.ABI_HEADERS }} -o ${{ matrix.ref }}.abi ./install/lib/libLLVM.so
         # Remove symbol versioning from dumps, so we can compare across major versions.
         sed -i 's/LLVM_${{ matrix.llvm_version_major }}/LLVM_NOVERSION/' ${{ matrix.ref }}.abi
-        tar -czf ${{ matrix.ref }}.abi.tar.gz ${{ matrix.ref }}.abi
     - name: Upload ABI file
       uses: actions/upload-artifact@v1
       with:
         name: ${{ matrix.name }}
-        path: ${{ matrix.ref }}.abi.tar.gz
+        path: ${{ matrix.ref }}.abi
 
     - name: Upload symbol list file
       if: matrix.name == 'build-baseline'
@@ -167,7 +166,10 @@ jobs:
             # This option doesn't seem to work with the ABI dumper, so passing it here.
             export EXTRA_ARGS="-symbols-list symbol-list/llvm.symbols"
           fi
-          abi-compliance-checker $EXTRA_ARGS -l libLLVM.so -old build-baseline/*.tar.gz -new build-latest/*.tar.gz || test "${{ needs.abi-dump-setup.outputs.ABI_HEADERS }}" = "llvm-c"
+          # FIXME: Reading of gzip'd abi files on the GitHub runners stop
+          # working some time in March of 2021, likely due to a change in the
+          # runner's environment.
+          abi-compliance-checker $EXTRA_ARGS -l libLLVM.so -old build-baseline/*.abi -new build-latest/*.abi || test "${{ needs.abi-dump-setup.outputs.ABI_HEADERS }}" = "llvm-c"
       - name: Upload ABI Comparison
         if: always()
         uses: actions/upload-artifact@v1

From a1a197b54ec6686963b4e56ee8117dd79679ec4a Mon Sep 17 00:00:00 2001
From: Tom Stellard <tstellar@redhat.com>
Date: Tue, 4 May 2021 03:50:40 +0000
Subject: [PATCH 237/318] workflows: Fix tarball download for
 libclang-abi-tests

---
 .github/workflows/libclang-abi-tests.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/libclang-abi-tests.yml b/.github/workflows/libclang-abi-tests.yml
index 09ea5cfbfc82..c7d1993ba006 100644
--- a/.github/workflows/libclang-abi-tests.yml
+++ b/.github/workflows/libclang-abi-tests.yml
@@ -41,7 +41,7 @@ jobs:
           remote_repo='https://github.com/llvm/llvm-project'
           if [ ${{ steps.version.outputs.LLVM_VERSION_MINOR }} -ne 0 -o ${{ steps.version.outputs.LLVM_VERSION_PATCH }} -eq 0 ]; then
             major_version=$(( ${{ steps.version.outputs.LLVM_VERSION_MAJOR }} - 1))
-            baseline_ref="$major_version.0.0"
+            baseline_ref="llvmorg-$major_version.0.0"
 
             # If there is a minor release, we want to use that as the base line.
             minor_ref=`git ls-remote --refs -t $remote_repo llvmorg-$major_version.[1-9].[0-9] | tail -n1 | grep -o 'llvmorg-.\+' || true`
@@ -60,7 +60,7 @@ jobs:
             echo ::set-output name=ABI_LIBS::libclang.so
           else
             echo ::set-output name=BASELINE_VERSION_MAJOR::${{ steps.version.outputs.LLVM_VERSION_MAJOR }}
-            echo ::set-output name=BASELINE_REF::${{ steps.version.outputs.LLVM_VERSION_MAJOR }}.0.0
+            echo ::set-output name=BASELINE_REF::llvmorg-${{ steps.version.outputs.LLVM_VERSION_MAJOR }}.0.0
             echo ::set-output name=ABI_HEADERS::.
             echo ::set-output name=ABI_LIBS::libclang.so libclang-cpp.so
           fi

From 2db5d42193abefcb41b10bd70b7ab536cb03f1cc Mon Sep 17 00:00:00 2001
From: Tom Stellard <tstellar@redhat.com>
Date: Mon, 10 May 2021 13:20:52 -0700
Subject: [PATCH 238/318] Remove extra test case added in
 266c82f94da232d736f413c8d9e08d066c2d7202

This test case was added by accident and is failing on the release/12.x branch.
---
 llvm/test/Transforms/InstSimplify/select.ll | 12 ------------
 1 file changed, 12 deletions(-)

diff --git a/llvm/test/Transforms/InstSimplify/select.ll b/llvm/test/Transforms/InstSimplify/select.ll
index 4291fc0a839e..93f09d89bf40 100644
--- a/llvm/test/Transforms/InstSimplify/select.ll
+++ b/llvm/test/Transforms/InstSimplify/select.ll
@@ -969,18 +969,6 @@ define <vscale x 2 x i1> @ignore_scalable_undef(<vscale x 2 x i1> %cond) {
   ret <vscale x 2 x i1> %s
 }
 
-define i32 @select_ctpop_zero(i32 %x) {
-; CHECK-LABEL: @select_ctpop_zero(
-; CHECK-NEXT:    [[T1:%.*]] = call i32 @llvm.ctpop.i32(i32 [[X:%.*]])
-; CHECK-NEXT:    ret i32 [[T1]]
-;
-  %t0 = icmp eq i32 %x, 0
-  %t1 = call i32 @llvm.ctpop.i32(i32 %x)
-  %sel = select i1 %t0, i32 0, i32 %t1
-  ret i32 %sel
-}
-declare i32 @llvm.ctpop.i32(i32)
-
 define <2 x i32> @vec_select_no_equivalence(<2 x i32> %x, <2 x i32> %y) {
 ; CHECK-LABEL: @vec_select_no_equivalence(
 ; CHECK-NEXT:    [[X10:%.*]] = shufflevector <2 x i32> [[X:%.*]], <2 x i32> undef, <2 x i32> <i32 1, i32 0>

From f3e07c841e2f96a73b253d3b1a95e2ac8df5a376 Mon Sep 17 00:00:00 2001
From: Joachim Meyer <joachim@joameyer.de>
Date: Thu, 6 May 2021 22:26:19 +0200
Subject: [PATCH 239/318] [NFC] Correctly assert the indents for
 printEnumValHelpStr.

Only verify that there's no negative indent.
Noted by @chapuni in https://reviews.llvm.org/D93494.

Reviewed By: chapuni

Differential Revision: https://reviews.llvm.org/D102021

(cherry picked from commit d9f2960c932c9803e662098e33d899efa3c67f44)
---
 llvm/lib/Support/CommandLine.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Support/CommandLine.cpp b/llvm/lib/Support/CommandLine.cpp
index e2f014d1815b..123a23a5242c 100644
--- a/llvm/lib/Support/CommandLine.cpp
+++ b/llvm/lib/Support/CommandLine.cpp
@@ -1729,7 +1729,7 @@ void Option::printHelpStr(StringRef HelpStr, size_t Indent,
 void Option::printEnumValHelpStr(StringRef HelpStr, size_t BaseIndent,
                                  size_t FirstLineIndentedBy) {
   const StringRef ValHelpPrefix = "  ";
-  assert(BaseIndent >= FirstLineIndentedBy + ValHelpPrefix.size());
+  assert(BaseIndent >= FirstLineIndentedBy);
   std::pair<StringRef, StringRef> Split = HelpStr.split('\n');
   outs().indent(BaseIndent - FirstLineIndentedBy)
       << ArgHelpPrefix << ValHelpPrefix << Split.first << "\n";

From 24535af52ae139f2bb361855fbbaf47cc9e5d580 Mon Sep 17 00:00:00 2001
From: Amara Emerson <amara@apple.com>
Date: Fri, 9 Apr 2021 10:39:59 -0700
Subject: [PATCH 240/318] [AArch64][GlobalISel] Fix incorrect codegen for <16 x
 s8> G_ASHR.

Fixes PR49904

(cherry picked from commit 40e75cafc0fef365b5580a9c09595ac475db0c19)
---
 llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp | 2 +-
 llvm/test/CodeGen/AArch64/GlobalISel/select-vector-shift.mir | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
index 5259f4f5a4d0..fc5ef02e8457 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
@@ -1791,7 +1791,7 @@ bool AArch64InstructionSelector::selectVectorAshrLshr(
     NegOpc = AArch64::NEGv8i16;
   } else if (Ty == LLT::vector(16, 8)) {
     Opc = IsASHR ? AArch64::SSHLv16i8 : AArch64::USHLv16i8;
-    NegOpc = AArch64::NEGv8i16;
+    NegOpc = AArch64::NEGv16i8;
   } else if (Ty == LLT::vector(8, 8)) {
     Opc = IsASHR ? AArch64::SSHLv8i8 : AArch64::USHLv8i8;
     NegOpc = AArch64::NEGv8i8;
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/select-vector-shift.mir b/llvm/test/CodeGen/AArch64/GlobalISel/select-vector-shift.mir
index 6a5c33ed9c14..1056a449ab21 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/select-vector-shift.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/select-vector-shift.mir
@@ -562,8 +562,8 @@ body:             |
     ; CHECK: liveins: $q0, $q1
     ; CHECK: [[COPY:%[0-9]+]]:fpr128 = COPY $q0
     ; CHECK: [[COPY1:%[0-9]+]]:fpr128 = COPY $q1
-    ; CHECK: [[NEGv8i16_:%[0-9]+]]:fpr128 = NEGv8i16 [[COPY1]]
-    ; CHECK: [[USHLv16i8_:%[0-9]+]]:fpr128 = USHLv16i8 [[COPY]], [[NEGv8i16_]]
+    ; CHECK: [[NEGv16i8_:%[0-9]+]]:fpr128 = NEGv16i8 [[COPY1]]
+    ; CHECK: [[USHLv16i8_:%[0-9]+]]:fpr128 = USHLv16i8 [[COPY]], [[NEGv16i8_]]
     ; CHECK: $q0 = COPY [[USHLv16i8_]]
     ; CHECK: RET_ReallyLR implicit $q0
     %0:fpr(<16 x s8>) = COPY $q0

From 067c06dc8395a2d79792c0ac4e48c2a79836b46f Mon Sep 17 00:00:00 2001
From: Jonas Paulsson <paulsson@linux.vnet.ibm.com>
Date: Thu, 6 May 2021 15:22:21 +0200
Subject: [PATCH 241/318] [SystemZ] Don't use libcall for 128 bit shifts.

Expand 128 bit shifts instead of using a libcall.

This patch removes the 128 bit shift libcalls and thereby causes
ExpandShiftWithUnknownAmountBit() to be called.

Review: Ulrich Weigand

Differential Revision: https://reviews.llvm.org/D101993

(cherry picked from commit 1c4cb510b4daccc0f4763958567affc2b442f317)
---
 .../Target/SystemZ/SystemZISelLowering.cpp    |  5 +-
 llvm/test/CodeGen/SystemZ/shift-12.ll         | 94 +++++++++++++++----
 2 files changed, 79 insertions(+), 20 deletions(-)

diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
index 603446755aaf..9ace36f344a5 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -285,10 +285,13 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
   // Give LowerOperation the chance to replace 64-bit ORs with subregs.
   setOperationAction(ISD::OR, MVT::i64, Custom);
 
-  // FIXME: Can we support these natively?
+  // Expand 128 bit shifts without using a libcall.
   setOperationAction(ISD::SRL_PARTS, MVT::i64, Expand);
   setOperationAction(ISD::SHL_PARTS, MVT::i64, Expand);
   setOperationAction(ISD::SRA_PARTS, MVT::i64, Expand);
+  setLibcallName(RTLIB::SRL_I128, nullptr);
+  setLibcallName(RTLIB::SHL_I128, nullptr);
+  setLibcallName(RTLIB::SRA_I128, nullptr);
 
   // We have native instructions for i8, i16 and i32 extensions, but not i1.
   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
diff --git a/llvm/test/CodeGen/SystemZ/shift-12.ll b/llvm/test/CodeGen/SystemZ/shift-12.ll
index 7559602aa256..421928f28698 100644
--- a/llvm/test/CodeGen/SystemZ/shift-12.ll
+++ b/llvm/test/CodeGen/SystemZ/shift-12.ll
@@ -2,7 +2,7 @@
 ; Test removal of AND operations that don't affect last 6 bits of shift amount
 ; operand.
 ;
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s
 
 ; Test that AND is not removed when some lower 6 bits are not set.
 define i32 @f1(i32 %a, i32 %sh) {
@@ -119,31 +119,28 @@ define i32 @f10(i32 %a, i32 %sh) {
   ret i32 %reuse
 }
 
-; Test that AND is not removed for i128 (which calls __ashlti3)
 define i128 @f11(i128 %a, i32 %sh) {
 ; CHECK-LABEL: f11:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    stmg %r13, %r15, 104(%r15)
-; CHECK-NEXT:    .cfi_offset %r13, -56
+; CHECK-NEXT:    stmg %r14, %r15, 112(%r15)
 ; CHECK-NEXT:    .cfi_offset %r14, -48
 ; CHECK-NEXT:    .cfi_offset %r15, -40
-; CHECK-NEXT:    aghi %r15, -192
-; CHECK-NEXT:    .cfi_def_cfa_offset 352
 ; CHECK-NEXT:    lg %r0, 8(%r3)
-; CHECK-NEXT:    # kill: def $r4l killed $r4l def $r4d
-; CHECK-NEXT:    lgr %r13, %r2
 ; CHECK-NEXT:    lg %r1, 0(%r3)
-; CHECK-NEXT:    stg %r0, 168(%r15)
-; CHECK-NEXT:    risbg %r4, %r4, 57, 191, 0
-; CHECK-NEXT:    la %r2, 176(%r15)
-; CHECK-NEXT:    la %r3, 160(%r15)
-; CHECK-NEXT:    stg %r1, 160(%r15)
-; CHECK-NEXT:    brasl %r14, __ashlti3@PLT
-; CHECK-NEXT:    lg %r0, 184(%r15)
-; CHECK-NEXT:    lg %r1, 176(%r15)
-; CHECK-NEXT:    stg %r0, 8(%r13)
-; CHECK-NEXT:    stg %r1, 0(%r13)
-; CHECK-NEXT:    lmg %r13, %r15, 296(%r15)
+; CHECK-NEXT:    risblg %r3, %r4, 25, 159, 0
+; CHECK-NEXT:    lcr %r14, %r3
+; CHECK-NEXT:    sllg %r5, %r1, 0(%r4)
+; CHECK-NEXT:    srlg %r14, %r0, 0(%r14)
+; CHECK-NEXT:    ogr %r5, %r14
+; CHECK-NEXT:    sllg %r3, %r0, -64(%r3)
+; CHECK-NEXT:    tmll %r4, 127
+; CHECK-NEXT:    locgrle %r3, %r5
+; CHECK-NEXT:    sllg %r0, %r0, 0(%r4)
+; CHECK-NEXT:    locgre %r3, %r1
+; CHECK-NEXT:    locghinle %r0, 0
+; CHECK-NEXT:    stg %r0, 8(%r2)
+; CHECK-NEXT:    stg %r3, 0(%r2)
+; CHECK-NEXT:    lmg %r14, %r15, 112(%r15)
 ; CHECK-NEXT:    br %r14
   %and = and i32 %sh, 127
   %ext = zext i32 %and to i128
@@ -151,3 +148,62 @@ define i128 @f11(i128 %a, i32 %sh) {
   ret i128 %shift
 }
 
+define i128 @f12(i128 %a, i32 %sh) {
+; CHECK-LABEL: f12:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    stmg %r14, %r15, 112(%r15)
+; CHECK-NEXT:    .cfi_offset %r14, -48
+; CHECK-NEXT:    .cfi_offset %r15, -40
+; CHECK-NEXT:    lg %r0, 0(%r3)
+; CHECK-NEXT:    lg %r1, 8(%r3)
+; CHECK-NEXT:    risblg %r3, %r4, 25, 159, 0
+; CHECK-NEXT:    lcr %r14, %r3
+; CHECK-NEXT:    srlg %r5, %r1, 0(%r4)
+; CHECK-NEXT:    sllg %r14, %r0, 0(%r14)
+; CHECK-NEXT:    ogr %r5, %r14
+; CHECK-NEXT:    srlg %r3, %r0, -64(%r3)
+; CHECK-NEXT:    tmll %r4, 127
+; CHECK-NEXT:    locgrle %r3, %r5
+; CHECK-NEXT:    srlg %r0, %r0, 0(%r4)
+; CHECK-NEXT:    locgre %r3, %r1
+; CHECK-NEXT:    locghinle %r0, 0
+; CHECK-NEXT:    stg %r0, 0(%r2)
+; CHECK-NEXT:    stg %r3, 8(%r2)
+; CHECK-NEXT:    lmg %r14, %r15, 112(%r15)
+; CHECK-NEXT:    br %r14
+  %and = and i32 %sh, 127
+  %ext = zext i32 %and to i128
+  %shift = lshr i128 %a, %ext
+  ret i128 %shift
+}
+
+define i128 @f13(i128 %a, i32 %sh) {
+; CHECK-LABEL: f13:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    stmg %r14, %r15, 112(%r15)
+; CHECK-NEXT:    .cfi_offset %r14, -48
+; CHECK-NEXT:    .cfi_offset %r15, -40
+; CHECK-NEXT:    lg %r0, 0(%r3)
+; CHECK-NEXT:    lg %r1, 8(%r3)
+; CHECK-NEXT:    risblg %r3, %r4, 25, 159, 0
+; CHECK-NEXT:    lcr %r14, %r3
+; CHECK-NEXT:    srlg %r5, %r1, 0(%r4)
+; CHECK-NEXT:    sllg %r14, %r0, 0(%r14)
+; CHECK-NEXT:    ogr %r5, %r14
+; CHECK-NEXT:    srag %r14, %r0, 0(%r4)
+; CHECK-NEXT:    srag %r3, %r0, -64(%r3)
+; CHECK-NEXT:    srag %r0, %r0, 63
+; CHECK-NEXT:    tmll %r4, 127
+; CHECK-NEXT:    locgrle %r3, %r5
+; CHECK-NEXT:    locgre %r3, %r1
+; CHECK-NEXT:    locgrle %r0, %r14
+; CHECK-NEXT:    stg %r0, 0(%r2)
+; CHECK-NEXT:    stg %r3, 8(%r2)
+; CHECK-NEXT:    lmg %r14, %r15, 112(%r15)
+; CHECK-NEXT:    br %r14
+  %and = and i32 %sh, 127
+  %ext = zext i32 %and to i128
+  %shift = ashr i128 %a, %ext
+  ret i128 %shift
+}
+

From b89942c336a4506dabd5e1b2a6f1b5cbaddebe55 Mon Sep 17 00:00:00 2001
From: Alan Phipps <a-phipps@ti.com>
Date: Tue, 11 May 2021 11:40:11 -0500
Subject: [PATCH 242/318] [Coverage] Fix branch coverage merging in
 FunctionCoverageSummary::get() for instantiation

Fix branch coverage merging in FunctionCoverageSummary::get() for instantiation
groups.

This change corrects the implementation for the branch coverage summary to do
the same thing for branches that is done for lines and regions.  That is,
across function instantiations in an instantiation group, the maximum branch
coverage found in any of those instantiations is returned, with the total
number of branches being the same across instantiations.

Differential Revision: https://reviews.llvm.org/D102193

(cherry picked from commit eccb925147d5f262a3e74cc050d0665dd4e6d8db)
---
 llvm/test/tools/llvm-cov/branch-templates.cpp | 16 +++++++++++++++-
 llvm/tools/llvm-cov/CoverageSummaryInfo.cpp   |  6 +-----
 llvm/tools/llvm-cov/CoverageSummaryInfo.h     |  5 +++++
 3 files changed, 21 insertions(+), 6 deletions(-)

diff --git a/llvm/test/tools/llvm-cov/branch-templates.cpp b/llvm/test/tools/llvm-cov/branch-templates.cpp
index 750dc7bd58f2..4797428f8835 100644
--- a/llvm/test/tools/llvm-cov/branch-templates.cpp
+++ b/llvm/test/tools/llvm-cov/branch-templates.cpp
@@ -1,9 +1,9 @@
 // RUN: llvm-profdata merge %S/Inputs/branch-templates.proftext -o %t.profdata
 // RUN: llvm-cov show --show-expansions --show-branches=count %S/Inputs/branch-templates.o32l -instr-profile %t.profdata -path-equivalence=/tmp,%S %s | FileCheck %s
 // RUN: llvm-cov report --show-branch-summary %S/Inputs/branch-templates.o32l -instr-profile %t.profdata -show-functions -path-equivalence=/tmp,%S %s | FileCheck %s -check-prefix=REPORT
+// RUN: llvm-cov report --show-branch-summary %S/Inputs/branch-templates.o32l -instr-profile %t.profdata -path-equivalence=/tmp,%S %s | FileCheck %s -check-prefix=REPORTFILE
 
 #include <stdio.h>
-
 template<typename T>
 void unused(T x) {
   return;
@@ -45,3 +45,17 @@ int main() {
 // REPORT-NEXT: _Z4funcIfEiT_                     5       2  60.00%         7       3  57.14%         2       1  50.00%
 // REPORT-NEXT: ---
 // REPORT-NEXT: TOTAL                            22       7  68.18%        31      11  64.52%        12       6  50.00%
+
+// Make sure the covered branch tally for the function instantiation group is
+// merged to reflect maximum branch coverage of a single instantiation, just
+// like what is done for lines and regions. Also, the total branch tally
+// summary for an instantiation group should agree with the total number of
+// branches in the definition (In this case, 2 and 6 for func<>() and main(),
+// respectively).  This is returned by: FunctionCoverageSummary::get(const
+// InstantiationGroup &Group, ...)
+
+// REPORTFILE: Filename                      Regions    Missed Regions     Cover   Functions  Missed Functions  Executed       Lines      Missed Lines     Cover    Branches   Missed Branches     Cover
+// REPORTFILE-NEXT: ---
+// REPORTFILE-NEXT: branch-templates.cpp          12                 3    75.00%           2                 0   100.00%          17                 4    76.47%           8                 4    50.00%
+// REPORTFILE-NEXT: ---
+// REPORTFILE-NEXT: TOTAL                              12                 3    75.00%           2                 0   100.00%          17                 4    76.47%           8                 4    50.00%
diff --git a/llvm/tools/llvm-cov/CoverageSummaryInfo.cpp b/llvm/tools/llvm-cov/CoverageSummaryInfo.cpp
index 4a0a86168908..10e059adeb7d 100644
--- a/llvm/tools/llvm-cov/CoverageSummaryInfo.cpp
+++ b/llvm/tools/llvm-cov/CoverageSummaryInfo.cpp
@@ -100,11 +100,7 @@ FunctionCoverageSummary::get(const InstantiationGroup &Group,
   for (const auto &FCS : Summaries.drop_front()) {
     Summary.RegionCoverage.merge(FCS.RegionCoverage);
     Summary.LineCoverage.merge(FCS.LineCoverage);
-
-    // Sum branch coverage across instantiation groups for the summary rather
-    // than "merge" the maximum count. This is a clearer view into whether all
-    // created branches are covered.
-    Summary.BranchCoverage += FCS.BranchCoverage;
+    Summary.BranchCoverage.merge(FCS.BranchCoverage);
   }
   return Summary;
 }
diff --git a/llvm/tools/llvm-cov/CoverageSummaryInfo.h b/llvm/tools/llvm-cov/CoverageSummaryInfo.h
index 4bc1c24a079f..62e7cad1012b 100644
--- a/llvm/tools/llvm-cov/CoverageSummaryInfo.h
+++ b/llvm/tools/llvm-cov/CoverageSummaryInfo.h
@@ -123,6 +123,11 @@ class BranchCoverageInfo {
     return *this;
   }
 
+  void merge(const BranchCoverageInfo &RHS) {
+    Covered = std::max(Covered, RHS.Covered);
+    NumBranches = std::max(NumBranches, RHS.NumBranches);
+  }
+
   size_t getCovered() const { return Covered; }
 
   size_t getNumBranches() const { return NumBranches; }

From aa97726f6040c68dfdd8076e8efe3ef119f6b037 Mon Sep 17 00:00:00 2001
From: Dimitry Andric <dimitry@andric.com>
Date: Sun, 14 Mar 2021 17:41:21 +0100
Subject: [PATCH 243/318] [SCCP] Avoid modifying AdditionalUsers while
 iterating over it

When run under valgrind, or with a malloc that poisons freed memory,
this can lead to segfaults or other problems.

To avoid modifying the AdditionalUsers DenseMap while still iterating,
save the instructions to be notified in a separate SmallPtrSet, and use
this to later call OperandChangedState on each instruction.

Fixes PR49582.

Reviewed By: fhahn

Differential Revision: https://reviews.llvm.org/D98602

(cherry picked from commit 6abb92f2103a58d097620b4410054c5bb18c48ec)
---
 llvm/lib/Transforms/Scalar/SCCP.cpp           |   7 +-
 .../SCCP/pr49582-iterator-invalidation.ll     | 854 ++++++++++++++++++
 2 files changed, 860 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/Transforms/SCCP/pr49582-iterator-invalidation.ll

diff --git a/llvm/lib/Transforms/Scalar/SCCP.cpp b/llvm/lib/Transforms/Scalar/SCCP.cpp
index de6be52adf21..8feed9e9ebfe 100644
--- a/llvm/lib/Transforms/Scalar/SCCP.cpp
+++ b/llvm/lib/Transforms/Scalar/SCCP.cpp
@@ -542,9 +542,14 @@ class SCCPSolver : public InstVisitor<SCCPSolver> {
 
     auto Iter = AdditionalUsers.find(I);
     if (Iter != AdditionalUsers.end()) {
+      // Copy additional users before notifying them of changes, because new
+      // users may be added, potentially invalidating the iterator.
+      SmallVector<Instruction *, 2> ToNotify;
       for (User *U : Iter->second)
         if (auto *UI = dyn_cast<Instruction>(U))
-          OperandChangedState(UI);
+          ToNotify.push_back(UI);
+      for (Instruction *UI : ToNotify)
+        OperandChangedState(UI);
     }
   }
   void handleCallOverdefined(CallBase &CB);
diff --git a/llvm/test/Transforms/SCCP/pr49582-iterator-invalidation.ll b/llvm/test/Transforms/SCCP/pr49582-iterator-invalidation.ll
new file mode 100644
index 000000000000..6d5b2e1841b4
--- /dev/null
+++ b/llvm/test/Transforms/SCCP/pr49582-iterator-invalidation.ll
@@ -0,0 +1,854 @@
+; RUN: opt < %s -ipsccp -disable-output
+; PR49582: This test checks for an iterator invalidation issue, which only gets
+; exposed on a large-enough test case. We intentionally do not check the output.
+
+@c = external dso_local global i32*, align 8
+@d = external dso_local global i32, align 4
+
+define void @f(i32 %i) {
+entry:
+  br label %for.cond
+
+for.cond:                                         ; preds = %if.end628, %entry
+  %e.0 = phi i32 [ 1, %entry ], [ %e.15, %if.end628 ]
+  %cmp = icmp slt i32 %e.0, %i
+  call void @llvm.assume(i1 %cmp)
+  %0 = load i32*, i32** @c, align 8
+  %tobool = icmp ne i32* %0, null
+  br i1 %tobool, label %if.then, label %if.end628
+
+if.then:                                          ; preds = %for.cond
+  %1 = load i32, i32* %0, align 4
+  %tobool1 = icmp ne i32 %1, 0
+  br i1 %tobool1, label %if.then2, label %if.else78
+
+if.then2:                                         ; preds = %if.then
+  %add = add nsw i32 %e.0, 1
+  %cmp3 = icmp sge i32 %add, %i
+  br i1 %cmp3, label %if.then4, label %if.end
+
+if.then4:                                         ; preds = %if.then2
+  %idxprom = sext i32 %add to i64
+  br label %if.end
+
+if.end:                                           ; preds = %if.then4, %if.then2
+  br i1 %cmp3, label %if.then9, label %if.end13
+
+if.then9:                                         ; preds = %if.end
+  %idxprom11 = sext i32 %add to i64
+  br label %if.end13
+
+if.end13:                                         ; preds = %if.then9, %if.end
+  br i1 %cmp3, label %if.then16, label %if.end20
+
+if.then16:                                        ; preds = %if.end13
+  %idxprom18 = sext i32 %add to i64
+  br label %if.end20
+
+if.end20:                                         ; preds = %if.then16, %if.end13
+  %add21 = add nsw i32 %e.0, 3
+  %cmp22 = icmp sge i32 %add21, %i
+  br i1 %cmp22, label %if.then23, label %if.end25
+
+if.then23:                                        ; preds = %if.end20
+  br label %if.end25
+
+if.end25:                                         ; preds = %if.then23, %if.end20
+  %e.1 = phi i32 [ %add21, %if.then23 ], [ %e.0, %if.end20 ]
+  %cmp26 = icmp sge i32 %e.1, %i
+  br i1 %cmp26, label %if.then27, label %if.end28
+
+if.then27:                                        ; preds = %if.end25
+  %inc = add nsw i32 %e.1, 1
+  br label %if.end28
+
+if.end28:                                         ; preds = %if.then27, %if.end25
+  %e.2 = phi i32 [ %inc, %if.then27 ], [ %e.1, %if.end25 ]
+  %add29 = add nsw i32 %e.2, 2
+  %cmp30 = icmp sge i32 %add29, %i
+  br i1 %cmp30, label %if.then31, label %if.end33
+
+if.then31:                                        ; preds = %if.end28
+  br label %if.end33
+
+if.end33:                                         ; preds = %if.then31, %if.end28
+  %e.3 = phi i32 [ %add29, %if.then31 ], [ %e.2, %if.end28 ]
+  %cmp34 = icmp sge i32 %e.3, %i
+  br i1 %cmp34, label %if.then35, label %if.end38
+
+if.then35:                                        ; preds = %if.end33
+  %idxprom36 = sext i32 %e.3 to i64
+  br label %if.end38
+
+if.end38:                                         ; preds = %if.then35, %if.end33
+  br i1 %cmp34, label %if.then40, label %if.end43
+
+if.then40:                                        ; preds = %if.end38
+  %idxprom41 = sext i32 %e.3 to i64
+  br label %if.end43
+
+if.end43:                                         ; preds = %if.then40, %if.end38
+  br i1 %cmp34, label %if.then45, label %if.end47
+
+if.then45:                                        ; preds = %if.end43
+  %inc46 = add nsw i32 %e.3, 1
+  br label %if.end47
+
+if.end47:                                         ; preds = %if.then45, %if.end43
+  %e.4 = phi i32 [ %inc46, %if.then45 ], [ %e.3, %if.end43 ]
+  %cmp48 = icmp sge i32 %e.4, %i
+  br i1 %cmp48, label %if.then49, label %if.end51
+
+if.then49:                                        ; preds = %if.end47
+  %inc50 = add nsw i32 %e.4, 1
+  br label %if.end51
+
+if.end51:                                         ; preds = %if.then49, %if.end47
+  %e.5 = phi i32 [ %inc50, %if.then49 ], [ %e.4, %if.end47 ]
+  %2 = load i32*, i32** @c, align 8
+  %tobool52 = icmp ne i32* %2, null
+  br i1 %tobool52, label %if.then53, label %if.else
+
+if.then53:                                        ; preds = %if.end51
+  %cmp54 = icmp sge i32 %e.5, %i
+  br i1 %cmp54, label %if.then55, label %if.end628
+
+if.then55:                                        ; preds = %if.then53
+  unreachable
+
+if.else:                                          ; preds = %if.end51
+  %3 = load i32, i32* @d, align 4
+  %tobool57 = icmp ne i32 %3, 0
+  br i1 %tobool57, label %if.then58, label %if.else68
+
+if.then58:                                        ; preds = %if.else
+  %cmp59 = icmp sge i32 %e.5, %i
+  br i1 %cmp59, label %if.then60, label %if.end62
+
+if.then60:                                        ; preds = %if.then58
+  %inc61 = add nsw i32 %e.5, 1
+  br label %if.end62
+
+if.end62:                                         ; preds = %if.then60, %if.then58
+  %e.6 = phi i32 [ %inc61, %if.then60 ], [ %e.5, %if.then58 ]
+  %add63 = add nsw i32 %e.6, 1
+  %cmp64 = icmp sge i32 %add63, %i
+  br i1 %cmp64, label %if.then65, label %if.end628
+
+if.then65:                                        ; preds = %if.end62
+  br label %if.end628
+
+if.else68:                                        ; preds = %if.else
+  %add69 = add nsw i32 %e.5, 2
+  %cmp70 = icmp sge i32 %add69, %i
+  br i1 %cmp70, label %if.then71, label %if.end628
+
+if.then71:                                        ; preds = %if.else68
+  %idxprom73 = sext i32 %add69 to i64
+  br label %if.end628
+
+if.else78:                                        ; preds = %if.then
+  %call = call i32 @g()
+  %tobool79 = icmp ne i32 %call, 0
+  br i1 %tobool79, label %if.then80, label %if.else123
+
+if.then80:                                        ; preds = %if.else78
+  %add81 = add nsw i32 %e.0, 3
+  %cmp82 = icmp sge i32 %add81, %i
+  br i1 %cmp82, label %if.then83, label %if.end87
+
+if.then83:                                        ; preds = %if.then80
+  %idxprom85 = sext i32 %add81 to i64
+  br label %if.end87
+
+if.end87:                                         ; preds = %if.then83, %if.then80
+  br i1 %cmp82, label %if.then90, label %if.end94
+
+if.then90:                                        ; preds = %if.end87
+  %idxprom92 = sext i32 %add81 to i64
+  br label %if.end94
+
+if.end94:                                         ; preds = %if.then90, %if.end87
+  br i1 %cmp82, label %if.then97, label %if.end99
+
+if.then97:                                        ; preds = %if.end94
+  br label %if.end99
+
+if.end99:                                         ; preds = %if.then97, %if.end94
+  %e.7 = phi i32 [ %add81, %if.then97 ], [ %e.0, %if.end94 ]
+  %cmp100 = icmp sge i32 %e.7, %i
+  br i1 %cmp100, label %if.then101, label %if.end103
+
+if.then101:                                       ; preds = %if.end99
+  %inc102 = add nsw i32 %e.7, 1
+  br label %if.end103
+
+if.end103:                                        ; preds = %if.then101, %if.end99
+  %e.8 = phi i32 [ %inc102, %if.then101 ], [ %e.7, %if.end99 ]
+  %add104 = add nsw i32 %e.8, 1
+  %cmp105 = icmp sge i32 %add104, %i
+  br i1 %cmp105, label %if.then106, label %if.end108
+
+if.then106:                                       ; preds = %if.end103
+  br label %if.end108
+
+if.end108:                                        ; preds = %if.then106, %if.end103
+  %e.9 = phi i32 [ %add104, %if.then106 ], [ %e.8, %if.end103 ]
+  %cmp109 = icmp sge i32 %e.9, %i
+  br i1 %cmp109, label %if.then110, label %if.end113
+
+if.then110:                                       ; preds = %if.end108
+  %idxprom111 = sext i32 %e.9 to i64
+  br label %if.end113
+
+if.end113:                                        ; preds = %if.then110, %if.end108
+  br i1 %cmp109, label %if.then115, label %if.end118
+
+if.then115:                                       ; preds = %if.end113
+  %idxprom116 = sext i32 %e.9 to i64
+  unreachable
+
+if.end118:                                        ; preds = %if.end113
+  br i1 %cmp109, label %if.then120, label %if.end628
+
+if.then120:                                       ; preds = %if.end118
+  br label %if.end628
+
+if.else123:                                       ; preds = %if.else78
+  %call124 = call i32 @g()
+  %tobool125 = icmp ne i32 %call124, 0
+  br i1 %tobool125, label %if.then126, label %if.end628
+
+if.then126:                                       ; preds = %if.else123
+  %call127 = call i32 @g()
+  %tobool128 = icmp ne i32 %call127, 0
+  br i1 %tobool128, label %if.then129, label %if.else164
+
+if.then129:                                       ; preds = %if.then126
+  %add130 = add nsw i32 %e.0, 1
+  %cmp131 = icmp sge i32 %add130, %i
+  br i1 %cmp131, label %if.then132, label %if.end134
+
+if.then132:                                       ; preds = %if.then129
+  br label %if.end134
+
+if.end134:                                        ; preds = %if.then132, %if.then129
+  %e.10 = phi i32 [ %add130, %if.then132 ], [ %e.0, %if.then129 ]
+  %cmp135 = icmp sge i32 %e.10, %i
+  br i1 %cmp135, label %if.then136, label %if.end139
+
+if.then136:                                       ; preds = %if.end134
+  %idxprom137 = sext i32 %e.10 to i64
+  br label %if.end139
+
+if.end139:                                        ; preds = %if.then136, %if.end134
+  br i1 %cmp135, label %if.then141, label %if.end144
+
+if.then141:                                       ; preds = %if.end139
+  %idxprom142 = sext i32 %e.10 to i64
+  br label %if.end144
+
+if.end144:                                        ; preds = %if.then141, %if.end139
+  br i1 %cmp135, label %if.then146, label %if.end149
+
+if.then146:                                       ; preds = %if.end144
+  %idxprom147 = sext i32 %e.10 to i64
+  br label %if.end149
+
+if.end149:                                        ; preds = %if.then146, %if.end144
+  br i1 %cmp135, label %if.then151, label %if.else154
+
+if.then151:                                       ; preds = %if.end149
+  %idxprom152 = sext i32 %e.10 to i64
+  br label %if.end160
+
+if.else154:                                       ; preds = %if.end149
+  %idxprom157 = sext i32 %e.10 to i64
+  br label %if.end160
+
+if.end160:                                        ; preds = %if.else154, %if.then151
+  br i1 %cmp135, label %if.then162, label %if.end628
+
+if.then162:                                       ; preds = %if.end160
+  unreachable
+
+if.else164:                                       ; preds = %if.then126
+  %4 = load i32*, i32** @c, align 8
+  %tobool165 = icmp ne i32* %4, null
+  br i1 %tobool165, label %if.then166, label %if.else195
+
+if.then166:                                       ; preds = %if.else164
+  %add167 = add nsw i32 %e.0, 1
+  %cmp168 = icmp sge i32 %add167, %i
+  br i1 %cmp168, label %if.then169, label %if.end173
+
+if.then169:                                       ; preds = %if.then166
+  %idxprom171 = sext i32 %add167 to i64
+  br label %if.end173
+
+if.end173:                                        ; preds = %if.then169, %if.then166
+  br i1 %cmp168, label %if.then176, label %if.end180
+
+if.then176:                                       ; preds = %if.end173
+  %idxprom178 = sext i32 %add167 to i64
+  unreachable
+
+if.end180:                                        ; preds = %if.end173
+  br i1 %cmp168, label %if.then183, label %if.end187
+
+if.then183:                                       ; preds = %if.end180
+  %idxprom185 = sext i32 %add167 to i64
+  unreachable
+
+if.end187:                                        ; preds = %if.end180
+  br i1 %cmp168, label %if.then190, label %if.end628
+
+if.then190:                                       ; preds = %if.end187
+  br label %if.end628
+
+if.else195:                                       ; preds = %if.else164
+  %5 = load i32, i32* @d, align 4
+  %tobool196 = icmp ne i32 %5, 0
+  br i1 %tobool196, label %if.then197, label %if.else205
+
+if.then197:                                       ; preds = %if.else195
+  %add198 = add nsw i32 %e.0, 1
+  %cmp199 = icmp sge i32 %add198, %i
+  br i1 %cmp199, label %if.then200, label %if.end628
+
+if.then200:                                       ; preds = %if.then197
+  %idxprom202 = sext i32 %add198 to i64
+  br label %if.end628
+
+if.else205:                                       ; preds = %if.else195
+  %call206 = call i32 @h()
+  %tobool207 = icmp ne i32 %call206, 0
+  br i1 %tobool207, label %if.then208, label %if.else217
+
+if.then208:                                       ; preds = %if.else205
+  %add209 = add nsw i32 %e.0, 1
+  %cmp210 = icmp sge i32 %add209, %i
+  br i1 %cmp210, label %if.then211, label %if.end215
+
+if.then211:                                       ; preds = %if.then208
+  %idxprom213 = sext i32 %add209 to i64
+  unreachable
+
+if.end215:                                        ; preds = %if.then208
+  %6 = zext i32 %add209 to i64
+  br label %if.end628
+
+if.else217:                                       ; preds = %if.else205
+  %7 = load i32*, i32** @c, align 8
+  %tobool218 = icmp ne i32* %7, null
+  br i1 %tobool218, label %if.then219, label %if.else227
+
+if.then219:                                       ; preds = %if.else217
+  %add220 = add nsw i32 %e.0, 1
+  %cmp221 = icmp sge i32 %add220, %i
+  br i1 %cmp221, label %if.then222, label %if.end628
+
+if.then222:                                       ; preds = %if.then219
+  %idxprom224 = sext i32 %add220 to i64
+  br label %if.end628
+
+if.else227:                                       ; preds = %if.else217
+  %call228 = call i32 @g()
+  %tobool229 = icmp ne i32 %call228, 0
+  br i1 %tobool229, label %if.then230, label %if.else245
+
+if.then230:                                       ; preds = %if.else227
+  %add231 = add nsw i32 %e.0, 1
+  %cmp232 = icmp sge i32 %add231, %i
+  br i1 %cmp232, label %if.then233, label %if.end237
+
+if.then233:                                       ; preds = %if.then230
+  %idxprom235 = sext i32 %add231 to i64
+  br label %if.end237
+
+if.end237:                                        ; preds = %if.then233, %if.then230
+  br i1 %cmp232, label %if.then240, label %if.end628
+
+if.then240:                                       ; preds = %if.end237
+  %idxprom242 = sext i32 %add231 to i64
+  br label %if.end628
+
+if.else245:                                       ; preds = %if.else227
+  %8 = load i32*, i32** @c, align 8
+  %tobool246 = icmp ne i32* %8, null
+  br i1 %tobool246, label %if.then247, label %if.else258
+
+if.then247:                                       ; preds = %if.else245
+  %add248 = add nsw i32 %e.0, 1
+  %cmp249 = icmp sge i32 %add248, %i
+  br i1 %cmp249, label %if.then250, label %if.end254
+
+if.then250:                                       ; preds = %if.then247
+  %idxprom252 = sext i32 %add248 to i64
+  unreachable
+
+if.end254:                                        ; preds = %if.then247
+  %9 = zext i32 %add248 to i64
+  br label %if.end628
+
+if.else258:                                       ; preds = %if.else245
+  %10 = load i32, i32* @d, align 4
+  %tobool259 = icmp ne i32 %10, 0
+  br i1 %tobool259, label %if.then260, label %if.else268
+
+if.then260:                                       ; preds = %if.else258
+  %add261 = add nsw i32 %e.0, 1
+  %cmp262 = icmp sge i32 %add261, %i
+  br i1 %cmp262, label %if.then263, label %if.end628
+
+if.then263:                                       ; preds = %if.then260
+  %idxprom265 = sext i32 %add261 to i64
+  br label %if.end628
+
+if.else268:                                       ; preds = %if.else258
+  %call269 = call i32 @h()
+  %tobool270 = icmp ne i32 %call269, 0
+  br i1 %tobool270, label %if.then271, label %if.else279
+
+if.then271:                                       ; preds = %if.else268
+  %add272 = add nsw i32 %e.0, 1
+  %cmp273 = icmp sge i32 %add272, %i
+  br i1 %cmp273, label %if.then274, label %if.end628
+
+if.then274:                                       ; preds = %if.then271
+  %idxprom276 = sext i32 %add272 to i64
+  br label %if.end628
+
+if.else279:                                       ; preds = %if.else268
+  %11 = load i32*, i32** @c, align 8
+  %tobool280 = icmp ne i32* %11, null
+  br i1 %tobool280, label %if.then281, label %if.else287
+
+if.then281:                                       ; preds = %if.else279
+  %add282 = add nsw i32 %e.0, 2
+  %cmp283 = icmp sge i32 %add282, %i
+  br i1 %cmp283, label %if.then284, label %if.end628
+
+if.then284:                                       ; preds = %if.then281
+  br label %if.end628
+
+if.else287:                                       ; preds = %if.else279
+  %call288 = call i32 @g()
+  %tobool289 = icmp ne i32 %call288, 0
+  br i1 %tobool289, label %if.then290, label %if.else307
+
+if.then290:                                       ; preds = %if.else287
+  %12 = load i32*, i32** @c, align 8
+  %tobool291 = icmp ne i32* %12, null
+  br i1 %tobool291, label %if.then292, label %if.else298
+
+if.then292:                                       ; preds = %if.then290
+  %add293 = add nsw i32 %e.0, 3
+  %cmp294 = icmp sge i32 %add293, %i
+  br i1 %cmp294, label %if.then295, label %if.end628
+
+if.then295:                                       ; preds = %if.then292
+  br label %if.end628
+
+if.else298:                                       ; preds = %if.then290
+  %add299 = add nsw i32 %e.0, 4
+  %cmp300 = icmp sge i32 %add299, %i
+  br i1 %cmp300, label %if.then301, label %if.end628
+
+if.then301:                                       ; preds = %if.else298
+  %idxprom303 = sext i32 %add299 to i64
+  br label %if.end628
+
+if.else307:                                       ; preds = %if.else287
+  %13 = load i32*, i32** @c, align 8
+  %tobool308 = icmp ne i32* %13, null
+  br i1 %tobool308, label %if.then309, label %if.else324
+
+if.then309:                                       ; preds = %if.else307
+  %add310 = add nsw i32 %e.0, 1
+  %cmp311 = icmp sge i32 %add310, %i
+  br i1 %cmp311, label %if.then312, label %if.else316
+
+if.then312:                                       ; preds = %if.then309
+  %idxprom314 = sext i32 %add310 to i64
+  br label %if.end628
+
+if.else316:                                       ; preds = %if.then309
+  br i1 undef, label %if.then318, label %if.end628
+
+if.then318:                                       ; preds = %if.else316
+  %idxprom320 = sext i32 %add310 to i64
+  br label %if.end628
+
+if.else324:                                       ; preds = %if.else307
+  %call325 = call i32 @g()
+  %tobool326 = icmp ne i32 %call325, 0
+  br i1 %tobool326, label %if.then327, label %if.else475
+
+if.then327:                                       ; preds = %if.else324
+  %add328 = add nsw i32 %e.0, 2
+  %cmp329 = icmp sge i32 %add328, %i
+  br i1 %cmp329, label %if.then330, label %if.end332
+
+if.then330:                                       ; preds = %if.then327
+  br label %if.end332
+
+if.end332:                                        ; preds = %if.then330, %if.then327
+  %e.11 = phi i32 [ %add328, %if.then330 ], [ %e.0, %if.then327 ]
+  %cmp333 = icmp sge i32 %e.11, %i
+  br i1 %cmp333, label %if.then334, label %if.end336
+
+if.then334:                                       ; preds = %if.end332
+  %inc335 = add nsw i32 %e.11, 1
+  br label %if.end336
+
+if.end336:                                        ; preds = %if.then334, %if.end332
+  %e.12 = phi i32 [ %inc335, %if.then334 ], [ %e.11, %if.end332 ]
+  %cmp337 = icmp sge i32 %e.12, %i
+  br i1 %cmp337, label %if.then338, label %if.end340
+
+if.then338:                                       ; preds = %if.end336
+  %inc339 = add nsw i32 %e.12, 1
+  br label %if.end340
+
+if.end340:                                        ; preds = %if.then338, %if.end336
+  %e.13 = phi i32 [ %inc339, %if.then338 ], [ %e.12, %if.end336 ]
+  %cmp341 = icmp sge i32 %e.13, %i
+  br i1 %cmp341, label %if.then342, label %if.end344
+
+if.then342:                                       ; preds = %if.end340
+  %inc343 = add nsw i32 %e.13, 1
+  br label %if.end344
+
+if.end344:                                        ; preds = %if.then342, %if.end340
+  %e.14 = phi i32 [ %inc343, %if.then342 ], [ %e.13, %if.end340 ]
+  %call345 = call i32 @g()
+  %tobool346 = icmp ne i32 %call345, 0
+  br i1 %tobool346, label %if.then347, label %if.else398
+
+if.then347:                                       ; preds = %if.end344
+  %cmp348 = icmp sge i32 %e.14, %i
+  br i1 %cmp348, label %if.then349, label %if.end352
+
+if.then349:                                       ; preds = %if.then347
+  %idxprom350 = sext i32 %e.14 to i64
+  br label %if.end352
+
+if.end352:                                        ; preds = %if.then349, %if.then347
+  br i1 %cmp348, label %if.then354, label %if.else357
+
+if.then354:                                       ; preds = %if.end352
+  %idxprom355 = sext i32 %e.14 to i64
+  br label %if.end361
+
+if.else357:                                       ; preds = %if.end352
+  %idxprom359 = sext i32 %e.14 to i64
+  br label %if.end361
+
+if.end361:                                        ; preds = %if.else357, %if.then354
+  br i1 %cmp348, label %if.then363, label %if.end366
+
+if.then363:                                       ; preds = %if.end361
+  %idxprom364 = sext i32 %e.14 to i64
+  br label %if.end366
+
+if.end366:                                        ; preds = %if.then363, %if.end361
+  br i1 %cmp348, label %if.then368, label %if.end371
+
+if.then368:                                       ; preds = %if.end366
+  %idxprom369 = sext i32 %e.14 to i64
+  br label %if.end371
+
+if.end371:                                        ; preds = %if.then368, %if.end366
+  br i1 %cmp348, label %if.then373, label %if.end376
+
+if.then373:                                       ; preds = %if.end371
+  %idxprom374 = sext i32 %e.14 to i64
+  br label %if.end376
+
+if.end376:                                        ; preds = %if.then373, %if.end371
+  br i1 %cmp348, label %if.then378, label %if.end381
+
+if.then378:                                       ; preds = %if.end376
+  %idxprom379 = sext i32 %e.14 to i64
+  br label %if.end381
+
+if.end381:                                        ; preds = %if.then378, %if.end376
+  br i1 %cmp348, label %if.then383, label %if.else386
+
+if.then383:                                       ; preds = %if.end381
+  %idxprom384 = sext i32 %e.14 to i64
+  br label %if.end390
+
+if.else386:                                       ; preds = %if.end381
+  %idxprom388 = sext i32 %e.14 to i64
+  br label %if.end390
+
+if.end390:                                        ; preds = %if.else386, %if.then383
+  %add391 = add nsw i32 %e.14, 1
+  %cmp392 = icmp sge i32 %add391, %i
+  br i1 %cmp392, label %if.then393, label %if.end628
+
+if.then393:                                       ; preds = %if.end390
+  %idxprom395 = sext i32 %add391 to i64
+  br label %if.end628
+
+if.else398:                                       ; preds = %if.end344
+  %call399 = call i32 @h()
+  %tobool400 = icmp ne i32 %call399, 0
+  br i1 %tobool400, label %if.then401, label %if.else409
+
+if.then401:                                       ; preds = %if.else398
+  %add402 = add nsw i32 %e.14, 1
+  %cmp403 = icmp sge i32 %add402, %i
+  br i1 %cmp403, label %if.then404, label %if.end628
+
+if.then404:                                       ; preds = %if.then401
+  %idxprom406 = sext i32 %add402 to i64
+  br label %if.end628
+
+if.else409:                                       ; preds = %if.else398
+  %call410 = call i32 @h()
+  %tobool411 = icmp ne i32 %call410, 0
+  br i1 %tobool411, label %if.then412, label %if.else420
+
+if.then412:                                       ; preds = %if.else409
+  %add413 = add nsw i32 %e.14, 1
+  %cmp414 = icmp sge i32 %add413, %i
+  br i1 %cmp414, label %if.then415, label %if.end628
+
+if.then415:                                       ; preds = %if.then412
+  %idxprom417 = sext i32 %add413 to i64
+  br label %if.end628
+
+if.else420:                                       ; preds = %if.else409
+  %call421 = call i32 @h()
+  %tobool422 = icmp ne i32 %call421, 0
+  br i1 %tobool422, label %if.then423, label %if.else431
+
+if.then423:                                       ; preds = %if.else420
+  %add424 = add nsw i32 %e.14, 3
+  %cmp425 = icmp sge i32 %add424, %i
+  br i1 %cmp425, label %if.then426, label %if.end628
+
+if.then426:                                       ; preds = %if.then423
+  %idxprom428 = sext i32 %add424 to i64
+  br label %if.end628
+
+if.else431:                                       ; preds = %if.else420
+  %call432 = call i32 @h()
+  %tobool433 = icmp ne i32 %call432, 0
+  br i1 %tobool433, label %if.then434, label %if.else440
+
+if.then434:                                       ; preds = %if.else431
+  %add435 = add nsw i32 %e.14, 1
+  %cmp436 = icmp sge i32 %add435, %i
+  br i1 %cmp436, label %if.then437, label %if.end628
+
+if.then437:                                       ; preds = %if.then434
+  br label %if.end628
+
+if.else440:                                       ; preds = %if.else431
+  %call441 = call i32 @h()
+  %tobool442 = icmp ne i32 %call441, 0
+  br i1 %tobool442, label %if.then443, label %if.else451
+
+if.then443:                                       ; preds = %if.else440
+  %tobool444 = icmp ne i32 %e.14, 0
+  br i1 %tobool444, label %if.then445, label %if.end628
+
+if.then445:                                       ; preds = %if.then443
+  %cmp446 = icmp sge i32 %e.14, %i
+  br i1 %cmp446, label %if.then447, label %if.end628
+
+if.then447:                                       ; preds = %if.then445
+  br label %if.end628
+
+if.else451:                                       ; preds = %if.else440
+  %call452 = call i32 @h()
+  %tobool453 = icmp ne i32 %call452, 0
+  br i1 %tobool453, label %if.then454, label %if.else460
+
+if.then454:                                       ; preds = %if.else451
+  %add455 = add nsw i32 %e.14, 1
+  %cmp456 = icmp sge i32 %add455, %i
+  br i1 %cmp456, label %if.then457, label %if.end628
+
+if.then457:                                       ; preds = %if.then454
+  br label %if.end628
+
+if.else460:                                       ; preds = %if.else451
+  %add461 = add nsw i32 %e.14, 2
+  %cmp462 = icmp sge i32 %add461, %i
+  br i1 %cmp462, label %if.then463, label %if.end628
+
+if.then463:                                       ; preds = %if.else460
+  %idxprom465 = sext i32 %add461 to i64
+  br label %if.end628
+
+if.else475:                                       ; preds = %if.else324
+  %call476 = call i32 @g()
+  %tobool477 = icmp ne i32 %call476, 0
+  br i1 %tobool477, label %if.then478, label %if.else509
+
+if.then478:                                       ; preds = %if.else475
+  %call479 = call i32 @h()
+  %tobool480 = icmp ne i32 %call479, 0
+  br i1 %tobool480, label %if.then481, label %if.else487
+
+if.then481:                                       ; preds = %if.then478
+  %add482 = add nsw i32 %e.0, 1
+  %cmp483 = icmp sge i32 %add482, %i
+  br i1 %cmp483, label %if.then484, label %if.end628
+
+if.then484:                                       ; preds = %if.then481
+  br label %if.end628
+
+if.else487:                                       ; preds = %if.then478
+  %call488 = call i32 @h()
+  %tobool489 = icmp ne i32 %call488, 0
+  br i1 %tobool489, label %if.then490, label %if.else496
+
+if.then490:                                       ; preds = %if.else487
+  %add491 = add nsw i32 %e.0, 1
+  %cmp492 = icmp sge i32 %add491, %i
+  br i1 %cmp492, label %if.then493, label %if.end628
+
+if.then493:                                       ; preds = %if.then490
+  br label %if.end628
+
+if.else496:                                       ; preds = %if.else487
+  %add497 = add nsw i32 %e.0, 1
+  %cmp498 = icmp sge i32 %add497, %i
+  br i1 %cmp498, label %if.then499, label %if.else501
+
+if.then499:                                       ; preds = %if.else496
+  br label %if.end628
+
+if.else501:                                       ; preds = %if.else496
+  br i1 undef, label %if.then503, label %if.end628
+
+if.then503:                                       ; preds = %if.else501
+  br label %if.end628
+
+if.else509:                                       ; preds = %if.else475
+  %call510 = call i32 @g()
+  %tobool511 = icmp ne i32 %call510, 0
+  br i1 %tobool511, label %if.then512, label %if.else565
+
+if.then512:                                       ; preds = %if.else509
+  %add513 = add nsw i32 %e.0, 1
+  %cmp514 = icmp sge i32 %add513, %i
+  br i1 %cmp514, label %if.then515, label %if.end519
+
+if.then515:                                       ; preds = %if.then512
+  %idxprom517 = sext i32 %add513 to i64
+  br label %if.end519
+
+if.end519:                                        ; preds = %if.then515, %if.then512
+  br i1 %cmp514, label %if.then522, label %if.end526
+
+if.then522:                                       ; preds = %if.end519
+  %idxprom524 = sext i32 %add513 to i64
+  br label %if.end526
+
+if.end526:                                        ; preds = %if.then522, %if.end519
+  br i1 %cmp514, label %if.then529, label %if.end533
+
+if.then529:                                       ; preds = %if.end526
+  %idxprom531 = sext i32 %add513 to i64
+  br label %if.end533
+
+if.end533:                                        ; preds = %if.then529, %if.end526
+  %add534 = add nsw i32 %e.0, 2
+  %cmp535 = icmp sge i32 %add534, %i
+  br i1 %cmp535, label %if.then536, label %if.end540
+
+if.then536:                                       ; preds = %if.end533
+  %idxprom538 = sext i32 %add534 to i64
+  br label %if.end540
+
+if.end540:                                        ; preds = %if.then536, %if.end533
+  br i1 %cmp535, label %if.then543, label %if.end547
+
+if.then543:                                       ; preds = %if.end540
+  %idxprom545 = sext i32 %add534 to i64
+  unreachable
+
+if.end547:                                        ; preds = %if.end540
+  br i1 %cmp514, label %if.then550, label %if.else554
+
+if.then550:                                       ; preds = %if.end547
+  %idxprom552 = sext i32 %add513 to i64
+  br label %if.end559
+
+if.else554:                                       ; preds = %if.end547
+  %idxprom557 = sext i32 %add513 to i64
+  br label %if.end559
+
+if.end559:                                        ; preds = %if.else554, %if.then550
+  br i1 %cmp514, label %if.then562, label %if.end628
+
+if.then562:                                       ; preds = %if.end559
+  br label %if.end628
+
+if.else565:                                       ; preds = %if.else509
+  %call566 = call i32 @g()
+  %tobool567 = icmp ne i32 %call566, 0
+  br i1 %tobool567, label %if.then568, label %if.else590
+
+if.then568:                                       ; preds = %if.else565
+  %add569 = add nsw i32 %e.0, 2
+  %cmp570 = icmp sge i32 %add569, %i
+  br i1 %cmp570, label %if.then571, label %if.else575
+
+if.then571:                                       ; preds = %if.then568
+  %idxprom573 = sext i32 %add569 to i64
+  br label %if.end582
+
+if.else575:                                       ; preds = %if.then568
+  %idxprom579 = sext i32 %add569 to i64
+  br label %if.end582
+
+if.end582:                                        ; preds = %if.else575, %if.then571
+  %add583 = add nsw i32 %e.0, 1
+  %cmp584 = icmp sge i32 %add583, %i
+  br i1 %cmp584, label %if.then585, label %if.end628
+
+if.then585:                                       ; preds = %if.end582
+  %idxprom587 = sext i32 %add583 to i64
+  br label %if.end628
+
+if.else590:                                       ; preds = %if.else565
+  %call591 = call i32 @g()
+  %tobool592 = icmp ne i32 %call591, 0
+  br i1 %tobool592, label %if.then593, label %if.end628
+
+if.then593:                                       ; preds = %if.else590
+  %add594 = add nsw i32 %e.0, 1
+  %cmp595 = icmp sge i32 %add594, %i
+  br i1 %cmp595, label %if.then596, label %if.else600
+
+if.then596:                                       ; preds = %if.then593
+  %idxprom598 = sext i32 %add594 to i64
+  br label %if.end628
+
+if.else600:                                       ; preds = %if.then593
+  br i1 undef, label %if.then602, label %if.end628
+
+if.then602:                                       ; preds = %if.else600
+  %idxprom604 = sext i32 %add594 to i64
+  br label %if.end628
+
+if.end628:                                        ; preds = %if.then602, %if.else600, %if.then596, %if.else590, %if.then585, %if.end582, %if.then562, %if.end559, %if.then503, %if.else501, %if.then499, %if.then493, %if.then490, %if.then484, %if.then481, %if.then463, %if.else460, %if.then457, %if.then454, %if.then447, %if.then445, %if.then443, %if.then437, %if.then434, %if.then426, %if.then423, %if.then415, %if.then412, %if.then404, %if.then401, %if.then393, %if.end390, %if.then318, %if.else316, %if.then312, %if.then301, %if.else298, %if.then295, %if.then292, %if.then284, %if.then281, %if.then274, %if.then271, %if.then263, %if.then260, %if.end254, %if.then240, %if.end237, %if.then222, %if.then219, %if.end215, %if.then200, %if.then197, %if.then190, %if.end187, %if.end160, %if.else123, %if.then120, %if.end118, %if.then71, %if.else68, %if.then65, %if.end62, %if.then53, %for.cond
+  %e.15 = phi i32 [ %e.5, %if.then53 ], [ %add63, %if.then65 ], [ %e.6, %if.end62 ], [ %e.5, %if.then71 ], [ %e.5, %if.else68 ], [ %e.9, %if.then120 ], [ %e.9, %if.end118 ], [ %e.10, %if.end160 ], [ %e.0, %if.then190 ], [ %e.0, %if.end187 ], [ %e.0, %if.then200 ], [ %e.0, %if.then197 ], [ %e.0, %if.end215 ], [ %e.0, %if.then222 ], [ %e.0, %if.then219 ], [ %e.0, %if.then240 ], [ %e.0, %if.end237 ], [ %e.0, %if.end254 ], [ %e.0, %if.then263 ], [ %e.0, %if.then260 ], [ %e.0, %if.then274 ], [ %e.0, %if.then271 ], [ %add282, %if.then284 ], [ %e.0, %if.then281 ], [ %add293, %if.then295 ], [ %e.0, %if.then292 ], [ %e.0, %if.then301 ], [ %e.0, %if.else298 ], [ %e.0, %if.then312 ], [ %e.0, %if.then318 ], [ %e.0, %if.else316 ], [ %e.14, %if.then393 ], [ %e.14, %if.end390 ], [ %e.14, %if.then404 ], [ %e.14, %if.then401 ], [ %e.14, %if.then415 ], [ %e.14, %if.then412 ], [ %e.14, %if.then426 ], [ %e.14, %if.then423 ], [ %add435, %if.then437 ], [ %e.14, %if.then434 ], [ %e.14, %if.then447 ], [ %e.14, %if.then445 ], [ %e.14, %if.then443 ], [ %add455, %if.then457 ], [ %e.14, %if.then454 ], [ %e.14, %if.then463 ], [ %e.14, %if.else460 ], [ %add482, %if.then484 ], [ %e.0, %if.then481 ], [ %add491, %if.then493 ], [ %e.0, %if.then490 ], [ %add497, %if.then499 ], [ %add497, %if.then503 ], [ %e.0, %if.else501 ], [ %add513, %if.then562 ], [ %e.0, %if.end559 ], [ %e.0, %if.then585 ], [ %e.0, %if.end582 ], [ %e.0, %if.then596 ], [ %e.0, %if.then602 ], [ %e.0, %if.else600 ], [ %e.0, %if.else590 ], [ %e.0, %if.else123 ], [ %e.0, %for.cond ]
+  br label %for.cond
+}
+
+declare i32 @g()
+
+declare i32 @h()
+
+; Function Attrs: nofree nosync nounwind willreturn
+declare void @llvm.assume(i1 noundef)
+

From 33d312b2d731507327252fd597bac1b738870330 Mon Sep 17 00:00:00 2001
From: Tom Stellard <tstellar@redhat.com>
Date: Tue, 11 May 2021 20:35:17 -0700
Subject: [PATCH 244/318] Revert "[Coverage] Fix branch coverage merging in
 FunctionCoverageSummary::get() for instantiation"

This reverts commit b89942c336a4506dabd5e1b2a6f1b5cbaddebe55.

There are issues with this patch and also no tracking bug for it.
---
 llvm/test/tools/llvm-cov/branch-templates.cpp | 16 +---------------
 llvm/tools/llvm-cov/CoverageSummaryInfo.cpp   |  6 +++++-
 llvm/tools/llvm-cov/CoverageSummaryInfo.h     |  5 -----
 3 files changed, 6 insertions(+), 21 deletions(-)

diff --git a/llvm/test/tools/llvm-cov/branch-templates.cpp b/llvm/test/tools/llvm-cov/branch-templates.cpp
index 4797428f8835..750dc7bd58f2 100644
--- a/llvm/test/tools/llvm-cov/branch-templates.cpp
+++ b/llvm/test/tools/llvm-cov/branch-templates.cpp
@@ -1,9 +1,9 @@
 // RUN: llvm-profdata merge %S/Inputs/branch-templates.proftext -o %t.profdata
 // RUN: llvm-cov show --show-expansions --show-branches=count %S/Inputs/branch-templates.o32l -instr-profile %t.profdata -path-equivalence=/tmp,%S %s | FileCheck %s
 // RUN: llvm-cov report --show-branch-summary %S/Inputs/branch-templates.o32l -instr-profile %t.profdata -show-functions -path-equivalence=/tmp,%S %s | FileCheck %s -check-prefix=REPORT
-// RUN: llvm-cov report --show-branch-summary %S/Inputs/branch-templates.o32l -instr-profile %t.profdata -path-equivalence=/tmp,%S %s | FileCheck %s -check-prefix=REPORTFILE
 
 #include <stdio.h>
+
 template<typename T>
 void unused(T x) {
   return;
@@ -45,17 +45,3 @@ int main() {
 // REPORT-NEXT: _Z4funcIfEiT_                     5       2  60.00%         7       3  57.14%         2       1  50.00%
 // REPORT-NEXT: ---
 // REPORT-NEXT: TOTAL                            22       7  68.18%        31      11  64.52%        12       6  50.00%
-
-// Make sure the covered branch tally for the function instantiation group is
-// merged to reflect maximum branch coverage of a single instantiation, just
-// like what is done for lines and regions. Also, the total branch tally
-// summary for an instantiation group should agree with the total number of
-// branches in the definition (In this case, 2 and 6 for func<>() and main(),
-// respectively).  This is returned by: FunctionCoverageSummary::get(const
-// InstantiationGroup &Group, ...)
-
-// REPORTFILE: Filename                      Regions    Missed Regions     Cover   Functions  Missed Functions  Executed       Lines      Missed Lines     Cover    Branches   Missed Branches     Cover
-// REPORTFILE-NEXT: ---
-// REPORTFILE-NEXT: branch-templates.cpp          12                 3    75.00%           2                 0   100.00%          17                 4    76.47%           8                 4    50.00%
-// REPORTFILE-NEXT: ---
-// REPORTFILE-NEXT: TOTAL                              12                 3    75.00%           2                 0   100.00%          17                 4    76.47%           8                 4    50.00%
diff --git a/llvm/tools/llvm-cov/CoverageSummaryInfo.cpp b/llvm/tools/llvm-cov/CoverageSummaryInfo.cpp
index 10e059adeb7d..4a0a86168908 100644
--- a/llvm/tools/llvm-cov/CoverageSummaryInfo.cpp
+++ b/llvm/tools/llvm-cov/CoverageSummaryInfo.cpp
@@ -100,7 +100,11 @@ FunctionCoverageSummary::get(const InstantiationGroup &Group,
   for (const auto &FCS : Summaries.drop_front()) {
     Summary.RegionCoverage.merge(FCS.RegionCoverage);
     Summary.LineCoverage.merge(FCS.LineCoverage);
-    Summary.BranchCoverage.merge(FCS.BranchCoverage);
+
+    // Sum branch coverage across instantiation groups for the summary rather
+    // than "merge" the maximum count. This is a clearer view into whether all
+    // created branches are covered.
+    Summary.BranchCoverage += FCS.BranchCoverage;
   }
   return Summary;
 }
diff --git a/llvm/tools/llvm-cov/CoverageSummaryInfo.h b/llvm/tools/llvm-cov/CoverageSummaryInfo.h
index 62e7cad1012b..4bc1c24a079f 100644
--- a/llvm/tools/llvm-cov/CoverageSummaryInfo.h
+++ b/llvm/tools/llvm-cov/CoverageSummaryInfo.h
@@ -123,11 +123,6 @@ class BranchCoverageInfo {
     return *this;
   }
 
-  void merge(const BranchCoverageInfo &RHS) {
-    Covered = std::max(Covered, RHS.Covered);
-    NumBranches = std::max(NumBranches, RHS.NumBranches);
-  }
-
   size_t getCovered() const { return Covered; }
 
   size_t getNumBranches() const { return NumBranches; }

From 6912082cfd129bbc2bd60f293371e20140d50b86 Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Tue, 20 Apr 2021 11:23:10 -0700
Subject: [PATCH 245/318] [ELF] Don't set versionId on undefined weak lazy
 symbols

An unfetched lazy symbol (undefined weak) should be considered to have its
original versionId which is VER_NDX_GLOBAL, instead of the lazy symbol's
versionId. (The original versionId cannot be non-VER_NDX_GLOBAL because a
undefined versioned symbol is an error.)

The regression was introduced in D77280 when making version scripts work
with lazy symbols fetched by LTO calls.

Fix PR49915

Differential Revision: https://reviews.llvm.org/D100624

(cherry picked from commit 1c00530b30e21fd0f5b316401f6485bee08ce850)
---
 lld/ELF/SyntheticSections.cpp      |  4 +++-
 lld/test/ELF/version-script-weak.s | 13 +++++++++++++
 2 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/lld/ELF/SyntheticSections.cpp b/lld/ELF/SyntheticSections.cpp
index 9a875bd7ec3e..70c36c63d101 100644
--- a/lld/ELF/SyntheticSections.cpp
+++ b/lld/ELF/SyntheticSections.cpp
@@ -3110,7 +3110,9 @@ size_t VersionTableSection::getSize() const {
 void VersionTableSection::writeTo(uint8_t *buf) {
   buf += 2;
   for (const SymbolTableEntry &s : getPartition().dynSymTab->getSymbols()) {
-    write16(buf, s.sym->versionId);
+    // Use the original versionId for an unfetched lazy symbol (undefined weak),
+    // which must be VER_NDX_GLOBAL (an undefined versioned symbol is an error).
+    write16(buf, s.sym->isLazy() ? VER_NDX_GLOBAL : s.sym->versionId);
     buf += 2;
   }
 }
diff --git a/lld/test/ELF/version-script-weak.s b/lld/test/ELF/version-script-weak.s
index 7c902eb98bf4..cfa2455ee2bd 100644
--- a/lld/test/ELF/version-script-weak.s
+++ b/lld/test/ELF/version-script-weak.s
@@ -24,6 +24,19 @@
 # CHECK-NEXT:   Section: Undefined
 # CHECK-NEXT: }
 
+## The version of an unfetched lazy symbol is VER_NDX_GLOBAL. It is not affected by version scripts.
+# RUN: echo "v1 { *; };" > %t2.script
+# RUN: ld.lld -shared --version-script %t2.script %t.o --start-lib %t1.o --end-lib -o %t2.so
+# RUN: llvm-readelf --dyn-syms %t2.so | FileCheck %s --check-prefix=CHECK2
+
+# CHECK2: NOTYPE WEAK DEFAULT UND foo{{$}}
+
+# RUN: ld.lld -shared --soname=tshared --version-script %t2.script %t1.o -o %tshared.so
+# RUN: ld.lld -shared --version-script %t2.script %t.o --start-lib %t1.o --end-lib %tshared.so -o %t3.so
+# RUN: llvm-readelf --dyn-syms %t3.so | FileCheck %s --check-prefix=CHECK3
+
+# CHECK3: NOTYPE WEAK DEFAULT UND foo@v1
+
 .text
  callq foo@PLT
 .weak foo

From 0ef78361565a861cac846b7c1f807dc2d278145d Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Tue, 4 May 2021 11:24:14 +0100
Subject: [PATCH 246/318] [IndVarSimplify] Add additional tests using
 isImpliedViaMerge.

(cherry picked from commit d65e5f60f110046898ad146c508a7d225d398549)
---
 .../IndVarSimplify/eliminate-exit.ll          | 55 ++++++++++++++++
 .../promote-iv-to-eliminate-casts.ll          | 62 +++++++++++++++++++
 2 files changed, 117 insertions(+)

diff --git a/llvm/test/Transforms/IndVarSimplify/eliminate-exit.ll b/llvm/test/Transforms/IndVarSimplify/eliminate-exit.ll
index ddf8ada68e95..eec7908b6a8b 100644
--- a/llvm/test/Transforms/IndVarSimplify/eliminate-exit.ll
+++ b/llvm/test/Transforms/IndVarSimplify/eliminate-exit.ll
@@ -436,3 +436,58 @@ exit:
 }
 
 declare void @side_effect()
+
+; The exit condition %outer.cond.1 depends on a phi in %inner. Make sure we do
+; not incorrectly determine %x.lcssa <= -1.
+define i32 @exit_cond_depends_on_inner_loop() {
+; CHECK-LABEL: @exit_cond_depends_on_inner_loop(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[OUTER_HEADER:%.*]]
+; CHECK:       outer.header:
+; CHECK-NEXT:    [[IV_OUTER:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_OUTER_NEXT:%.*]], [[OUTER_LATCH:%.*]] ]
+; CHECK-NEXT:    br label [[INNER:%.*]]
+; CHECK:       inner:
+; CHECK-NEXT:    [[X:%.*]] = phi i32 [ -1, [[OUTER_HEADER]] ], [ [[CALL:%.*]], [[INNER]] ]
+; CHECK-NEXT:    [[CALL]] = call i32 @match()
+; CHECK-NEXT:    [[INNER_COND:%.*]] = icmp sgt i32 [[CALL]], -1
+; CHECK-NEXT:    br i1 [[INNER_COND]], label [[INNER]], label [[OUTER_EXITING_1:%.*]]
+; CHECK:       outer.exiting.1:
+; CHECK-NEXT:    [[X_LCSSA:%.*]] = phi i32 [ [[X]], [[INNER]] ]
+; CHECK-NEXT:    br i1 false, label [[EXIT:%.*]], label [[OUTER_LATCH]]
+; CHECK:       outer.latch:
+; CHECK-NEXT:    [[IV_OUTER_NEXT]] = add nuw nsw i32 [[IV_OUTER]], 1
+; CHECK-NEXT:    [[OUTER_COND_2:%.*]] = icmp ult i32 [[IV_OUTER]], 100
+; CHECK-NEXT:    br i1 [[OUTER_COND_2]], label [[OUTER_HEADER]], label [[EXIT]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[X_RES:%.*]] = phi i32 [ [[X_LCSSA]], [[OUTER_EXITING_1]] ], [ -1, [[OUTER_LATCH]] ]
+; CHECK-NEXT:    ret i32 [[X_RES]]
+;
+entry:
+  br label %outer.header
+
+outer.header:
+  %iv.outer = phi i32 [ 0, %entry ], [ %iv.outer.next , %outer.latch ]
+  br label %inner
+
+inner:
+  %x = phi i32 [ -1, %outer.header ], [ %call, %inner ]
+  %call = call i32 @match()
+  %inner.cond = icmp sgt i32 %call, -1
+  br i1 %inner.cond, label %inner, label %outer.exiting.1
+
+outer.exiting.1:
+  %x.lcssa = phi i32 [ %x, %inner ]
+  %outer.cond.1 = icmp sgt i32 %x.lcssa, -1
+  br i1 %outer.cond.1, label %exit, label %outer.latch
+
+outer.latch:
+  %iv.outer.next = add nuw nsw i32 %iv.outer, 1
+  %outer.cond.2 = icmp ult i32 %iv.outer, 100
+  br i1 %outer.cond.2, label %outer.header, label %exit
+
+exit:
+  %x.res = phi i32 [ %x.lcssa, %outer.exiting.1 ], [ -1, %outer.latch ]
+  ret i32 %x.res
+}
+
+declare i32 @match()
diff --git a/llvm/test/Transforms/IndVarSimplify/promote-iv-to-eliminate-casts.ll b/llvm/test/Transforms/IndVarSimplify/promote-iv-to-eliminate-casts.ll
index 5cc288c58e68..2ecea576c380 100644
--- a/llvm/test/Transforms/IndVarSimplify/promote-iv-to-eliminate-casts.ll
+++ b/llvm/test/Transforms/IndVarSimplify/promote-iv-to-eliminate-casts.ll
@@ -373,4 +373,66 @@ loop:
   br i1 %loopcond, label %loopexit, label %loop
 }
 
+define void @promote_latch_condition_decrementing_loop_05(i32* %p, i32* %a, i1 %cond) {
+; CHECK-LABEL: @promote_latch_condition_decrementing_loop_05(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[LEN:%.*]] = load i32, i32* [[P:%.*]], align 4, [[RNG0]]
+; CHECK-NEXT:    br i1 [[COND:%.*]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]]
+; CHECK:       if.true:
+; CHECK-NEXT:    br label [[MERGE:%.*]]
+; CHECK:       if.false:
+; CHECK-NEXT:    [[LEN_MINUS_1:%.*]] = add nsw i32 [[LEN]], -1
+; CHECK-NEXT:    br label [[MERGE]]
+; CHECK:       merge:
+; CHECK-NEXT:    [[IV_START:%.*]] = phi i32 [ [[LEN]], [[IF_TRUE]] ], [ [[LEN_MINUS_1]], [[IF_FALSE]] ]
+; CHECK-NEXT:    [[ZERO_CHECK:%.*]] = icmp eq i32 [[LEN]], 0
+; CHECK-NEXT:    br i1 [[ZERO_CHECK]], label [[LOOPEXIT:%.*]], label [[PREHEADER:%.*]]
+; CHECK:       preheader:
+; CHECK-NEXT:    [[TMP0:%.*]] = zext i32 [[IV_START]] to i64
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loopexit.loopexit:
+; CHECK-NEXT:    br label [[LOOPEXIT]]
+; CHECK:       loopexit:
+; CHECK-NEXT:    ret void
+; CHECK:       loop:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[LOOP]] ], [ [[TMP0]], [[PREHEADER]] ]
+; CHECK-NEXT:    [[EL:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    store atomic i32 0, i32* [[EL]] unordered, align 4
+; CHECK-NEXT:    [[LOOPCOND:%.*]] = icmp slt i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1
+; CHECK-NEXT:    br i1 [[LOOPCOND]], label [[LOOPEXIT_LOOPEXIT:%.*]], label [[LOOP]]
+;
+
+entry:
+  %len = load i32, i32* %p, align 4, !range !0
+  br i1 %cond, label %if.true, label %if.false
+
+if.true:
+  br label %merge
+
+if.false:
+  %len.minus.1 = add nsw i32 %len, -1
+  br label %merge
+
+merge:
+  %iv_start = phi i32 [ %len, %if.true ], [%len.minus.1, %if.false ]
+  %zero_check = icmp eq i32 %len, 0
+  br i1 %zero_check, label %loopexit, label %preheader
+
+preheader:
+  br label %loop
+
+loopexit:
+  ret void
+
+loop:
+  %iv = phi i32 [ %iv.next, %loop ], [ %iv_start, %preheader ]
+  %iv.wide = zext i32 %iv to i64
+  %el = getelementptr inbounds i32, i32* %a, i64 %iv.wide
+  store atomic i32 0, i32* %el unordered, align 4
+  %iv.next = add nsw i32 %iv, -1
+  %loopcond = icmp slt i32 %iv, 1
+  br i1 %loopcond, label %loopexit, label %loop
+}
+
 !0 = !{i32 0, i32 2147483647}

From 4e46ff469405bc73ec25fcf78126fb5fbd7a18a1 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Fri, 7 May 2021 19:39:05 +0100
Subject: [PATCH 247/318] [SCEV] By more careful when traversing phis in
 isImpliedViaMerge.

I think currently isImpliedViaMerge can incorrectly return true for phis
in a loop/cycle, if the found condition involves the previous value of

Consider the case in exit_cond_depends_on_inner_loop.

At some point, we call (modulo simplifications)
isImpliedViaMerge(<=, %x.lcssa, -1, %call, -1).

The existing code tries to prove IncV <= -1 for all incoming values
InvV using the found condition (%call <= -1). At the moment this succeeds,
but only because it does not compare the same runtime value. The found
condition checks the value of the last iteration, but the incoming value
is from the *previous* iteration.

Hence we incorrectly determine that the *previous* value was <= -1,
which may not be true.

I think we need to be more careful when looking at the incoming values
here. In particular, we need to rule out that a found condition refers to
any value that may refer to one of the previous iterations. I'm not sure
there's a reliable way to do so (that also works of irreducible control
flow).

So for now this patch adds an additional requirement that the incoming
value must properly dominate the phi block. This should ensure the
values do not change in a cycle. I am not entirely sure if will catch
all cases and I appreciate a through second look in that regard.

Alternatively we could also unconditionally bail out in this case,
instead of checking the incoming values

Reviewed By: nikic

Differential Revision: https://reviews.llvm.org/D101829

(cherry picked from commit 6c99e631201aaea0a75708749cbaf2ba08a493f9)
---
 llvm/lib/Analysis/ScalarEvolution.cpp                 |  4 ++++
 llvm/test/Transforms/IRCE/decrementing-loop.ll        | 11 ++++++-----
 llvm/test/Transforms/IndVarSimplify/eliminate-exit.ll |  3 ++-
 3 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp
index fe9d8297d679..1a9ae68573e9 100644
--- a/llvm/lib/Analysis/ScalarEvolution.cpp
+++ b/llvm/lib/Analysis/ScalarEvolution.cpp
@@ -10622,6 +10622,10 @@ bool ScalarEvolution::isImpliedViaMerge(ICmpInst::Predicate Pred,
       if (!dominates(RHS, IncBB))
         return false;
       const SCEV *L = getSCEV(LPhi->getIncomingValueForBlock(IncBB));
+      // Make sure L does not refer to a value from a potentially previous
+      // iteration of a loop.
+      if (!properlyDominates(L, IncBB))
+        return false;
       if (!ProvedEasily(L, RHS))
         return false;
     }
diff --git a/llvm/test/Transforms/IRCE/decrementing-loop.ll b/llvm/test/Transforms/IRCE/decrementing-loop.ll
index a824522cf206..d809fb4f7d97 100644
--- a/llvm/test/Transforms/IRCE/decrementing-loop.ll
+++ b/llvm/test/Transforms/IRCE/decrementing-loop.ll
@@ -212,16 +212,17 @@ exit:
   ret void
 }
 
+; TODO: we need to be more careful when trying to look through phi nodes in
+; cycles, because the condition to prove may reference the previous value of
+; the phi. So we currently fail to optimize this case.
 ; Check that we can figure out that IV is non-negative via implication through
 ; two Phi nodes, one being AddRec.
 define void @test_05(i32* %a, i32* %a_len_ptr, i1 %cond) {
 
 ; CHECK-LABEL: test_05
-; CHECK:       mainloop:
-; CHECK-NEXT:    br label %loop
-; CHECK:       loop:
-; CHECK:         br i1 true, label %in.bounds, label %out.of.bounds
-; CHECK:       loop.preloop:
+; CHECK: entry:
+; CHECK:   br label %merge
+; CHECK-NOT: mainloop
 
  entry:
   %len.a = load i32, i32* %a_len_ptr, !range !0
diff --git a/llvm/test/Transforms/IndVarSimplify/eliminate-exit.ll b/llvm/test/Transforms/IndVarSimplify/eliminate-exit.ll
index eec7908b6a8b..e574c2f84ea3 100644
--- a/llvm/test/Transforms/IndVarSimplify/eliminate-exit.ll
+++ b/llvm/test/Transforms/IndVarSimplify/eliminate-exit.ll
@@ -453,7 +453,8 @@ define i32 @exit_cond_depends_on_inner_loop() {
 ; CHECK-NEXT:    br i1 [[INNER_COND]], label [[INNER]], label [[OUTER_EXITING_1:%.*]]
 ; CHECK:       outer.exiting.1:
 ; CHECK-NEXT:    [[X_LCSSA:%.*]] = phi i32 [ [[X]], [[INNER]] ]
-; CHECK-NEXT:    br i1 false, label [[EXIT:%.*]], label [[OUTER_LATCH]]
+; CHECK-NEXT:    [[OUTER_COND_1:%.*]] = icmp sgt i32 [[X_LCSSA]], -1
+; CHECK-NEXT:    br i1 [[OUTER_COND_1]], label [[EXIT:%.*]], label [[OUTER_LATCH]]
 ; CHECK:       outer.latch:
 ; CHECK-NEXT:    [[IV_OUTER_NEXT]] = add nuw nsw i32 [[IV_OUTER]], 1
 ; CHECK-NEXT:    [[OUTER_COND_2:%.*]] = icmp ult i32 [[IV_OUTER]], 100

From 4eb7b15cb447b339e82bd320adf4a09ca64ab839 Mon Sep 17 00:00:00 2001
From: Nikita Popov <nikita.ppv@gmail.com>
Date: Sat, 8 May 2021 17:05:05 +0200
Subject: [PATCH 248/318] [Inliner] Fix noalias metadata handling for
 instructions simplified during cloning (PR50270)

Instead of using VMap, which may include instructions from the
caller as a result of simplification, iterate over the
(FirstNewBlock, Caller->end()) range, which will only include new
instructions.

Fixes https://bugs.llvm.org/show_bug.cgi?id=50270.

Differential Revision: https://reviews.llvm.org/D102110

(cherry picked from commit aa9b02ac75350a6c7c949dd24d5c6a931be26ff9)
---
 llvm/lib/Transforms/Utils/InlineFunction.cpp | 106 ++++++++-----------
 llvm/test/Transforms/Inline/pr50270.ll       |  71 +++++++++++++
 2 files changed, 117 insertions(+), 60 deletions(-)
 create mode 100644 llvm/test/Transforms/Inline/pr50270.ll

diff --git a/llvm/lib/Transforms/Utils/InlineFunction.cpp b/llvm/lib/Transforms/Utils/InlineFunction.cpp
index 3026342cc4a6..fb271a2118ba 100644
--- a/llvm/lib/Transforms/Utils/InlineFunction.cpp
+++ b/llvm/lib/Transforms/Utils/InlineFunction.cpp
@@ -780,7 +780,8 @@ static void HandleInlinedEHPad(InvokeInst *II, BasicBlock *FirstNewBlock,
 /// When inlining a call site that has !llvm.mem.parallel_loop_access,
 /// !llvm.access.group, !alias.scope or !noalias metadata, that metadata should
 /// be propagated to all memory-accessing cloned instructions.
-static void PropagateCallSiteMetadata(CallBase &CB, ValueToValueMapTy &VMap) {
+static void PropagateCallSiteMetadata(CallBase &CB, Function::iterator FStart,
+                                      Function::iterator FEnd) {
   MDNode *MemParallelLoopAccess =
       CB.getMetadata(LLVMContext::MD_mem_parallel_loop_access);
   MDNode *AccessGroup = CB.getMetadata(LLVMContext::MD_access_group);
@@ -789,41 +790,33 @@ static void PropagateCallSiteMetadata(CallBase &CB, ValueToValueMapTy &VMap) {
   if (!MemParallelLoopAccess && !AccessGroup && !AliasScope && !NoAlias)
     return;
 
-  for (ValueToValueMapTy::iterator VMI = VMap.begin(), VMIE = VMap.end();
-       VMI != VMIE; ++VMI) {
-    // Check that key is an instruction, to skip the Argument mapping, which
-    // points to an instruction in the original function, not the inlined one.
-    if (!VMI->second || !isa<Instruction>(VMI->first))
-      continue;
-
-    Instruction *NI = dyn_cast<Instruction>(VMI->second);
-    if (!NI)
-      continue;
-
-    // This metadata is only relevant for instructions that access memory.
-    if (!NI->mayReadOrWriteMemory())
-      continue;
+  for (BasicBlock &BB : make_range(FStart, FEnd)) {
+    for (Instruction &I : BB) {
+      // This metadata is only relevant for instructions that access memory.
+      if (!I.mayReadOrWriteMemory())
+        continue;
 
-    if (MemParallelLoopAccess) {
-      // TODO: This probably should not overwrite MemParalleLoopAccess.
-      MemParallelLoopAccess = MDNode::concatenate(
-          NI->getMetadata(LLVMContext::MD_mem_parallel_loop_access),
-          MemParallelLoopAccess);
-      NI->setMetadata(LLVMContext::MD_mem_parallel_loop_access,
+      if (MemParallelLoopAccess) {
+        // TODO: This probably should not overwrite MemParalleLoopAccess.
+        MemParallelLoopAccess = MDNode::concatenate(
+            I.getMetadata(LLVMContext::MD_mem_parallel_loop_access),
+            MemParallelLoopAccess);
+        I.setMetadata(LLVMContext::MD_mem_parallel_loop_access,
                       MemParallelLoopAccess);
-    }
+      }
 
-    if (AccessGroup)
-      NI->setMetadata(LLVMContext::MD_access_group, uniteAccessGroups(
-          NI->getMetadata(LLVMContext::MD_access_group), AccessGroup));
+      if (AccessGroup)
+        I.setMetadata(LLVMContext::MD_access_group, uniteAccessGroups(
+            I.getMetadata(LLVMContext::MD_access_group), AccessGroup));
 
-    if (AliasScope)
-      NI->setMetadata(LLVMContext::MD_alias_scope, MDNode::concatenate(
-          NI->getMetadata(LLVMContext::MD_alias_scope), AliasScope));
+      if (AliasScope)
+        I.setMetadata(LLVMContext::MD_alias_scope, MDNode::concatenate(
+            I.getMetadata(LLVMContext::MD_alias_scope), AliasScope));
 
-    if (NoAlias)
-      NI->setMetadata(LLVMContext::MD_noalias, MDNode::concatenate(
-          NI->getMetadata(LLVMContext::MD_noalias), NoAlias));
+      if (NoAlias)
+        I.setMetadata(LLVMContext::MD_noalias, MDNode::concatenate(
+            I.getMetadata(LLVMContext::MD_noalias), NoAlias));
+    }
   }
 }
 
@@ -844,9 +837,9 @@ class ScopedAliasMetadataDeepCloner {
   /// subsequent remap() calls.
   void clone();
 
-  /// Remap instructions in the given VMap from the original to the cloned
+  /// Remap instructions in the given range from the original to the cloned
   /// metadata.
-  void remap(ValueToValueMapTy &VMap);
+  void remap(Function::iterator FStart, Function::iterator FEnd);
 };
 
 ScopedAliasMetadataDeepCloner::ScopedAliasMetadataDeepCloner(
@@ -907,34 +900,27 @@ void ScopedAliasMetadataDeepCloner::clone() {
   }
 }
 
-void ScopedAliasMetadataDeepCloner::remap(ValueToValueMapTy &VMap) {
+void ScopedAliasMetadataDeepCloner::remap(Function::iterator FStart,
+                                          Function::iterator FEnd) {
   if (MDMap.empty())
     return; // Nothing to do.
 
-  for (auto Entry : VMap) {
-    // Check that key is an instruction, to skip the Argument mapping, which
-    // points to an instruction in the original function, not the inlined one.
-    if (!Entry->second || !isa<Instruction>(Entry->first))
-      continue;
-
-    Instruction *I = dyn_cast<Instruction>(Entry->second);
-    if (!I)
-      continue;
-
-    // Only update scopes when we find them in the map. If they are not, it is
-    // because we already handled that instruction before. This is faster than
-    // tracking which instructions we already updated.
-    if (MDNode *M = I->getMetadata(LLVMContext::MD_alias_scope))
-      if (MDNode *MNew = MDMap.lookup(M))
-        I->setMetadata(LLVMContext::MD_alias_scope, MNew);
-
-    if (MDNode *M = I->getMetadata(LLVMContext::MD_noalias))
-      if (MDNode *MNew = MDMap.lookup(M))
-        I->setMetadata(LLVMContext::MD_noalias, MNew);
-
-    if (auto *Decl = dyn_cast<NoAliasScopeDeclInst>(I))
-      if (MDNode *MNew = MDMap.lookup(Decl->getScopeList()))
-        Decl->setScopeList(MNew);
+  for (BasicBlock &BB : make_range(FStart, FEnd)) {
+    for (Instruction &I : BB) {
+      // TODO: The null checks for the MDMap.lookup() results should no longer
+      // be necessary.
+      if (MDNode *M = I.getMetadata(LLVMContext::MD_alias_scope))
+        if (MDNode *MNew = MDMap.lookup(M))
+          I.setMetadata(LLVMContext::MD_alias_scope, MNew);
+
+      if (MDNode *M = I.getMetadata(LLVMContext::MD_noalias))
+        if (MDNode *MNew = MDMap.lookup(M))
+          I.setMetadata(LLVMContext::MD_noalias, MNew);
+
+      if (auto *Decl = dyn_cast<NoAliasScopeDeclInst>(&I))
+        if (MDNode *MNew = MDMap.lookup(Decl->getScopeList()))
+          Decl->setScopeList(MNew);
+    }
   }
 }
 
@@ -1926,7 +1912,7 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI,
 
     // Now clone the inlined noalias scope metadata.
     SAMetadataCloner.clone();
-    SAMetadataCloner.remap(VMap);
+    SAMetadataCloner.remap(FirstNewBlock, Caller->end());
 
     // Add noalias metadata if necessary.
     AddAliasScopeMetadata(CB, VMap, DL, CalleeAAR);
@@ -1936,7 +1922,7 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI,
     AddReturnAttributes(CB, VMap);
 
     // Propagate metadata on the callsite if necessary.
-    PropagateCallSiteMetadata(CB, VMap);
+    PropagateCallSiteMetadata(CB, FirstNewBlock, Caller->end());
 
     // Register any cloned assumptions.
     if (IFI.GetAssumptionCache)
diff --git a/llvm/test/Transforms/Inline/pr50270.ll b/llvm/test/Transforms/Inline/pr50270.ll
new file mode 100644
index 000000000000..be7c3379ce87
--- /dev/null
+++ b/llvm/test/Transforms/Inline/pr50270.ll
@@ -0,0 +1,71 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -inline < %s | FileCheck %s
+
+; This tests cases where instructions in the callee are simplified to
+; instructions in the caller, thus making VMap contain instructions from
+; the caller. We should not be assigning incorrect noalias metadata in
+; that case.
+
+declare { i64* } @opaque_callee()
+
+define { i64* } @callee(i64* %x) {
+; CHECK-LABEL: @callee(
+; CHECK-NEXT:    [[RES:%.*]] = insertvalue { i64* } undef, i64* [[X:%.*]], 0
+; CHECK-NEXT:    ret { i64* } [[RES]]
+;
+  %res = insertvalue { i64* } undef, i64* %x, 0
+  ret { i64* } %res
+}
+
+; @opaque_callee() should not receive noalias metadata here.
+define void @caller() {
+; CHECK-LABEL: @caller(
+; CHECK-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata !0)
+; CHECK-NEXT:    [[S:%.*]] = call { i64* } @opaque_callee()
+; CHECK-NEXT:    [[X:%.*]] = extractvalue { i64* } [[S]], 0
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.experimental.noalias.scope.decl(metadata !0)
+  %s = call { i64* } @opaque_callee()
+  %x = extractvalue { i64* } %s, 0
+  call { i64* } @callee(i64* %x), !noalias !0
+  ret void
+}
+
+; @opaque_callee() should no the same noalias metadata as the load from the
+; else branch, not as the load in the if branch.
+define { i64* } @self_caller(i1 %c, i64* %a) {
+; CHECK-LABEL: @self_caller(
+; CHECK-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata !0)
+; CHECK-NEXT:    br i1 [[C:%.*]], label [[IF:%.*]], label [[ELSE:%.*]]
+; CHECK:       if:
+; CHECK-NEXT:    [[S:%.*]] = call { i64* } @opaque_callee(), !noalias !0
+; CHECK-NEXT:    [[X:%.*]] = extractvalue { i64* } [[S]], 0
+; CHECK-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata !3)
+; CHECK-NEXT:    [[TMP1:%.*]] = load volatile i64, i64* [[X]], align 4, !alias.scope !3
+; CHECK-NEXT:    ret { i64* } [[S]]
+; CHECK:       else:
+; CHECK-NEXT:    [[R2:%.*]] = insertvalue { i64* } undef, i64* [[A:%.*]], 0
+; CHECK-NEXT:    [[TMP2:%.*]] = load volatile i64, i64* [[A]], align 4, !alias.scope !0
+; CHECK-NEXT:    ret { i64* } [[R2]]
+;
+  call void @llvm.experimental.noalias.scope.decl(metadata !0)
+  br i1 %c, label %if, label %else
+
+if:
+  %s = call { i64* } @opaque_callee(), !noalias !0
+  %x = extractvalue { i64* } %s, 0
+  %r = call { i64* } @self_caller(i1 false, i64* %x)
+  ret { i64* } %r
+
+else:
+  %r2 = insertvalue { i64* } undef, i64* %a, 0
+  load volatile i64, i64* %a, !alias.scope !0
+  ret { i64* } %r2
+}
+
+declare void @llvm.experimental.noalias.scope.decl(metadata)
+
+!0 = !{!1}
+!1 = !{!1, !2, !"scope"}
+!2 = !{!2, !"domain"}

From 877a07bfb3b9a0bbaa3b63a222448b11d85821f0 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Fri, 26 Mar 2021 17:29:36 -0400
Subject: [PATCH 249/318] GlobalISel: Restrict narrow scalar for fptoui/fptosi
 results

This practically only works for the f16 case AMDGPU uses, not wider
types.

Fixes bug 49710 by failing legalization.

(cherry picked from commit 83a25a101051b404bec1a5ba9cb867705f31262d)
---
 .../llvm/CodeGen/GlobalISel/LegalizerHelper.h |  1 +
 .../CodeGen/GlobalISel/LegalizerHelper.cpp    | 44 ++++++++++++-------
 .../AArch64/GlobalISel/legalize-fptoi.mir     | 28 ++++++++++++
 3 files changed, 57 insertions(+), 16 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
index c3b494e94ff1..4a982b00125d 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
@@ -316,6 +316,7 @@ class LegalizerHelper {
 
   LegalizeResult narrowScalarShift(MachineInstr &MI, unsigned TypeIdx, LLT Ty);
   LegalizeResult narrowScalarMul(MachineInstr &MI, LLT Ty);
+  LegalizeResult narrowScalarFPTOI(MachineInstr &MI, unsigned TypeIdx, LLT Ty);
   LegalizeResult narrowScalarExtract(MachineInstr &MI, unsigned TypeIdx, LLT Ty);
   LegalizeResult narrowScalarInsert(MachineInstr &MI, unsigned TypeIdx, LLT Ty);
 
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index 3178ee16af2b..66871ca3b926 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -1257,22 +1257,9 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,
     Observer.changedInstr(MI);
     return Legalized;
   }
-  case TargetOpcode::G_FPTOUI: {
-    if (TypeIdx != 0)
-      return UnableToLegalize;
-    Observer.changingInstr(MI);
-    narrowScalarDst(MI, NarrowTy, 0, TargetOpcode::G_ZEXT);
-    Observer.changedInstr(MI);
-    return Legalized;
-  }
-  case TargetOpcode::G_FPTOSI: {
-    if (TypeIdx != 0)
-      return UnableToLegalize;
-    Observer.changingInstr(MI);
-    narrowScalarDst(MI, NarrowTy, 0, TargetOpcode::G_SEXT);
-    Observer.changedInstr(MI);
-    return Legalized;
-  }
+  case TargetOpcode::G_FPTOUI:
+  case TargetOpcode::G_FPTOSI:
+    return narrowScalarFPTOI(MI, TypeIdx, NarrowTy);
   case TargetOpcode::G_FPEXT:
     if (TypeIdx != 0)
       return UnableToLegalize;
@@ -4496,6 +4483,31 @@ LegalizerHelper::narrowScalarMul(MachineInstr &MI, LLT NarrowTy) {
   return Legalized;
 }
 
+LegalizerHelper::LegalizeResult
+LegalizerHelper::narrowScalarFPTOI(MachineInstr &MI, unsigned TypeIdx,
+                                   LLT NarrowTy) {
+  if (TypeIdx != 0)
+    return UnableToLegalize;
+
+  bool IsSigned = MI.getOpcode() == TargetOpcode::G_FPTOSI;
+
+  Register Src = MI.getOperand(1).getReg();
+  LLT SrcTy = MRI.getType(Src);
+
+  // If all finite floats fit into the narrowed integer type, we can just swap
+  // out the result type. This is practically only useful for conversions from
+  // half to at least 16-bits, so just handle the one case.
+  if (SrcTy.getScalarType() != LLT::scalar(16) ||
+      NarrowTy.getScalarSizeInBits() < (IsSigned ? 17 : 16))
+    return UnableToLegalize;
+
+  Observer.changingInstr(MI);
+  narrowScalarDst(MI, NarrowTy, 0,
+                  IsSigned ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT);
+  Observer.changedInstr(MI);
+  return Legalized;
+}
+
 LegalizerHelper::LegalizeResult
 LegalizerHelper::narrowScalarExtract(MachineInstr &MI, unsigned TypeIdx,
                                      LLT NarrowTy) {
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-fptoi.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-fptoi.mir
index 9bc639679bea..c82cedd08580 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-fptoi.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-fptoi.mir
@@ -265,3 +265,31 @@ body: |
     %1:_(<4 x s32>) = G_FPTOSI %0
     $q0 = COPY %1
 ...
+
+---
+name:            test_fptoui_s128_s32
+body: |
+  bb.0:
+    liveins: $w0
+    ; CHECK-LABEL: name: test_fptoui_s128_s32
+    ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+    ; CHECK: [[FPTOUI:%[0-9]+]]:_(s128) = G_FPTOUI [[COPY]](s32)
+    ; CHECK: $q0 = COPY [[FPTOUI]](s128)
+    %0:_(s32) = COPY $w0
+    %1:_(s128) = G_FPTOUI %0
+    $q0 = COPY %1
+...
+
+---
+name:            test_fptosi_s128_s32
+body: |
+  bb.0:
+    liveins: $w0
+    ; CHECK-LABEL: name: test_fptosi_s128_s32
+    ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+    ; CHECK: [[FPTOSI:%[0-9]+]]:_(s128) = G_FPTOSI [[COPY]](s32)
+    ; CHECK: $q0 = COPY [[FPTOSI]](s128)
+    %0:_(s32) = COPY $w0
+    %1:_(s128) = G_FPTOSI %0
+    $q0 = COPY %1
+...

From 6336c6eec1a1c12205d102c5555101c100c5ec73 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Sat, 27 Mar 2021 10:39:27 -0400
Subject: [PATCH 250/318] AArch64/GlobalISel: Remove IR section from test

(cherry picked from commit 2f779e79d50114830c02cdb9e77bd851e13d9fc1)
---
 .../AArch64/GlobalISel/legalize-fptoi.mir     | 29 +------------------
 1 file changed, 1 insertion(+), 28 deletions(-)

diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-fptoi.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-fptoi.mir
index c82cedd08580..b2ee3a6cc777 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-fptoi.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-fptoi.mir
@@ -1,32 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -O0 -run-pass=legalizer %s -o - | FileCheck %s
-
---- |
-  target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
-  target triple = "aarch64--"
-
-  define void @test_fptosi_s32_s32() { ret void }
-  define void @test_fptoui_s32_s32() { ret void }
-  define void @test_fptosi_s32_s64() { ret void }
-  define void @test_fptoui_s32_s64() { ret void }
-
-  define void @test_fptosi_s64_s32() { ret void }
-  define void @test_fptoui_s64_s32() { ret void }
-  define void @test_fptosi_s64_s64() { ret void }
-  define void @test_fptoui_s64_s64() { ret void }
-
-  define void @test_fptosi_s1_s32() { ret void }
-  define void @test_fptoui_s1_s32() { ret void }
-
-  define void @test_fptosi_s8_s64() { ret void }
-  define void @test_fptoui_s8_s64() { ret void }
-
-  define void @test_fptosi_s16_s32() { ret void }
-  define void @test_fptoui_s16_s32() { ret void }
-
-  define void @test_fptoui_v4s32() { ret void }
-  define void @test_fptosi_v4s32() { ret void }
-...
+# RUN: llc -mtriple=aarch64-- -O0 -run-pass=legalizer %s -o - | FileCheck %s
 
 ---
 name:            test_fptosi_s32_s32

From de579bae6eabd02b815e549776b8c680957a0769 Mon Sep 17 00:00:00 2001
From: Nick Desaulniers <ndesaulniers@google.com>
Date: Fri, 14 May 2021 12:36:41 -0700
Subject: [PATCH 251/318] [LowerConstantIntrinsics] reuse isManifestLogic from
 ConstantFolding

GlobalVariables are Constants, yet should not unconditionally be
considered true for __builtin_constant_p.

Via the LangRef
https://llvm.org/docs/LangRef.html#llvm-is-constant-intrinsic:

    This intrinsic generates no code. If its argument is known to be a
    manifest compile-time constant value, then the intrinsic will be
    converted to a constant true value. Otherwise, it will be converted
    to a constant false value.

    In particular, note that if the argument is a constant expression
    which refers to a global (the address of which _is_ a constant, but
    not manifest during the compile), then the intrinsic evaluates to
    false.

Move isManifestConstant from ConstantFolding to be a method of
Constant so that we can reuse the same logic in
LowerConstantIntrinsics.

pr/41459

Reviewed By: rsmith, george.burgess.iv

Differential Revision: https://reviews.llvm.org/D102367

(cherry picked from commit 8c72749bd92d35397e93908bc5a504d4cbcef1cb)
---
 llvm/include/llvm/IR/Constant.h                   |  4 ++++
 llvm/lib/Analysis/ConstantFolding.cpp             | 15 +--------------
 llvm/lib/IR/Constants.cpp                         | 12 ++++++++++++
 .../Transforms/Scalar/LowerConstantIntrinsics.cpp |  8 ++++----
 .../constant-intrinsics.ll                        |  8 ++++++++
 5 files changed, 29 insertions(+), 18 deletions(-)

diff --git a/llvm/include/llvm/IR/Constant.h b/llvm/include/llvm/IR/Constant.h
index 0190aca27b72..71692c746015 100644
--- a/llvm/include/llvm/IR/Constant.h
+++ b/llvm/include/llvm/IR/Constant.h
@@ -214,6 +214,10 @@ class Constant : public User {
   /// both must either be scalars or vectors with the same element count. If no
   /// changes are made, the constant C is returned.
   static Constant *mergeUndefsWith(Constant *C, Constant *Other);
+
+  /// Return true if a constant is ConstantData or a ConstantAggregate or
+  /// ConstantExpr that contain only ConstantData.
+  bool isManifestConstant() const;
 };
 
 } // end namespace llvm
diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp
index f73890d548f0..cc1ce4c65821 100644
--- a/llvm/lib/Analysis/ConstantFolding.cpp
+++ b/llvm/lib/Analysis/ConstantFolding.cpp
@@ -1808,19 +1808,6 @@ double getValueAsDouble(ConstantFP *Op) {
   return APF.convertToDouble();
 }
 
-static bool isManifestConstant(const Constant *c) {
-  if (isa<ConstantData>(c)) {
-    return true;
-  } else if (isa<ConstantAggregate>(c) || isa<ConstantExpr>(c)) {
-    for (const Value *subc : c->operand_values()) {
-      if (!isManifestConstant(cast<Constant>(subc)))
-        return false;
-    }
-    return true;
-  }
-  return false;
-}
-
 static bool getConstIntOrUndef(Value *Op, const APInt *&C) {
   if (auto *CI = dyn_cast<ConstantInt>(Op)) {
     C = &CI->getValue();
@@ -1845,7 +1832,7 @@ static Constant *ConstantFoldScalarCall1(StringRef Name,
     // We know we have a "Constant" argument. But we want to only
     // return true for manifest constants, not those that depend on
     // constants with unknowable values, e.g. GlobalValue or BlockAddress.
-    if (isManifestConstant(Operands[0]))
+    if (Operands[0]->isManifestConstant())
       return ConstantInt::getTrue(Ty->getContext());
     return nullptr;
   }
diff --git a/llvm/lib/IR/Constants.cpp b/llvm/lib/IR/Constants.cpp
index 6fd205c654a8..9f05917cf7cc 100644
--- a/llvm/lib/IR/Constants.cpp
+++ b/llvm/lib/IR/Constants.cpp
@@ -803,6 +803,18 @@ Constant *Constant::mergeUndefsWith(Constant *C, Constant *Other) {
   return C;
 }
 
+bool Constant::isManifestConstant() const {
+  if (isa<ConstantData>(this))
+    return true;
+  if (isa<ConstantAggregate>(this) || isa<ConstantExpr>(this)) {
+    for (const Value *Op : operand_values())
+      if (!cast<Constant>(Op)->isManifestConstant())
+        return false;
+    return true;
+  }
+  return false;
+}
+
 //===----------------------------------------------------------------------===//
 //                                ConstantInt
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Transforms/Scalar/LowerConstantIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerConstantIntrinsics.cpp
index bfe8db83b027..bb30c48127a0 100644
--- a/llvm/lib/Transforms/Scalar/LowerConstantIntrinsics.cpp
+++ b/llvm/lib/Transforms/Scalar/LowerConstantIntrinsics.cpp
@@ -43,10 +43,10 @@ STATISTIC(ObjectSizeIntrinsicsHandled,
           "Number of 'objectsize' intrinsic calls handled");
 
 static Value *lowerIsConstantIntrinsic(IntrinsicInst *II) {
-  Value *Op = II->getOperand(0);
-
-  return isa<Constant>(Op) ? ConstantInt::getTrue(II->getType())
-                           : ConstantInt::getFalse(II->getType());
+  if (auto *C = dyn_cast<Constant>(II->getOperand(0)))
+    if (C->isManifestConstant())
+      return ConstantInt::getTrue(II->getType());
+  return ConstantInt::getFalse(II->getType());
 }
 
 static bool replaceConditionalBranchesOnConstant(Instruction *II,
diff --git a/llvm/test/Transforms/LowerConstantIntrinsics/constant-intrinsics.ll b/llvm/test/Transforms/LowerConstantIntrinsics/constant-intrinsics.ll
index b2c98d2049cd..e19dce1b5543 100644
--- a/llvm/test/Transforms/LowerConstantIntrinsics/constant-intrinsics.ll
+++ b/llvm/test/Transforms/LowerConstantIntrinsics/constant-intrinsics.ll
@@ -112,3 +112,11 @@ define i1 @test_various_types(i256 %int, float %float, <2 x i64> %vec, {i32, i32
 
   ret i1 %res6
 }
+
+@real_mode_blob_end = external dso_local global [0 x i8], align 1
+define i1 @global_array() {
+; CHECK-LABEL: @global_array(
+; CHECK-NEXT: ret i1 false
+  %1 = call i1 @llvm.is.constant.i64(i64 ptrtoint ([0 x i8]* @real_mode_blob_end to i64))
+  ret i1 %1
+}

From 4973ce53ca8abfc14233a3d8b3045673e0e8543c Mon Sep 17 00:00:00 2001
From: Roman Lebedev <lebedev.ri@gmail.com>
Date: Wed, 12 May 2021 16:09:37 +0300
Subject: [PATCH 252/318] ~(C + X) --> ~C - X (PR50308)

We can not rely on (C+X)-->(X+C) already happening,
because we might not have visited that `add` yet.
The added testcase would get stuck in an endless combine loop.

(cherry-picked from 554b1bced325)
---
 .../InstCombine/InstCombineAndOrXor.cpp       | 17 +++++------
 llvm/test/Transforms/InstCombine/not-add.ll   | 28 +++++++++++++++++++
 2 files changed, 37 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
index 68c4156af2c4..85a7abe211b3 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
@@ -3221,11 +3221,6 @@ Instruction *InstCombinerImpl::visitXor(BinaryOperator &I) {
       }
     }
 
-    // ~(X - Y) --> ~X + Y
-    if (match(NotVal, m_Sub(m_Value(X), m_Value(Y))))
-      if (isa<Constant>(X) || NotVal->hasOneUse())
-        return BinaryOperator::CreateAdd(Builder.CreateNot(X), Y);
-
     // ~(~X >>s Y) --> (X >>s Y)
     if (match(NotVal, m_AShr(m_Not(m_Value(X)), m_Value(Y))))
       return BinaryOperator::CreateAShr(X, Y);
@@ -3256,9 +3251,15 @@ Instruction *InstCombinerImpl::visitXor(BinaryOperator &I) {
       return BinaryOperator::CreateAShr(ConstantExpr::getNot(C), Y);
     }
 
-    // ~(X + C) --> -(C + 1) - X
-    if (match(Op0, m_Add(m_Value(X), m_Constant(C))))
-      return BinaryOperator::CreateSub(ConstantExpr::getNeg(AddOne(C)), X);
+    // ~(X + C) --> ~C - X
+    if (match(NotVal, m_c_Add(m_Value(X), m_ImmConstant(C))))
+      return BinaryOperator::CreateSub(ConstantExpr::getNot(C), X);
+
+    // ~(X - Y) --> ~X + Y
+    // FIXME: is it really beneficial to sink the `not` here?
+    if (match(NotVal, m_Sub(m_Value(X), m_Value(Y))))
+      if (isa<Constant>(X) || NotVal->hasOneUse())
+        return BinaryOperator::CreateAdd(Builder.CreateNot(X), Y);
 
     // ~(~X + Y) --> X - Y
     if (match(NotVal, m_c_Add(m_Not(m_Value(X)), m_Value(Y))))
diff --git a/llvm/test/Transforms/InstCombine/not-add.ll b/llvm/test/Transforms/InstCombine/not-add.ll
index 6891fdd5fcc5..d372e7603724 100644
--- a/llvm/test/Transforms/InstCombine/not-add.ll
+++ b/llvm/test/Transforms/InstCombine/not-add.ll
@@ -137,3 +137,31 @@ define <4 x i32> @vector_test_undef_nsw_nuw(<4 x i32> %x, <4 x i32> %y) {
   %nota = xor <4 x i32> %a, <i32 -1, i32 -1, i32 undef, i32 undef>
   ret <4 x i32> %nota
 }
+
+define i32 @pr50308(i1 %c1, i32 %v1, i32 %v2, i32 %v3) {
+; CHECK-LABEL: @pr50308(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 [[C1:%.*]], label [[COND_TRUE:%.*]], label [[COND_END:%.*]]
+; CHECK:       cond.true:
+; CHECK-NEXT:    [[ADD_NOT:%.*]] = sub i32 -2, [[V1:%.*]]
+; CHECK-NEXT:    [[ADD1_NEG:%.*]] = xor i32 [[ADD_NOT]], [[V2:%.*]]
+; CHECK-NEXT:    br label [[COND_END]]
+; CHECK:       cond.end:
+; CHECK-NEXT:    [[COND_NEG:%.*]] = phi i32 [ [[ADD1_NEG]], [[COND_TRUE]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[SUB:%.*]] = add i32 [[COND_NEG]], [[V3:%.*]]
+; CHECK-NEXT:    ret i32 [[SUB]]
+;
+entry:
+  br i1 %c1, label %cond.true, label %cond.end
+
+cond.true:
+  %add = add nsw i32 1, %v1
+  %xor = xor i32 %add, %v2
+  %add1 = add nsw i32 1, %xor
+  br label %cond.end
+
+cond.end:
+  %cond = phi i32 [ %add1, %cond.true ], [ 0, %entry ]
+  %sub = sub nsw i32 %v3, %cond
+  ret i32 %sub
+}

From 328a6ec955327c6d56b6bc3478c723dd3cd468ef Mon Sep 17 00:00:00 2001
From: serge-sans-paille <sguelton@redhat.com>
Date: Tue, 11 May 2021 20:46:58 +0200
Subject: [PATCH 253/318] Force visibility of llvm::Any to external

llvm::Any::TypeId::Id relies on the uniqueness of the address of a static
variable defined in a template function. hidden visibility implies vague linkage
for that variable, which does not guarantee the uniqueness of the address across
a binary and a shared library. This totally breaks the implementation of
llvm::Any.

Ideally, setting visibility to llvm::Any::TypeId::Id should be enough,
unfortunately this doesn't work as expected and we lack time (before 12.0.1
release) to understand why setting the visibility to llvm::Any does work.

See https://gcc.gnu.org/wiki/Visibility and
https://gcc.gnu.org/onlinedocs/gcc/Vague-Linkage.html
for more information on that topic.

Differential Revision: https://reviews.llvm.org/D101972

(cherry picked from commit 3d3abc22b3ef189813a3b9061c2a90ba86a32f44)
---
 llvm/include/llvm/ADT/Any.h | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/llvm/include/llvm/ADT/Any.h b/llvm/include/llvm/ADT/Any.h
index 0aded628cda4..1e3abca70679 100644
--- a/llvm/include/llvm/ADT/Any.h
+++ b/llvm/include/llvm/ADT/Any.h
@@ -23,7 +23,12 @@
 
 namespace llvm {
 
-class Any {
+class LLVM_EXTERNAL_VISIBILITY Any {
+
+  // The `Typeid<T>::Id` static data member below is a globally unique
+  // identifier for the type `T`. It is explicitly marked with default
+  // visibility so that when `-fvisibility=hidden` is used, the loader still
+  // merges duplicate definitions across DSO boundaries.
   template <typename T> struct TypeId { static const char Id; };
 
   struct StorageBase {

From 77b63ce55e4d8db1ea1ef45d519b9d49d760d7bb Mon Sep 17 00:00:00 2001
From: "William S. Moses" <gh@wsmoses.com>
Date: Tue, 9 Mar 2021 11:55:19 -0500
Subject: [PATCH 254/318] [MemoryDependence] Fix invariant group store

Fix bug in MemoryDependence [and thus GVN] for invariant group.

Previously MemDep didn't verify that the store was storing into a
pointer rather than a store simply using a pointer.

Differential Revision: https://reviews.llvm.org/D98267

(cherry picked from commit 875891a10d50a791d3f076c9259e60af6c9af18c)
---
 .../lib/Analysis/MemoryDependenceAnalysis.cpp |  4 +++-
 llvm/test/Transforms/GVN/storeinvgroup.ll     | 21 +++++++++++++++++++
 2 files changed, 24 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/Transforms/GVN/storeinvgroup.ll

diff --git a/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp b/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp
index 895936d47175..886b5bf4acd3 100644
--- a/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp
+++ b/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp
@@ -344,7 +344,9 @@ MemoryDependenceResults::getInvariantGroupPointerDependency(LoadInst *LI,
       // If we hit load/store with the same invariant.group metadata (and the
       // same pointer operand) we can assume that value pointed by pointer
       // operand didn't change.
-      if ((isa<LoadInst>(U) || isa<StoreInst>(U)) &&
+      if ((isa<LoadInst>(U) ||
+           (isa<StoreInst>(U) &&
+            cast<StoreInst>(U)->getPointerOperand() == Ptr)) &&
           U->hasMetadata(LLVMContext::MD_invariant_group))
         ClosestDependency = GetClosestDependency(ClosestDependency, U);
     }
diff --git a/llvm/test/Transforms/GVN/storeinvgroup.ll b/llvm/test/Transforms/GVN/storeinvgroup.ll
new file mode 100644
index 000000000000..16da4b8cd424
--- /dev/null
+++ b/llvm/test/Transforms/GVN/storeinvgroup.ll
@@ -0,0 +1,21 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -gvn -S -o - < %s | FileCheck %s
+
+define double @code(double* %a1) {
+; CHECK-LABEL: @code(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[META:%.*]] = alloca double*, align 8
+; CHECK-NEXT:    store double 1.234500e+00, double* [[A1:%.*]], align 8
+; CHECK-NEXT:    store double* [[A1]], double** [[META]], align 8, !invariant.group !0
+; CHECK-NEXT:    ret double 1.234500e+00
+;
+entry:
+  %meta = alloca double*
+  store double 1.23450000e+00, double* %a1, align 8
+  store double* %a1, double** %meta, align 8, !invariant.group !0
+  %iload = load double, double* %a1, align 8, !invariant.group !1
+  ret double %iload
+}
+
+!0 = distinct !{}
+!1 = distinct !{}

From f2ce10d14b7c07b93b7726615aeb40ad04837a88 Mon Sep 17 00:00:00 2001
From: Stefan Pintilie <stefanp@ca.ibm.com>
Date: Thu, 13 May 2021 05:49:19 -0500
Subject: [PATCH 255/318] [PowerPC] Handle inline assembly clobber of link
 regsiter

This patch adds the handling of clobbers of the link register LR for inline
assembly.

This patch is to fix:
https://bugs.llvm.org/show_bug.cgi?id=50147

Reviewed By: nemanjai, #powerpc

Differential Revision: https://reviews.llvm.org/D101657

(cherry picked from commit 15051f0b4a2e0a0af9da7cd5e5cfaabb9f6aaa3d)
---
 llvm/lib/Target/PowerPC/PPCISelLowering.cpp   |  62 ++++
 llvm/lib/Target/PowerPC/PPCISelLowering.h     |   1 +
 llvm/lib/Target/PowerPC/PPCRegisterInfo.td    |   7 +
 .../PowerPC/ppc64-inlineasm-clobber.ll        | 264 ++++++++++++++++++
 4 files changed, 334 insertions(+)
 create mode 100644 llvm/test/CodeGen/PowerPC/ppc64-inlineasm-clobber.ll

diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 929a72ac687e..7833bfc1d1b6 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -167,6 +167,10 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
   // Sub-word ATOMIC_CMP_SWAP need to ensure that the input is zero-extended.
   setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom);
 
+  // Custom lower inline assembly to check for special registers.
+  setOperationAction(ISD::INLINEASM, MVT::Other, Custom);
+  setOperationAction(ISD::INLINEASM_BR, MVT::Other, Custom);
+
   // PowerPC has an i16 but no i8 (or i1) SEXTLOAD.
   for (MVT VT : MVT::integer_valuetypes()) {
     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
@@ -3461,6 +3465,57 @@ SDValue PPCTargetLowering::LowerADJUST_TRAMPOLINE(SDValue Op,
   return Op.getOperand(0);
 }
 
+SDValue PPCTargetLowering::LowerINLINEASM(SDValue Op, SelectionDAG &DAG) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  PPCFunctionInfo &MFI = *MF.getInfo<PPCFunctionInfo>();
+
+  assert((Op.getOpcode() == ISD::INLINEASM ||
+          Op.getOpcode() == ISD::INLINEASM_BR) &&
+         "Expecting Inline ASM node.");
+
+  // If an LR store is already known to be required then there is not point in
+  // checking this ASM as well.
+  if (MFI.isLRStoreRequired())
+    return Op;
+
+  // Inline ASM nodes have an optional last operand that is an incoming Flag of
+  // type MVT::Glue. We want to ignore this last operand if that is the case.
+  unsigned NumOps = Op.getNumOperands();
+  if (Op.getOperand(NumOps - 1).getValueType() == MVT::Glue)
+    --NumOps;
+
+  // Check all operands that may contain the LR.
+  for (unsigned i = InlineAsm::Op_FirstOperand; i != NumOps;) {
+    unsigned Flags = cast<ConstantSDNode>(Op.getOperand(i))->getZExtValue();
+    unsigned NumVals = InlineAsm::getNumOperandRegisters(Flags);
+    ++i; // Skip the ID value.
+
+    switch (InlineAsm::getKind(Flags)) {
+    default:
+      llvm_unreachable("Bad flags!");
+    case InlineAsm::Kind_RegUse:
+    case InlineAsm::Kind_Imm:
+    case InlineAsm::Kind_Mem:
+      i += NumVals;
+      break;
+    case InlineAsm::Kind_Clobber:
+    case InlineAsm::Kind_RegDef:
+    case InlineAsm::Kind_RegDefEarlyClobber: {
+      for (; NumVals; --NumVals, ++i) {
+        Register Reg = cast<RegisterSDNode>(Op.getOperand(i))->getReg();
+        if (Reg != PPC::LR && Reg != PPC::LR8)
+          continue;
+        MFI.setLRStoreRequired();
+        return Op;
+      }
+      break;
+    }
+    }
+  }
+
+  return Op;
+}
+
 SDValue PPCTargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
                                                 SelectionDAG &DAG) const {
   if (Subtarget.isAIXABI())
@@ -10316,6 +10371,8 @@ SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::INIT_TRAMPOLINE:    return LowerINIT_TRAMPOLINE(Op, DAG);
   case ISD::ADJUST_TRAMPOLINE:  return LowerADJUST_TRAMPOLINE(Op, DAG);
 
+  case ISD::INLINEASM:
+  case ISD::INLINEASM_BR:       return LowerINLINEASM(Op, DAG);
   // Variable argument lowering.
   case ISD::VASTART:            return LowerVASTART(Op, DAG);
   case ISD::VAARG:              return LowerVAARG(Op, DAG);
@@ -15090,6 +15147,11 @@ PPCTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
       return std::make_pair(0U, &PPC::VSSRCRegClass);
     else
       return std::make_pair(0U, &PPC::VSFRCRegClass);
+  } else if (Constraint == "lr") {
+    if (VT == MVT::i64)
+      return std::make_pair(0U, &PPC::LR8RCRegClass);
+    else
+      return std::make_pair(0U, &PPC::LRRCRegClass);
   }
 
   // If we name a VSX register, we can't defer to the base class because it
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h
index 0dda2c181572..836c52bdff95 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.h
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h
@@ -1128,6 +1128,7 @@ namespace llvm {
     SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerINLINEASM(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerVACOPY(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/PowerPC/PPCRegisterInfo.td b/llvm/lib/Target/PowerPC/PPCRegisterInfo.td
index e03617aa75ff..45d60369018b 100644
--- a/llvm/lib/Target/PowerPC/PPCRegisterInfo.td
+++ b/llvm/lib/Target/PowerPC/PPCRegisterInfo.td
@@ -409,6 +409,13 @@ def CTRRC8 : RegisterClass<"PPC", [i64], 64, (add CTR8)> {
   let isAllocatable = 0;
 }
 
+def LRRC : RegisterClass<"PPC", [i32], 32, (add LR)> {
+  let isAllocatable = 0;
+}
+def LR8RC : RegisterClass<"PPC", [i64], 64, (add LR8)> {
+  let isAllocatable = 0;
+}
+
 def VRSAVERC : RegisterClass<"PPC", [i32], 32, (add VRSAVE)>;
 def CARRYRC : RegisterClass<"PPC", [i32], 32, (add CARRY, XER)> {
   let CopyCost = -1;
diff --git a/llvm/test/CodeGen/PowerPC/ppc64-inlineasm-clobber.ll b/llvm/test/CodeGen/PowerPC/ppc64-inlineasm-clobber.ll
new file mode 100644
index 000000000000..3d66683c4c75
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/ppc64-inlineasm-clobber.ll
@@ -0,0 +1,264 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=powerpc64le-unknown-linux-unknown -verify-machineinstrs %s \
+; RUN: -ppc-asm-full-reg-names -o - | FileCheck %s --check-prefix=PPC64LE
+; RUN: llc -mtriple=powerpc64-unknown-linux-unknown -verify-machineinstrs %s \
+; RUN: -ppc-asm-full-reg-names -o - | FileCheck %s --check-prefix=PPC64BE
+
+define dso_local void @ClobberLR() local_unnamed_addr #0 {
+; PPC64LE-LABEL: ClobberLR:
+; PPC64LE:       # %bb.0: # %entry
+; PPC64LE-NEXT:    mflr r0
+; PPC64LE-NEXT:    std r0, 16(r1)
+; PPC64LE-NEXT:    stdu r1, -32(r1)
+; PPC64LE-NEXT:    #APP
+; PPC64LE-NEXT:    #NO_APP
+; PPC64LE-NEXT:    addi r1, r1, 32
+; PPC64LE-NEXT:    ld r0, 16(r1)
+; PPC64LE-NEXT:    mtlr r0
+; PPC64LE-NEXT:    blr
+;
+; PPC64BE-LABEL: ClobberLR:
+; PPC64BE:       # %bb.0: # %entry
+; PPC64BE-NEXT:    mflr r0
+; PPC64BE-NEXT:    std r0, 16(r1)
+; PPC64BE-NEXT:    stdu r1, -48(r1)
+; PPC64BE-NEXT:    #APP
+; PPC64BE-NEXT:    #NO_APP
+; PPC64BE-NEXT:    addi r1, r1, 48
+; PPC64BE-NEXT:    ld r0, 16(r1)
+; PPC64BE-NEXT:    mtlr r0
+; PPC64BE-NEXT:    blr
+entry:
+  tail call void asm sideeffect "", "~{lr}"()
+  ret void
+}
+
+define dso_local void @ClobberR5() local_unnamed_addr #0 {
+; PPC64LE-LABEL: ClobberR5:
+; PPC64LE:       # %bb.0: # %entry
+; PPC64LE-NEXT:    #APP
+; PPC64LE-NEXT:    #NO_APP
+; PPC64LE-NEXT:    blr
+;
+; PPC64BE-LABEL: ClobberR5:
+; PPC64BE:       # %bb.0: # %entry
+; PPC64BE-NEXT:    #APP
+; PPC64BE-NEXT:    #NO_APP
+; PPC64BE-NEXT:    blr
+entry:
+  tail call void asm sideeffect "", "~{r5}"()
+  ret void
+}
+
+define dso_local void @ClobberR15() local_unnamed_addr #0 {
+; PPC64LE-LABEL: ClobberR15:
+; PPC64LE:       # %bb.0: # %entry
+; PPC64LE-NEXT:    std r15, -136(r1) # 8-byte Folded Spill
+; PPC64LE-NEXT:    #APP
+; PPC64LE-NEXT:    #NO_APP
+; PPC64LE-NEXT:    ld r15, -136(r1) # 8-byte Folded Reload
+; PPC64LE-NEXT:    blr
+;
+; PPC64BE-LABEL: ClobberR15:
+; PPC64BE:       # %bb.0: # %entry
+; PPC64BE-NEXT:    std r15, -136(r1) # 8-byte Folded Spill
+; PPC64BE-NEXT:    #APP
+; PPC64BE-NEXT:    #NO_APP
+; PPC64BE-NEXT:    ld r15, -136(r1) # 8-byte Folded Reload
+; PPC64BE-NEXT:    blr
+entry:
+  tail call void asm sideeffect "", "~{r15}"()
+  ret void
+}
+
+;; Test for INLINEASM_BR
+define dso_local signext i32 @ClobberLR_BR(i32 signext %in) #0 {
+; PPC64LE-LABEL: ClobberLR_BR:
+; PPC64LE:       # %bb.0: # %entry
+; PPC64LE-NEXT:    #APP
+; PPC64LE-NEXT:    nop
+; PPC64LE-NEXT:    #NO_APP
+; PPC64LE-NEXT:  # %bb.1: # %return
+; PPC64LE-NEXT:    extsw r3, r3
+; PPC64LE-NEXT:    blr
+; PPC64LE-NEXT:  .Ltmp0: # Block address taken
+; PPC64LE-NEXT:  .LBB3_2: # %return_early
+; PPC64LE-NEXT:    mflr r0
+; PPC64LE-NEXT:    std r0, 16(r1)
+; PPC64LE-NEXT:    stdu r1, -32(r1)
+; PPC64LE-NEXT:    li r3, 0
+; PPC64LE-NEXT:    addi r1, r1, 32
+; PPC64LE-NEXT:    ld r0, 16(r1)
+; PPC64LE-NEXT:    mtlr r0
+; PPC64LE-NEXT:    extsw r3, r3
+; PPC64LE-NEXT:    blr
+;
+; PPC64BE-LABEL: ClobberLR_BR:
+; PPC64BE:       # %bb.0: # %entry
+; PPC64BE-NEXT:    #APP
+; PPC64BE-NEXT:    nop
+; PPC64BE-NEXT:    #NO_APP
+; PPC64BE-NEXT:  # %bb.1: # %return
+; PPC64BE-NEXT:    extsw r3, r3
+; PPC64BE-NEXT:    blr
+; PPC64BE-NEXT:  .Ltmp0: # Block address taken
+; PPC64BE-NEXT:  .LBB3_2: # %return_early
+; PPC64BE-NEXT:    mflr r0
+; PPC64BE-NEXT:    std r0, 16(r1)
+; PPC64BE-NEXT:    stdu r1, -48(r1)
+; PPC64BE-NEXT:    li r3, 0
+; PPC64BE-NEXT:    addi r1, r1, 48
+; PPC64BE-NEXT:    ld r0, 16(r1)
+; PPC64BE-NEXT:    mtlr r0
+; PPC64BE-NEXT:    extsw r3, r3
+; PPC64BE-NEXT:    blr
+entry:
+  callbr void asm sideeffect "nop", "X,~{lr}"(i8* blockaddress(@ClobberLR_BR, %return_early))
+          to label %return [label %return_early]
+
+return_early:
+  br label %return
+
+return:
+  %retval.0 = phi i32 [ 0, %return_early ], [ %in, %entry ]
+  ret i32 %retval.0
+}
+
+define dso_local signext i32 @ClobberR5_BR(i32 signext %in) #0 {
+; PPC64LE-LABEL: ClobberR5_BR:
+; PPC64LE:       # %bb.0: # %entry
+; PPC64LE-NEXT:    #APP
+; PPC64LE-NEXT:    nop
+; PPC64LE-NEXT:    #NO_APP
+; PPC64LE-NEXT:  # %bb.1: # %return
+; PPC64LE-NEXT:    extsw r3, r3
+; PPC64LE-NEXT:    blr
+; PPC64LE-NEXT:  .Ltmp1: # Block address taken
+; PPC64LE-NEXT:  .LBB4_2: # %return_early
+; PPC64LE-NEXT:    li r3, 0
+; PPC64LE-NEXT:    extsw r3, r3
+; PPC64LE-NEXT:    blr
+;
+; PPC64BE-LABEL: ClobberR5_BR:
+; PPC64BE:       # %bb.0: # %entry
+; PPC64BE-NEXT:    #APP
+; PPC64BE-NEXT:    nop
+; PPC64BE-NEXT:    #NO_APP
+; PPC64BE-NEXT:  # %bb.1: # %return
+; PPC64BE-NEXT:    extsw r3, r3
+; PPC64BE-NEXT:    blr
+; PPC64BE-NEXT:  .Ltmp1: # Block address taken
+; PPC64BE-NEXT:  .LBB4_2: # %return_early
+; PPC64BE-NEXT:    li r3, 0
+; PPC64BE-NEXT:    extsw r3, r3
+; PPC64BE-NEXT:    blr
+entry:
+  callbr void asm sideeffect "nop", "X,~{r5}"(i8* blockaddress(@ClobberR5_BR, %return_early))
+          to label %return [label %return_early]
+
+return_early:
+  br label %return
+
+return:
+  %retval.0 = phi i32 [ 0, %return_early ], [ %in, %entry ]
+  ret i32 %retval.0
+}
+
+
+
+define dso_local void @DefLR() local_unnamed_addr #0 {
+; PPC64LE-LABEL: DefLR:
+; PPC64LE:       # %bb.0: # %entry
+; PPC64LE-NEXT:    mflr r0
+; PPC64LE-NEXT:    std r0, 16(r1)
+; PPC64LE-NEXT:    stdu r1, -32(r1)
+; PPC64LE-NEXT:    #APP
+; PPC64LE-NEXT:    #NO_APP
+; PPC64LE-NEXT:    addi r1, r1, 32
+; PPC64LE-NEXT:    ld r0, 16(r1)
+; PPC64LE-NEXT:    mtlr r0
+; PPC64LE-NEXT:    blr
+;
+; PPC64BE-LABEL: DefLR:
+; PPC64BE:       # %bb.0: # %entry
+; PPC64BE-NEXT:    mflr r0
+; PPC64BE-NEXT:    std r0, 16(r1)
+; PPC64BE-NEXT:    stdu r1, -48(r1)
+; PPC64BE-NEXT:    #APP
+; PPC64BE-NEXT:    #NO_APP
+; PPC64BE-NEXT:    addi r1, r1, 48
+; PPC64BE-NEXT:    ld r0, 16(r1)
+; PPC64BE-NEXT:    mtlr r0
+; PPC64BE-NEXT:    blr
+entry:
+  tail call i64 asm sideeffect "", "={lr}"()
+  ret void
+}
+
+define dso_local void @EarlyClobberLR() local_unnamed_addr #0 {
+; PPC64LE-LABEL: EarlyClobberLR:
+; PPC64LE:       # %bb.0: # %entry
+; PPC64LE-NEXT:    mflr r0
+; PPC64LE-NEXT:    std r0, 16(r1)
+; PPC64LE-NEXT:    stdu r1, -32(r1)
+; PPC64LE-NEXT:    #APP
+; PPC64LE-NEXT:    #NO_APP
+; PPC64LE-NEXT:    addi r1, r1, 32
+; PPC64LE-NEXT:    ld r0, 16(r1)
+; PPC64LE-NEXT:    mtlr r0
+; PPC64LE-NEXT:    blr
+;
+; PPC64BE-LABEL: EarlyClobberLR:
+; PPC64BE:       # %bb.0: # %entry
+; PPC64BE-NEXT:    mflr r0
+; PPC64BE-NEXT:    std r0, 16(r1)
+; PPC64BE-NEXT:    stdu r1, -48(r1)
+; PPC64BE-NEXT:    #APP
+; PPC64BE-NEXT:    #NO_APP
+; PPC64BE-NEXT:    addi r1, r1, 48
+; PPC64BE-NEXT:    ld r0, 16(r1)
+; PPC64BE-NEXT:    mtlr r0
+; PPC64BE-NEXT:    blr
+entry:
+  tail call i64 asm sideeffect "", "=&{lr}"()
+  ret void
+}
+
+define dso_local void @ClobberMulti() local_unnamed_addr #0 {
+; PPC64LE-LABEL: ClobberMulti:
+; PPC64LE:       # %bb.0: # %entry
+; PPC64LE-NEXT:    mflr r0
+; PPC64LE-NEXT:    std r15, -136(r1) # 8-byte Folded Spill
+; PPC64LE-NEXT:    std r16, -128(r1) # 8-byte Folded Spill
+; PPC64LE-NEXT:    std r0, 16(r1)
+; PPC64LE-NEXT:    stdu r1, -176(r1)
+; PPC64LE-NEXT:    #APP
+; PPC64LE-NEXT:    #NO_APP
+; PPC64LE-NEXT:    addi r1, r1, 176
+; PPC64LE-NEXT:    ld r0, 16(r1)
+; PPC64LE-NEXT:    ld r16, -128(r1) # 8-byte Folded Reload
+; PPC64LE-NEXT:    ld r15, -136(r1) # 8-byte Folded Reload
+; PPC64LE-NEXT:    mtlr r0
+; PPC64LE-NEXT:    blr
+;
+; PPC64BE-LABEL: ClobberMulti:
+; PPC64BE:       # %bb.0: # %entry
+; PPC64BE-NEXT:    mflr r0
+; PPC64BE-NEXT:    std r0, 16(r1)
+; PPC64BE-NEXT:    stdu r1, -192(r1)
+; PPC64BE-NEXT:    std r15, 56(r1) # 8-byte Folded Spill
+; PPC64BE-NEXT:    std r16, 64(r1) # 8-byte Folded Spill
+; PPC64BE-NEXT:    #APP
+; PPC64BE-NEXT:    #NO_APP
+; PPC64BE-NEXT:    ld r16, 64(r1) # 8-byte Folded Reload
+; PPC64BE-NEXT:    ld r15, 56(r1) # 8-byte Folded Reload
+; PPC64BE-NEXT:    addi r1, r1, 192
+; PPC64BE-NEXT:    ld r0, 16(r1)
+; PPC64BE-NEXT:    mtlr r0
+; PPC64BE-NEXT:    blr
+entry:
+  tail call void asm sideeffect "", "~{lr},~{r15},~{r16}"()
+  ret void
+}
+
+attributes #0 = { nounwind }

From e673593742e7dc1f4141d0b6391b995d5152fa6c Mon Sep 17 00:00:00 2001
From: mydeveloperday <mydeveloperday@gmail.com>
Date: Sat, 15 May 2021 11:29:56 +0100
Subject: [PATCH 256/318] [clang-format]  PR50326 AlignAfterOpenBracket
 AlwaysBreak does not keep to the ColumnLimit

https://bugs.llvm.org/show_bug.cgi?id=50326

{D93626} caused a regression in terms of formatting a function ptr, incorrectly thinking it was a C-Style cast.

This cased a formatter regression between clang-format-11 and clang-format-12

```
void bar()
{
    size_t foo = function(Foooo, Barrrrr, Foooo, Barrrr, FoooooooooLooooong);

    size_t foo = function(
        Foooo, Barrrrr, Foooo, Barrrr, FoooooooooLooooong, BarrrrrrrrrrrrLong,
        FoooooooooLooooong);

    size_t foo = (*(function))(Foooo, Barrrrr, Foooo, FoooooooooLooooong);

    size_t foo = (*(
        function))(Foooo, Barrrrr, Foooo, Barrrr, FoooooooooLooooong,
        BarrrrrrrrrrrrLong, FoooooooooLooooong);
}
```

became

```
void bar()
{
    size_t foo1 = function(Foooo, Barrrrr, Foooo, Barrrr, FoooooooooLooooong);

    size_t foo2 = function(
        Foooo, Barrrrr, Foooo, Barrrr, FoooooooooLooooong, BarrrrrrrrrrrrLong,
        FoooooooooLooooong);

    size_t foo3 = (*(function))(Foooo, Barrrrr, Foooo, FoooooooooLooooong);

    size_t foo4 = (*(
        function))(Foooo, Barrrrr, Foooo, Barrrr, FoooooooooLooooong, BarrrrrrrrrrrrLong, FoooooooooLooooong);
}
```

This fixes this issue by simplify the clause to be specific about what is wanted rather than what is not.

Reviewed By: curdeius, HazardyKnusperkeks

Differential Revision: https://reviews.llvm.org/D102392

(cherry picked from commit eae445f65d077304703e3290ddb4ff28f6d65ff4)
---
 clang/lib/Format/TokenAnnotator.cpp   | 12 ++++++------
 clang/unittests/Format/FormatTest.cpp | 20 ++++++++++++++++++++
 2 files changed, 26 insertions(+), 6 deletions(-)

diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp
index 34c291ecc492..82d6cfed308d 100755
--- a/clang/lib/Format/TokenAnnotator.cpp
+++ b/clang/lib/Format/TokenAnnotator.cpp
@@ -1917,12 +1917,12 @@ class AnnotatingParser {
     if (Tok.Next->isOneOf(tok::identifier, tok::kw_this))
       return true;
 
-    if (Tok.Next->is(tok::l_paren) &&
-        !(Tok.Previous && Tok.Previous->is(tok::identifier) &&
-          Tok.Previous->Previous &&
-          Tok.Previous->Previous->isOneOf(tok::arrowstar, tok::arrow,
-                                          tok::star)))
-      return true;
+    // Look for a cast `( x ) (`.
+    if (Tok.Next->is(tok::l_paren) && Tok.Previous && Tok.Previous->Previous) {
+      if (Tok.Previous->is(tok::identifier) &&
+          Tok.Previous->Previous->is(tok::l_paren))
+        return true;
+    }
 
     if (!Tok.Next->Next)
       return false;
diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp
index c1f88b9ae17a..ed26bb8a7150 100644
--- a/clang/unittests/Format/FormatTest.cpp
+++ b/clang/unittests/Format/FormatTest.cpp
@@ -12368,6 +12368,17 @@ TEST_F(FormatTest, ConfigurableSpacesInParentheses) {
   verifyFormat("size_t idx = (a->*foo)(a - 1);", Spaces);
   verifyFormat("size_t idx = (a->foo)(a - 1);", Spaces);
   verifyFormat("size_t idx = (*foo)(a - 1);", Spaces);
+  verifyFormat("size_t idx = (*(foo))(a - 1);", Spaces);
+  Spaces.ColumnLimit = 80;
+  Spaces.IndentWidth = 4;
+  Spaces.AlignAfterOpenBracket = FormatStyle::BAS_AlwaysBreak;
+  verifyFormat("void foo( ) {\n"
+               "    size_t foo = (*(function))(\n"
+               "        Foooo, Barrrrr, Foooo, Barrrr, FoooooooooLooooong, "
+               "BarrrrrrrrrrrrLong,\n"
+               "        FoooooooooLooooong);\n"
+               "}",
+               Spaces);
   Spaces.SpaceAfterCStyleCast = false;
   verifyFormat("size_t idx = (size_t)(ptr - ((char *)file));", Spaces);
   verifyFormat("size_t idx = (size_t)a;", Spaces);
@@ -12375,6 +12386,15 @@ TEST_F(FormatTest, ConfigurableSpacesInParentheses) {
   verifyFormat("size_t idx = (a->*foo)(a - 1);", Spaces);
   verifyFormat("size_t idx = (a->foo)(a - 1);", Spaces);
   verifyFormat("size_t idx = (*foo)(a - 1);", Spaces);
+  verifyFormat("size_t idx = (*(foo))(a - 1);", Spaces);
+
+  verifyFormat("void foo( ) {\n"
+               "    size_t foo = (*(function))(\n"
+               "        Foooo, Barrrrr, Foooo, Barrrr, FoooooooooLooooong, "
+               "BarrrrrrrrrrrrLong,\n"
+               "        FoooooooooLooooong);\n"
+               "}",
+               Spaces);
 }
 
 TEST_F(FormatTest, ConfigurableSpacesInSquareBrackets) {

From 6279fd114acba298b9b743755d7031d615c92d23 Mon Sep 17 00:00:00 2001
From: Qiu Chaofan <qiucofan@cn.ibm.com>
Date: Fri, 28 May 2021 00:10:04 +0800
Subject: [PATCH 257/318] [SPE] Disable strict-fp for SPE by default

As discussed in PR50385, strict-fp on PowerPC SPE has not been handled
well. This patch disables it by default for SPE.

Reviewed By: nemanjai, vit9696, jhibbits

Differential Revision: https://reviews.llvm.org/D103235

(cherry picked from commit 5c18d1136665f74b15c0df599f56ac3e2e947fb8)
---
 clang/lib/Basic/Targets/PPC.cpp                 | 1 +
 clang/test/CodeGen/builtins-ppc-fpconstrained.c | 3 +++
 2 files changed, 4 insertions(+)

diff --git a/clang/lib/Basic/Targets/PPC.cpp b/clang/lib/Basic/Targets/PPC.cpp
index ff09c0fa2a23..6c3036836c6d 100644
--- a/clang/lib/Basic/Targets/PPC.cpp
+++ b/clang/lib/Basic/Targets/PPC.cpp
@@ -57,6 +57,7 @@ bool PPCTargetInfo::handleTargetFeatures(std::vector<std::string> &Features,
     } else if (Feature == "+pcrelative-memops") {
       HasPCRelativeMemops = true;
     } else if (Feature == "+spe" || Feature == "+efpu2") {
+      HasStrictFP = false;
       HasSPE = true;
       LongDoubleWidth = LongDoubleAlign = 64;
       LongDoubleFormat = &llvm::APFloat::IEEEdouble();
diff --git a/clang/test/CodeGen/builtins-ppc-fpconstrained.c b/clang/test/CodeGen/builtins-ppc-fpconstrained.c
index 880c0c339ef3..909210996064 100644
--- a/clang/test/CodeGen/builtins-ppc-fpconstrained.c
+++ b/clang/test/CodeGen/builtins-ppc-fpconstrained.c
@@ -11,6 +11,9 @@
 // RUN: -fallow-half-arguments-and-returns -S -ffp-exception-behavior=strict \
 // RUN: -o - %s | FileCheck --check-prefix=CHECK-ASM \
 // RUN: --check-prefix=FIXME-CHECK  %s
+// RUN: %clang_cc1 -triple powerpcspe -S -ffp-exception-behavior=strict \
+// RUN: -target-feature +spe -fexperimental-strict-floating-point -emit-llvm \
+// RUN: %s -o - | FileCheck --check-prefix=CHECK-CONSTRAINED %s
 
 typedef __attribute__((vector_size(4 * sizeof(float)))) float vec_float;
 typedef __attribute__((vector_size(2 * sizeof(double)))) double vec_double;

From f1b1151b61b19391b083ced5d4b0b710ada07f6c Mon Sep 17 00:00:00 2001
From: Harald van Dijk <harald@gigawatt.nl>
Date: Mon, 31 May 2021 12:36:57 -0400
Subject: [PATCH 258/318] [libc++] [test] Fix a few tests for 32-bit x86

Fixes bug https://llvm.org/PR48939.

Differential Revision: https://reviews.llvm.org/D102359

(cherry picked from commit 73cdc7599bf053c3e22ce6bb15a4266f66bd5e69)
---
 libcxx/test/libcxx/gdb/gdb_pretty_printer_test.sh.cpp  | 10 +++++-----
 .../libcxx/memory/trivial_abi/unique_ptr_ret.pass.cpp  |  4 ++++
 .../libcxx/memory/trivial_abi/weak_ptr_ret.pass.cpp    |  5 +++--
 3 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/libcxx/test/libcxx/gdb/gdb_pretty_printer_test.sh.cpp b/libcxx/test/libcxx/gdb/gdb_pretty_printer_test.sh.cpp
index fed6a1618483..cf560ce31097 100644
--- a/libcxx/test/libcxx/gdb/gdb_pretty_printer_test.sh.cpp
+++ b/libcxx/test/libcxx/gdb/gdb_pretty_printer_test.sh.cpp
@@ -427,9 +427,9 @@ void multiset_test() {
 
 void vector_test() {
   std::vector<bool> test0 = {true, false};
-  ComparePrettyPrintToChars(test0,
+  ComparePrettyPrintToRegex(test0,
                             "std::vector<bool> of "
-                            "length 2, capacity 64 = {1, 0}");
+                            "length 2, capacity (32|64) = {1, 0}");
   for (int i = 0; i < 31; ++i) {
     test0.push_back(true);
     test0.push_back(false);
@@ -444,9 +444,9 @@ void vector_test() {
   ComparePrettyPrintToRegex(
       test0,
       "std::vector<bool> of length 65, "
-      "capacity 128 = {1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, "
-      "1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, "
-      "1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1}");
+      "capacity (96|128) = {1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, "
+      "0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, "
+      "0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1}");
 
   std::vector<int> test1;
   ComparePrettyPrintToChars(test1, "std::vector of length 0, capacity 0");
diff --git a/libcxx/test/libcxx/memory/trivial_abi/unique_ptr_ret.pass.cpp b/libcxx/test/libcxx/memory/trivial_abi/unique_ptr_ret.pass.cpp
index 73fe18becc3a..5cda71371fa6 100644
--- a/libcxx/test/libcxx/memory/trivial_abi/unique_ptr_ret.pass.cpp
+++ b/libcxx/test/libcxx/memory/trivial_abi/unique_ptr_ret.pass.cpp
@@ -46,7 +46,11 @@ int main(int, char**) {
   //
   // With trivial_abi, local_addr is the address of a local variable in
   // make_val, and hence different from &ret.
+#if !defined(__i386__)
+  // On X86, structs are never returned in registers.
+  // Thus, unique_ptr will be passed indirectly even if it is trivial.
   assert((void*)&ret != local_addr);
+#endif
 
   return 0;
 }
diff --git a/libcxx/test/libcxx/memory/trivial_abi/weak_ptr_ret.pass.cpp b/libcxx/test/libcxx/memory/trivial_abi/weak_ptr_ret.pass.cpp
index e69c94506f2a..9b4e95e249e2 100644
--- a/libcxx/test/libcxx/memory/trivial_abi/weak_ptr_ret.pass.cpp
+++ b/libcxx/test/libcxx/memory/trivial_abi/weak_ptr_ret.pass.cpp
@@ -49,9 +49,10 @@ int main(int, char**) {
   //
   // With trivial_abi, local_addr is the address of a local variable in
   // make_val, and hence different from &ret.
-#ifndef __arm__
+#if !defined(__i386__) && !defined(__arm__)
+  // On X86, structs are never returned in registers.
   // On ARM32, structs larger than 4 bytes cannot be returned in registers.
-  // Thus, weak_ptr will be passed indrectly even if it is trivial.
+  // Thus, weak_ptr will be passed indirectly even if it is trivial.
   assert((void*)&ret != local_addr);
 #endif
   return 0;

From 6a86669a6d99179997feaa015b92423cba64ad96 Mon Sep 17 00:00:00 2001
From: Heejin Ahn <aheejin@gmail.com>
Date: Wed, 19 May 2021 11:22:25 -0700
Subject: [PATCH 259/318] [WebAssembly] Ignore filters in Emscripten EH
 landingpads

We have been handling filters and landingpads incorrectly all along. We
pass clauses' (catches') types to `__cxa_find_matching_catch` in JS glue
code, which returns the thrown pointer and sets the selector using
`setTempRet0()`.

We apparently have been doing the same for filters' (exception specs')
types; we pass them to `__cxa_find_matching_catch` just the same way as
clauses. And `__cxa_find_matching_catch` treats all given types as
clauses. So it is a little surprising; maybe we intended to do something
from the JS side and didn't end up doing?

So anyway, I don't think supporting exception specs in Emscripten EH is
a priority, but this can actually cause incorrect results for normal
catches when functions are inlined and the inlined spec type has a
parent-child relationship with the catch's type.

---

The below is an example of a bug that can happen when inlining and class
hierarchy is mixed. If you are busy you can skip this part:
```
struct A {};
struct B : A {};

void bar() throw (B) { throw B(); }

void foo() {
  try {
    bar();
  } catch (A &) {
    fputs ("Expected result\n", stdout);
  }
}
```

In the unoptimized code, `bar`'s landingpad will have a filter for `B`
and `foo`'s landingpad will have a clause for `A`. But when `bar` is
inlined into `foo`, `foo`'s landingpad has both a filter for `B` and a
clause for `A`, and it passes the both types to
`__cxa_find_matching_catch`:
```
__cxa_find_matching_catch(typeinfo for B, typeinfo for A)
```
`__cxa_find_matching_catch` thinks both are clauses, and looks at the
first type `B`, which belongs to a filter. And the thrown type is `B`,
so it thinks the first type `B` is caught. But this makes it return an
incorrect selector, because it is supposed to catch the exception using
the second type `A`, which is a parent of `B`. As a result, the `foo` in
the example program above does not print "Expected result" but just
throws the exception to the caller. (This wouldn't have happened if `A`
and `B` are completely disjoint types, such as `float` and `int`)

Fixes https://bugs.llvm.org/show_bug.cgi?id=50357.

Reviewed By: dschuff, kripken

Differential Revision: https://reviews.llvm.org/D102795

(cherry picked from commit 412a3381f721452fb6cd33bc30e7700102639e3f)
---
 .../WebAssemblyLowerEmscriptenEHSjLj.cpp            | 13 +++----------
 .../test/CodeGen/WebAssembly/lower-em-exceptions.ll | 12 ++++++------
 2 files changed, 9 insertions(+), 16 deletions(-)

diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp
index d3bbadf27478..ff6404c30971 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp
@@ -885,16 +885,9 @@ bool WebAssemblyLowerEmscriptenEHSjLj::runEHOnFunction(Function &F) {
     SmallVector<Value *, 16> FMCArgs;
     for (unsigned I = 0, E = LPI->getNumClauses(); I < E; ++I) {
       Constant *Clause = LPI->getClause(I);
-      // As a temporary workaround for the lack of aggregate varargs support
-      // in the interface between JS and wasm, break out filter operands into
-      // their component elements.
-      if (LPI->isFilter(I)) {
-        auto *ATy = cast<ArrayType>(Clause->getType());
-        for (unsigned J = 0, E = ATy->getNumElements(); J < E; ++J) {
-          Value *EV = IRB.CreateExtractValue(Clause, makeArrayRef(J), "filter");
-          FMCArgs.push_back(EV);
-        }
-      } else
+      // TODO Handle filters (= exception specifications).
+      // https://bugs.llvm.org/show_bug.cgi?id=50396
+      if (LPI->isCatch(I))
         FMCArgs.push_back(Clause);
     }
 
diff --git a/llvm/test/CodeGen/WebAssembly/lower-em-exceptions.ll b/llvm/test/CodeGen/WebAssembly/lower-em-exceptions.ll
index 88073f3a926f..b2e344e88b33 100644
--- a/llvm/test/CodeGen/WebAssembly/lower-em-exceptions.ll
+++ b/llvm/test/CodeGen/WebAssembly/lower-em-exceptions.ll
@@ -68,6 +68,9 @@ catch:                                            ; preds = %catch.dispatch
 }
 
 ; Test invoke instruction with filters (functions with throw(...) declaration)
+; Currently we don't support exception specifications correctly in JS glue code,
+; so we ignore all filters here.
+; See https://bugs.llvm.org/show_bug.cgi?id=50396.
 define void @filter() personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
 ; CHECK-LABEL: @filter(
 entry:
@@ -91,12 +94,9 @@ lpad:                                             ; preds = %entry
   %2 = extractvalue { i8*, i32 } %0, 1
   br label %filter.dispatch
 ; CHECK: lpad:
-; CHECK-NEXT: %[[FMC:.*]] = call i8* @__cxa_find_matching_catch_4(i8* bitcast (i8** @_ZTIi to i8*), i8* bitcast (i8** @_ZTIc to i8*))
-; CHECK-NEXT: %[[IVI1:.*]] = insertvalue { i8*, i32 } undef, i8* %[[FMC]], 0
-; CHECK-NEXT: %[[TEMPRET0_VAL:.*]] = call i32 @getTempRet0()
-; CHECK-NEXT: %[[IVI2:.*]] = insertvalue { i8*, i32 } %[[IVI1]], i32 %[[TEMPRET0_VAL]], 1
-; CHECK-NEXT: extractvalue { i8*, i32 } %[[IVI2]], 0
-; CHECK-NEXT: extractvalue { i8*, i32 } %[[IVI2]], 1
+; We now temporarily ignore filters because of the bug, so we pass nothing to
+; __cxa_find_matching_catch
+; CHECK-NEXT: %[[FMC:.*]] = call i8* @__cxa_find_matching_catch_2()
 
 filter.dispatch:                                  ; preds = %lpad
   %ehspec.fails = icmp slt i32 %2, 0

From 0826268d59c6e1bb3530dffd9dc5f6038774486d Mon Sep 17 00:00:00 2001
From: Qiu Chaofan <qiucofan@cn.ibm.com>
Date: Tue, 1 Jun 2021 01:15:49 +0800
Subject: [PATCH 260/318] [PowerPC] Fix x86 vector intrinsics wrapper
 compilation under C++

Reviewed By: nemanjai

Differential Revision: https://reviews.llvm.org/D103386

(cherry picked from commit c0b3071833a80121a5a7ca9ea54fd59a59806acc)
---
 clang/lib/Headers/ppc_wrappers/xmmintrin.h | 9 ++++-----
 clang/test/CodeGen/ppc-xmmintrin.c         | 6 +++++-
 2 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/clang/lib/Headers/ppc_wrappers/xmmintrin.h b/clang/lib/Headers/ppc_wrappers/xmmintrin.h
index 0f429fa04081..0e45b96769f8 100644
--- a/clang/lib/Headers/ppc_wrappers/xmmintrin.h
+++ b/clang/lib/Headers/ppc_wrappers/xmmintrin.h
@@ -28,7 +28,7 @@
    Most SSE scalar float intrinsic operations can be performed more
    efficiently as C language float scalar operations or optimized to
    use vector SIMD operations. We recommend this for new applications. */
-#error "Please read comment above.  Use -DNO_WARN_X86_INTRINSICS to disable this error."
+#error "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
 #endif
 
 #ifndef _XMMINTRIN_H_INCLUDED
@@ -62,14 +62,13 @@
 
 /* The Intel API is flexible enough that we must allow aliasing with other
    vector types, and their scalar components.  */
-typedef float __m128 __attribute__ ((__vector_size__ (16), __may_alias__));
+typedef vector float __m128 __attribute__((__may_alias__));
 
 /* Unaligned version of the same type.  */
-typedef float __m128_u __attribute__ ((__vector_size__ (16), __may_alias__,
-				       __aligned__ (1)));
+typedef vector float __m128_u __attribute__((__may_alias__, __aligned__(1)));
 
 /* Internal data types for implementing the intrinsics.  */
-typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+typedef vector float __v4sf;
 
 /* Create an undefined vector.  */
 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
diff --git a/clang/test/CodeGen/ppc-xmmintrin.c b/clang/test/CodeGen/ppc-xmmintrin.c
index e9466b32257f..32a5e5a7cdaf 100644
--- a/clang/test/CodeGen/ppc-xmmintrin.c
+++ b/clang/test/CodeGen/ppc-xmmintrin.c
@@ -3,8 +3,12 @@
 
 // RUN: %clang -S -emit-llvm -target powerpc64-unknown-linux-gnu -mcpu=pwr8 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \
 // RUN:   -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt -n | FileCheck %s --check-prefixes=CHECK,CHECK-BE
+// RUN: %clang -x c++ -fsyntax-only -target powerpc64-unknown-linux-gnu -mcpu=pwr8 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \
+// RUN:   -fno-discard-value-names -mllvm -disable-llvm-optzns
 // RUN: %clang -S -emit-llvm -target powerpc64le-unknown-linux-gnu -mcpu=pwr8 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \
 // RUN:   -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt -n | FileCheck %s --check-prefixes=CHECK,CHECK-LE
+// RUN: %clang -x c++ -fsyntax-only -target powerpc64le-unknown-linux-gnu -mcpu=pwr8 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \
+// RUN:   -fno-discard-value-names -mllvm -disable-llvm-optzns
 
 #include <xmmintrin.h>
 
@@ -1426,7 +1430,7 @@ test_mul() {
 
 void __attribute__((noinline))
 test_prefetch() {
-  _mm_prefetch(ms, i);
+  _mm_prefetch(ms, _MM_HINT_NTA);
 }
 
 // CHECK-LABEL: @test_prefetch

From cde86632a772b523ba3db7039f75d979f557b57c Mon Sep 17 00:00:00 2001
From: Sterling Augustine <saugustine@google.com>
Date: Fri, 5 Feb 2021 10:59:37 -0800
Subject: [PATCH 261/318] Various minor fixes for python 3

Switch StdTuple printer from python 2-style "next" to python 3.

Nested iteration changed enough to make the original bitset iteration
code a bit trickier than it needs to be, so unnest.

The end node of a map iterator is sometimes hard to detect in isolation,
don't fail in that case.

Differential Revision: https://reviews.llvm.org/D96167

(cherry picked from commit a34b8b879e345397880c1f9f8de4c294dd0b370c)
---
 .../libcxx/gdb/gdb_pretty_printer_test.sh.cpp |  5 ++--
 libcxx/utils/gdb/libcxx/printers.py           | 23 +++++++------------
 2 files changed, 11 insertions(+), 17 deletions(-)

diff --git a/libcxx/test/libcxx/gdb/gdb_pretty_printer_test.sh.cpp b/libcxx/test/libcxx/gdb/gdb_pretty_printer_test.sh.cpp
index cf560ce31097..d8a02a384563 100644
--- a/libcxx/test/libcxx/gdb/gdb_pretty_printer_test.sh.cpp
+++ b/libcxx/test/libcxx/gdb/gdb_pretty_printer_test.sh.cpp
@@ -489,8 +489,9 @@ void map_iterator_test() {
 
   auto not_found = one_two_three.find(7);
   MarkAsLive(not_found);
-  CompareExpressionPrettyPrintToRegex("not_found",
-      R"(std::__map_iterator  = {\[0x[a-f0-9]+\] =  end\(\)})");
+  // Because the end_node is not easily detected, just be sure it doesn't crash.
+  CompareExpressionPrettyPrintToRegex(
+      "not_found", R"(std::__map_iterator ( = {\[0x[a-f0-9]+\] = .*}|<error reading variable:.*>))");
 }
 
 void unordered_set_test() {
diff --git a/libcxx/utils/gdb/libcxx/printers.py b/libcxx/utils/gdb/libcxx/printers.py
index 0ee446f46c51..9b413a86b159 100644
--- a/libcxx/utils/gdb/libcxx/printers.py
+++ b/libcxx/utils/gdb/libcxx/printers.py
@@ -13,6 +13,7 @@
 
 from __future__ import print_function
 
+import math
 import re
 import gdb
 
@@ -141,7 +142,7 @@ def __iter__(self):
 
         def __next__(self):
             # child_iter raises StopIteration when appropriate.
-            field_name = self.child_iter.next()
+            field_name = next(self.child_iter)
             child = self.val["__base_"][field_name]["__value_"]
             self.count += 1
             return ("[%d]" % self.count, child)
@@ -425,6 +426,7 @@ def __init__(self, val):
         self.val = val
         self.n_words = int(self.val["__n_words"])
         self.bits_per_word = int(self.val["__bits_per_word"])
+        self.bit_count = self.val.type.template_argument(0)
         if self.n_words == 1:
             self.values = [int(self.val["__first_"])]
         else:
@@ -435,21 +437,12 @@ def to_string(self):
         typename = _prettify_typename(self.val.type)
         return "%s" % typename
 
-    def _byte_it(self, value):
-        index = -1
-        while value:
-            index += 1
-            will_yield = value % 2
-            value /= 2
-            if will_yield:
-                yield index
-
     def _list_it(self):
-        for word_index in range(self.n_words):
-            current = self.values[word_index]
-            if current:
-                for n in self._byte_it(current):
-                    yield ("[%d]" % (word_index * self.bits_per_word + n), 1)
+        for bit in range(self.bit_count):
+            word = math.floor(bit / self.bits_per_word)
+            word_bit = bit % self.bits_per_word
+            if self.values[word] & (1 << word_bit):
+                yield ("[%d]" % bit, 1)
 
     def __iter__(self):
         return self._list_it()

From cf3e126e6449255ef183df71f5c54020d7a87ee5 Mon Sep 17 00:00:00 2001
From: Petr Hosek <phosek@google.com>
Date: Mon, 29 Mar 2021 12:10:18 -0700
Subject: [PATCH 262/318] [libcxx] Make the GDB pretty printer test less strict

This is a workaround for PR48937. GDB can sometimes print additional
warnings which currently fails the test. Use re.search instead of
re.match to ignore this additional output.

Differential Revision: https://reviews.llvm.org/D99532

(cherry picked from commit 9ac988f6a80aa1dd25594a8b4c86c0380ac99466)
---
 libcxx/test/libcxx/gdb/gdb_pretty_printer_test.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libcxx/test/libcxx/gdb/gdb_pretty_printer_test.py b/libcxx/test/libcxx/gdb/gdb_pretty_printer_test.py
index af473c48ea9a..19e9a4a793bf 100644
--- a/libcxx/test/libcxx/gdb/gdb_pretty_printer_test.py
+++ b/libcxx/test/libcxx/gdb/gdb_pretty_printer_test.py
@@ -49,7 +49,7 @@ def invoke(self, arg, from_tty):
             expectation_val = compare_frame.read_var("expectation")
             check_literal = expectation_val.string(encoding="utf-8")
             if "PrettyPrintToRegex" in compare_frame.name():
-                test_fails = not re.match(check_literal, value)
+                test_fails = not re.search(check_literal, value)
             else:
                 test_fails = value != check_literal
 

From bf25180e6727f53b916c73e53212b274fe64ded6 Mon Sep 17 00:00:00 2001
From: Sterling Augustine <saugustine@google.com>
Date: Thu, 15 Apr 2021 18:03:01 -0700
Subject: [PATCH 263/318] Tolerate missing debug info in the shared_ptr pretty
 printer.

Certain fields of shared ptr have virtual functions and therefore
have their debug info homed in libc++. But if libc++ wasn't built
with debug info, the pretty printer would fail.

This patch makes the pretty printer tolerate such conditions and
updates the test harness.

This patch significantly reworks a previous attempt.

This addresses https://bugs.llvm.org/show_bug.cgi?id=48937

Differential Revision: https://reviews.llvm.org/D100610

(cherry picked from commit 55b7061116b5f0f839bd4240c3c6fba63918b816)
---
 .../libcxx/gdb/gdb_pretty_printer_test.py     |  1 -
 .../libcxx/gdb/gdb_pretty_printer_test.sh.cpp | 10 +++++----
 libcxx/utils/gdb/libcxx/printers.py           | 21 +++++++++++++------
 3 files changed, 21 insertions(+), 11 deletions(-)

diff --git a/libcxx/test/libcxx/gdb/gdb_pretty_printer_test.py b/libcxx/test/libcxx/gdb/gdb_pretty_printer_test.py
index 19e9a4a793bf..b9d00371418e 100644
--- a/libcxx/test/libcxx/gdb/gdb_pretty_printer_test.py
+++ b/libcxx/test/libcxx/gdb/gdb_pretty_printer_test.py
@@ -45,7 +45,6 @@ def invoke(self, arg, from_tty):
             # Ignore the convenience variable name and newline
             value = value_str[value_str.find("= ") + 2:-1]
             gdb.newest_frame().select()
-
             expectation_val = compare_frame.read_var("expectation")
             check_literal = expectation_val.string(encoding="utf-8")
             if "PrettyPrintToRegex" in compare_frame.name():
diff --git a/libcxx/test/libcxx/gdb/gdb_pretty_printer_test.sh.cpp b/libcxx/test/libcxx/gdb/gdb_pretty_printer_test.sh.cpp
index d8a02a384563..2d8e9620089a 100644
--- a/libcxx/test/libcxx/gdb/gdb_pretty_printer_test.sh.cpp
+++ b/libcxx/test/libcxx/gdb/gdb_pretty_printer_test.sh.cpp
@@ -608,25 +608,27 @@ void shared_ptr_test() {
   // due to which there is one more count for the pointer. Hence, all the
   // following tests are testing with expected count plus 1.
   std::shared_ptr<const int> test0 = std::make_shared<const int>(5);
+  // The python regular expression matcher treats newlines as significant, so
+  // these regular expressions should be on one line.
   ComparePrettyPrintToRegex(
       test0,
-      R"(std::shared_ptr<int> count 2, weak 0 containing = {__ptr_ = 0x[a-f0-9]+})");
+      R"(std::shared_ptr<int> count [2\?], weak [0\?]( \(libc\+\+ missing debug info\))? containing = {__ptr_ = 0x[a-f0-9]+})");
 
   std::shared_ptr<const int> test1(test0);
   ComparePrettyPrintToRegex(
       test1,
-      R"(std::shared_ptr<int> count 3, weak 0 containing = {__ptr_ = 0x[a-f0-9]+})");
+      R"(std::shared_ptr<int> count [3\?], weak [0\?]( \(libc\+\+ missing debug info\))? containing = {__ptr_ = 0x[a-f0-9]+})");
 
   {
     std::weak_ptr<const int> test2 = test1;
     ComparePrettyPrintToRegex(
         test0,
-        R"(std::shared_ptr<int> count 3, weak 1 containing = {__ptr_ = 0x[a-f0-9]+})");
+        R"(std::shared_ptr<int> count [3\?], weak [1\?]( \(libc\+\+ missing debug info\))? containing = {__ptr_ = 0x[a-f0-9]+})");
   }
 
   ComparePrettyPrintToRegex(
       test0,
-      R"(std::shared_ptr<int> count 3, weak 0 containing = {__ptr_ = 0x[a-f0-9]+})");
+      R"(std::shared_ptr<int> count [3\?], weak [0\?]( \(libc\+\+ missing debug info\))? containing = {__ptr_ = 0x[a-f0-9]+})");
 
   std::shared_ptr<const int> test3;
   ComparePrettyPrintToChars(test3, "std::shared_ptr is nullptr");
diff --git a/libcxx/utils/gdb/libcxx/printers.py b/libcxx/utils/gdb/libcxx/printers.py
index 9b413a86b159..9d9a96a3e36f 100644
--- a/libcxx/utils/gdb/libcxx/printers.py
+++ b/libcxx/utils/gdb/libcxx/printers.py
@@ -312,12 +312,21 @@ def to_string(self):
             return "%s is nullptr" % typename
         refcount = self.val["__cntrl_"]
         if refcount != 0:
-            usecount = refcount["__shared_owners_"] + 1
-            weakcount = refcount["__shared_weak_owners_"]
-            if usecount == 0:
-                state = "expired, weak %d" % weakcount
-            else:
-                state = "count %d, weak %d" % (usecount, weakcount)
+            try:
+                usecount = refcount["__shared_owners_"] + 1
+                weakcount = refcount["__shared_weak_owners_"]
+                if usecount == 0:
+                    state = "expired, weak %d" % weakcount
+                else:
+                    state = "count %d, weak %d" % (usecount, weakcount)
+            except:
+                # Debug info for a class with virtual functions is emitted
+                # in the same place as its key function. That means that
+                # for std::shared_ptr, __shared_owners_ is emitted into
+                # into libcxx.[so|a] itself, rather than into the shared_ptr
+                # instantiation point. So if libcxx.so was built without
+                # debug info, these fields will be missing.
+                state = "count ?, weak ? (libc++ missing debug info)"
         return "%s<%s> %s containing" % (typename, pointee_type, state)
 
     def __iter__(self):

From 1539c543dbe5edb9d6809aa8ad60b32dba5888d7 Mon Sep 17 00:00:00 2001
From: Sterling Augustine <saugustine@google.com>
Date: Thu, 15 Apr 2021 18:03:01 -0700
Subject: [PATCH 264/318] Don't fail the shared_ptr test if libc++ has
 insufficient debug info.

Don't fail the shared_ptr test if libc++ has insufficient debug info.

This addresses https://bugs.llvm.org/show_bug.cgi?id=48937

Differential Revision: https://reviews.llvm.org/D100610

(cherry picked from commit 0e8378032597bcaccb948de88e965ad75bfaeb7b)
---
 .../test/libcxx/gdb/gdb_pretty_printer_test.py  | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/libcxx/test/libcxx/gdb/gdb_pretty_printer_test.py b/libcxx/test/libcxx/gdb/gdb_pretty_printer_test.py
index b9d00371418e..4621e6388343 100644
--- a/libcxx/test/libcxx/gdb/gdb_pretty_printer_test.py
+++ b/libcxx/test/libcxx/gdb/gdb_pretty_printer_test.py
@@ -37,6 +37,23 @@ def invoke(self, arg, from_tty):
             compare_frame = gdb.newest_frame().older()
             testcase_frame = compare_frame.older()
             test_loc = testcase_frame.find_sal()
+
+            expectation_val = compare_frame.read_var("expectation")
+            check_literal = expectation_val.string(encoding="utf-8")
+
+            # Heuristic to determine if libc++ itself has debug
+            # info. If it doesn't, then anything normally homed there
+            # won't be found, and the printer will error. We don't
+            # want to fail the test in this case--the printer itself
+            # is probably fine, or at least we can't tell.
+            if check_literal.startswith("std::shared_ptr"):
+                shared_ptr = compare_frame.read_var("value")
+                if not "__shared_owners_" in shared_ptr.type.fields():
+                    print("IGNORED (no debug info in libc++): " +
+                          test_loc.symtab.filename + ":" +
+                          str(test_loc.line))
+                    return
+
             # Use interactive commands in the correct context to get the pretty
             # printed version
 

From fec90b2cebc3d1bb7d9562bec5170d697db39a84 Mon Sep 17 00:00:00 2001
From: Dimitry Andric <dimitry@andric.com>
Date: Tue, 4 May 2021 16:58:54 +0200
Subject: [PATCH 265/318] Reland "[MC][ELF] Work around R_MIPS_LO16 relocation
 handling problem"

This fixes PR49821, and avoids "ld.lld: error: test.o:(.rodata.str1.1):
offset is outside the section" errors when linking MIPS objects with
negative R_MIPS_LO16 implicit addends.

ld.lld handles R_MIPS_HI16/R_MIPS_LO16 separately, not as a whole, so it
doesn't know that an R_MIPS_HI16 with implicit addend 1 and an
R_MIPS_LO16 with implicit addend -32768 represents 32768, which is in
range of a MergeInputSection. We could introduce a new RelExpr member
(like R_RISCV_PC_INDIRECT for R_RISCV_PCREL_HI20 / R_RISCV_PCREL_LO12)
but the complexity is unnecessary given that GNU as keeps the original
symbol for this case as well.

Adds a new test case for PR49821, and also updates two other test cases
that are affected by this change.

Reviewed By: atanasyan, MaskRay

Differential Revision: https://reviews.llvm.org/D101773

(cherry picked from commit 7e83a7f1fdfcc2edde61f0a535f9d7a56f531db9)
---
 llvm/lib/MC/ELFObjectWriter.cpp | 11 +++++++++++
 llvm/test/MC/Mips/elf-relsym.s  | 10 ++++++++--
 llvm/test/MC/Mips/mips_lo16.s   | 22 ++++++++++++++++++++++
 llvm/test/MC/Mips/xgot.s        |  4 ++--
 4 files changed, 43 insertions(+), 4 deletions(-)
 create mode 100644 llvm/test/MC/Mips/mips_lo16.s

diff --git a/llvm/lib/MC/ELFObjectWriter.cpp b/llvm/lib/MC/ELFObjectWriter.cpp
index 69307b617552..2d810ffd350b 100644
--- a/llvm/lib/MC/ELFObjectWriter.cpp
+++ b/llvm/lib/MC/ELFObjectWriter.cpp
@@ -1397,6 +1397,17 @@ bool ELFObjectWriter::shouldRelocateWithSymbol(const MCAssembler &Asm,
       if (TargetObjectWriter->getEMachine() == ELF::EM_386 &&
           Type == ELF::R_386_GOTOFF)
         return true;
+
+      // ld.lld handles R_MIPS_HI16/R_MIPS_LO16 separately, not as a whole, so
+      // it doesn't know that an R_MIPS_HI16 with implicit addend 1 and an
+      // R_MIPS_LO16 with implicit addend -32768 represents 32768, which is in
+      // range of a MergeInputSection. We could introduce a new RelExpr member
+      // (like R_RISCV_PC_INDIRECT for R_RISCV_PCREL_HI20 / R_RISCV_PCREL_LO12)
+      // but the complexity is unnecessary given that GNU as keeps the original
+      // symbol for this case as well.
+      if (TargetObjectWriter->getEMachine() == ELF::EM_MIPS &&
+          !hasRelocationAddend())
+        return true;
     }
 
     // Most TLS relocations use a got, so they need the symbol. Even those that
diff --git a/llvm/test/MC/Mips/elf-relsym.s b/llvm/test/MC/Mips/elf-relsym.s
index d19b4e3c4820..b8c2f89e82e6 100644
--- a/llvm/test/MC/Mips/elf-relsym.s
+++ b/llvm/test/MC/Mips/elf-relsym.s
@@ -4,10 +4,16 @@
 
 // CHECK: Symbols [
 // CHECK:   Symbol {
-// CHECK:     Name: .rodata.cst8
+// CHECK:     Name: $.str
 // CHECK:   }
 // CHECK:   Symbol {
-// CHECK:     Name: .rodata.str1.1
+// CHECK:     Name: $.str1
+// CHECK:   }
+// CHECK:   Symbol {
+// CHECK:     Name: $CPI0_0
+// CHECK:   }
+// CHECK:   Symbol {
+// CHECK:     Name: $CPI0_1
 // CHECK:   }
 // CHECK: ]
 
diff --git a/llvm/test/MC/Mips/mips_lo16.s b/llvm/test/MC/Mips/mips_lo16.s
new file mode 100644
index 000000000000..a400f9206116
--- /dev/null
+++ b/llvm/test/MC/Mips/mips_lo16.s
@@ -0,0 +1,22 @@
+# PR49821: Check that R_MIPS_LO16 relocs do not wrap around with large addends.
+
+# RUN: llvm-mc %s -triple mips-unknown-unknown -filetype=obj | \
+# RUN:	 llvm-objdump -d -r --no-show-raw-insn - | \
+# RUN:   FileCheck -check-prefix=MIPS32 %s
+
+# RUN: llvm-mc %s -triple mips64-unknown-unknown -filetype=obj | \
+# RUN:	 llvm-objdump -d -r --no-show-raw-insn - | \
+# RUN:   FileCheck -check-prefix=MIPS64 %s
+
+	.text
+foo:
+	lui	$2, %hi(bar)
+# MIPS32: 00000000:  R_MIPS_HI16  bar
+# MIPS64: 0000000000000000:  R_MIPS_HI16/R_MIPS_NONE/R_MIPS_NONE	.rodata.str1.1+0x8000
+	addiu	$2, $2, %lo(bar)
+# MIPS32: 00000004:  R_MIPS_LO16  bar
+# MIPS64: 0000000000000004:  R_MIPS_LO16/R_MIPS_NONE/R_MIPS_NONE	.rodata.str1.1+0x8000
+	.section	.rodata.str1.1,"aMS",@progbits,1
+	.zero 0x8000
+bar:
+	.asciz	"hello"
diff --git a/llvm/test/MC/Mips/xgot.s b/llvm/test/MC/Mips/xgot.s
index 0c29582d681c..100d25e67223 100644
--- a/llvm/test/MC/Mips/xgot.s
+++ b/llvm/test/MC/Mips/xgot.s
@@ -10,8 +10,8 @@
 // CHECK:   0x1C R_MIPS_GOT_LO16 ext_1
 // CHECK:   0x24 R_MIPS_CALL_HI16 printf
 // CHECK:   0x30 R_MIPS_CALL_LO16 printf
-// CHECK:   0x2C R_MIPS_GOT16 .rodata.str1.1
-// CHECK:   0x38 R_MIPS_LO16 .rodata.str1.1
+// CHECK:   0x2C R_MIPS_GOT16 $.str
+// CHECK:   0x38 R_MIPS_LO16 $.str
 // CHECK: ]
 
 	.text

From 28730bc82ac00125d1f5894464294583abf786ef Mon Sep 17 00:00:00 2001
From: LemonBoy <thatlemon@gmail.com>
Date: Sat, 1 May 2021 17:13:50 +0200
Subject: [PATCH 266/318] [AArch64] Prevent spilling between ldxr/stxr pairs

Apply the same logic used to check if CMPXCHG nodes should be expanded
at -O0: the register allocator may end up spilling some register in
between the atomic load/store pairs, breaking the atomicity and possibly
stalling the execution.

Fixes PR48017

Reviewed By: efriedman

Differential Revision: https://reviews.llvm.org/D101163

(cherry picked from commit 4751cadcca45984d7671e594ce95aed8fe030bf1)
---
 .../Target/AArch64/AArch64ISelLowering.cpp    |  43 +-
 llvm/test/CodeGen/AArch64/atomicrmw-O0.ll     | 697 ++++++++++++++++++
 .../AArch64/expand-atomicrmw-xchg-fp.ll       |   4 +-
 3 files changed, 726 insertions(+), 18 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/atomicrmw-O0.ll

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 1451151f4dc5..c522ee76626d 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -16335,25 +16335,36 @@ AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
 
   unsigned Size = AI->getType()->getPrimitiveSizeInBits();
   if (Size > 128) return AtomicExpansionKind::None;
-  // Nand not supported in LSE.
-  if (AI->getOperation() == AtomicRMWInst::Nand) return AtomicExpansionKind::LLSC;
-  // Leave 128 bits to LLSC.
-  if (Subtarget->hasLSE() && Size < 128)
-    return AtomicExpansionKind::None;
-  if (Subtarget->outlineAtomics() && Size < 128) {
-    // [U]Min/[U]Max RWM atomics are used in __sync_fetch_ libcalls so far.
-    // Don't outline them unless
-    // (1) high level <atomic> support approved:
-    //   http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2020/p0493r1.pdf
-    // (2) low level libgcc and compiler-rt support implemented by:
-    //   min/max outline atomics helpers
-    if (AI->getOperation() != AtomicRMWInst::Min &&
-        AI->getOperation() != AtomicRMWInst::Max &&
-        AI->getOperation() != AtomicRMWInst::UMin &&
-        AI->getOperation() != AtomicRMWInst::UMax) {
+
+  // Nand is not supported in LSE.
+  // Leave 128 bits to LLSC or CmpXChg.
+  if (AI->getOperation() != AtomicRMWInst::Nand && Size < 128) {
+    if (Subtarget->hasLSE())
       return AtomicExpansionKind::None;
+    if (Subtarget->outlineAtomics()) {
+      // [U]Min/[U]Max RWM atomics are used in __sync_fetch_ libcalls so far.
+      // Don't outline them unless
+      // (1) high level <atomic> support approved:
+      //   http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2020/p0493r1.pdf
+      // (2) low level libgcc and compiler-rt support implemented by:
+      //   min/max outline atomics helpers
+      if (AI->getOperation() != AtomicRMWInst::Min &&
+          AI->getOperation() != AtomicRMWInst::Max &&
+          AI->getOperation() != AtomicRMWInst::UMin &&
+          AI->getOperation() != AtomicRMWInst::UMax) {
+        return AtomicExpansionKind::None;
+      }
     }
   }
+
+  // At -O0, fast-regalloc cannot cope with the live vregs necessary to
+  // implement atomicrmw without spilling. If the target address is also on the
+  // stack and close enough to the spill slot, this can lead to a situation
+  // where the monitor always gets cleared and the atomic operation can never
+  // succeed. So at -O0 lower this operation to a CAS loop.
+  if (getTargetMachine().getOptLevel() == CodeGenOpt::None)
+    return AtomicExpansionKind::CmpXChg;
+
   return AtomicExpansionKind::LLSC;
 }
 
diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-O0.ll b/llvm/test/CodeGen/AArch64/atomicrmw-O0.ll
new file mode 100644
index 000000000000..d3a1f144b851
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/atomicrmw-O0.ll
@@ -0,0 +1,697 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -mtriple=aarch64-- -O0 -fast-isel=0 -global-isel=false %s -o - | FileCheck %s -check-prefix=NOLSE
+; RUN: llc -verify-machineinstrs -mtriple=aarch64-- -mattr=+lse -O0 -fast-isel=0 -global-isel=false %s -o - | FileCheck %s -check-prefix=LSE
+
+; Ensure there's no stack spill in between ldxr/stxr pairs.
+
+define i8 @test_rmw_add_8(i8* %dst)   {
+; NOLSE-LABEL: test_rmw_add_8:
+; NOLSE:       // %bb.0: // %entry
+; NOLSE-NEXT:    sub sp, sp, #32 // =32
+; NOLSE-NEXT:    .cfi_def_cfa_offset 32
+; NOLSE-NEXT:    str x0, [sp, #16] // 8-byte Folded Spill
+; NOLSE-NEXT:    ldrb w8, [x0]
+; NOLSE-NEXT:    str w8, [sp, #28] // 4-byte Folded Spill
+; NOLSE-NEXT:    b .LBB0_1
+; NOLSE-NEXT:  .LBB0_1: // %atomicrmw.start
+; NOLSE-NEXT:    // =>This Loop Header: Depth=1
+; NOLSE-NEXT:    // Child Loop BB0_2 Depth 2
+; NOLSE-NEXT:    ldr w9, [sp, #28] // 4-byte Folded Reload
+; NOLSE-NEXT:    ldr x11, [sp, #16] // 8-byte Folded Reload
+; NOLSE-NEXT:    add w12, w9, #1 // =1
+; NOLSE-NEXT:  .LBB0_2: // %atomicrmw.start
+; NOLSE-NEXT:    // Parent Loop BB0_1 Depth=1
+; NOLSE-NEXT:    // => This Inner Loop Header: Depth=2
+; NOLSE-NEXT:    ldaxrb w8, [x11]
+; NOLSE-NEXT:    cmp w8, w9, uxtb
+; NOLSE-NEXT:    b.ne .LBB0_4
+; NOLSE-NEXT:  // %bb.3: // %atomicrmw.start
+; NOLSE-NEXT:    // in Loop: Header=BB0_2 Depth=2
+; NOLSE-NEXT:    stlxrb w10, w12, [x11]
+; NOLSE-NEXT:    cbnz w10, .LBB0_2
+; NOLSE-NEXT:  .LBB0_4: // %atomicrmw.start
+; NOLSE-NEXT:    // in Loop: Header=BB0_1 Depth=1
+; NOLSE-NEXT:    subs w9, w8, w9, uxtb
+; NOLSE-NEXT:    cset w9, eq
+; NOLSE-NEXT:    str w8, [sp, #12] // 4-byte Folded Spill
+; NOLSE-NEXT:    subs w9, w9, #1 // =1
+; NOLSE-NEXT:    str w8, [sp, #28] // 4-byte Folded Spill
+; NOLSE-NEXT:    b.ne .LBB0_1
+; NOLSE-NEXT:    b .LBB0_5
+; NOLSE-NEXT:  .LBB0_5: // %atomicrmw.end
+; NOLSE-NEXT:    ldr w0, [sp, #12] // 4-byte Folded Reload
+; NOLSE-NEXT:    add sp, sp, #32 // =32
+; NOLSE-NEXT:    ret
+;
+; LSE-LABEL: test_rmw_add_8:
+; LSE:       // %bb.0: // %entry
+; LSE-NEXT:    mov w8, #1
+; LSE-NEXT:    ldaddalb w8, w0, [x0]
+; LSE-NEXT:    ret
+entry:
+  %res = atomicrmw add i8* %dst, i8 1 seq_cst
+  ret i8 %res
+}
+
+define i16 @test_rmw_add_16(i16* %dst)   {
+; NOLSE-LABEL: test_rmw_add_16:
+; NOLSE:       // %bb.0: // %entry
+; NOLSE-NEXT:    sub sp, sp, #32 // =32
+; NOLSE-NEXT:    .cfi_def_cfa_offset 32
+; NOLSE-NEXT:    str x0, [sp, #16] // 8-byte Folded Spill
+; NOLSE-NEXT:    ldrh w8, [x0]
+; NOLSE-NEXT:    str w8, [sp, #28] // 4-byte Folded Spill
+; NOLSE-NEXT:    b .LBB1_1
+; NOLSE-NEXT:  .LBB1_1: // %atomicrmw.start
+; NOLSE-NEXT:    // =>This Loop Header: Depth=1
+; NOLSE-NEXT:    // Child Loop BB1_2 Depth 2
+; NOLSE-NEXT:    ldr w9, [sp, #28] // 4-byte Folded Reload
+; NOLSE-NEXT:    ldr x11, [sp, #16] // 8-byte Folded Reload
+; NOLSE-NEXT:    add w12, w9, #1 // =1
+; NOLSE-NEXT:  .LBB1_2: // %atomicrmw.start
+; NOLSE-NEXT:    // Parent Loop BB1_1 Depth=1
+; NOLSE-NEXT:    // => This Inner Loop Header: Depth=2
+; NOLSE-NEXT:    ldaxrh w8, [x11]
+; NOLSE-NEXT:    cmp w8, w9, uxth
+; NOLSE-NEXT:    b.ne .LBB1_4
+; NOLSE-NEXT:  // %bb.3: // %atomicrmw.start
+; NOLSE-NEXT:    // in Loop: Header=BB1_2 Depth=2
+; NOLSE-NEXT:    stlxrh w10, w12, [x11]
+; NOLSE-NEXT:    cbnz w10, .LBB1_2
+; NOLSE-NEXT:  .LBB1_4: // %atomicrmw.start
+; NOLSE-NEXT:    // in Loop: Header=BB1_1 Depth=1
+; NOLSE-NEXT:    subs w9, w8, w9, uxth
+; NOLSE-NEXT:    cset w9, eq
+; NOLSE-NEXT:    str w8, [sp, #12] // 4-byte Folded Spill
+; NOLSE-NEXT:    subs w9, w9, #1 // =1
+; NOLSE-NEXT:    str w8, [sp, #28] // 4-byte Folded Spill
+; NOLSE-NEXT:    b.ne .LBB1_1
+; NOLSE-NEXT:    b .LBB1_5
+; NOLSE-NEXT:  .LBB1_5: // %atomicrmw.end
+; NOLSE-NEXT:    ldr w0, [sp, #12] // 4-byte Folded Reload
+; NOLSE-NEXT:    add sp, sp, #32 // =32
+; NOLSE-NEXT:    ret
+;
+; LSE-LABEL: test_rmw_add_16:
+; LSE:       // %bb.0: // %entry
+; LSE-NEXT:    mov w8, #1
+; LSE-NEXT:    ldaddalh w8, w0, [x0]
+; LSE-NEXT:    ret
+entry:
+  %res = atomicrmw add i16* %dst, i16 1 seq_cst
+  ret i16 %res
+}
+
+define i32 @test_rmw_add_32(i32* %dst)   {
+; NOLSE-LABEL: test_rmw_add_32:
+; NOLSE:       // %bb.0: // %entry
+; NOLSE-NEXT:    sub sp, sp, #32 // =32
+; NOLSE-NEXT:    .cfi_def_cfa_offset 32
+; NOLSE-NEXT:    str x0, [sp, #16] // 8-byte Folded Spill
+; NOLSE-NEXT:    ldr w8, [x0]
+; NOLSE-NEXT:    str w8, [sp, #28] // 4-byte Folded Spill
+; NOLSE-NEXT:    b .LBB2_1
+; NOLSE-NEXT:  .LBB2_1: // %atomicrmw.start
+; NOLSE-NEXT:    // =>This Loop Header: Depth=1
+; NOLSE-NEXT:    // Child Loop BB2_2 Depth 2
+; NOLSE-NEXT:    ldr w9, [sp, #28] // 4-byte Folded Reload
+; NOLSE-NEXT:    ldr x11, [sp, #16] // 8-byte Folded Reload
+; NOLSE-NEXT:    add w12, w9, #1 // =1
+; NOLSE-NEXT:  .LBB2_2: // %atomicrmw.start
+; NOLSE-NEXT:    // Parent Loop BB2_1 Depth=1
+; NOLSE-NEXT:    // => This Inner Loop Header: Depth=2
+; NOLSE-NEXT:    ldaxr w8, [x11]
+; NOLSE-NEXT:    cmp w8, w9
+; NOLSE-NEXT:    b.ne .LBB2_4
+; NOLSE-NEXT:  // %bb.3: // %atomicrmw.start
+; NOLSE-NEXT:    // in Loop: Header=BB2_2 Depth=2
+; NOLSE-NEXT:    stlxr w10, w12, [x11]
+; NOLSE-NEXT:    cbnz w10, .LBB2_2
+; NOLSE-NEXT:  .LBB2_4: // %atomicrmw.start
+; NOLSE-NEXT:    // in Loop: Header=BB2_1 Depth=1
+; NOLSE-NEXT:    subs w9, w8, w9
+; NOLSE-NEXT:    cset w9, eq
+; NOLSE-NEXT:    str w8, [sp, #12] // 4-byte Folded Spill
+; NOLSE-NEXT:    subs w9, w9, #1 // =1
+; NOLSE-NEXT:    str w8, [sp, #28] // 4-byte Folded Spill
+; NOLSE-NEXT:    b.ne .LBB2_1
+; NOLSE-NEXT:    b .LBB2_5
+; NOLSE-NEXT:  .LBB2_5: // %atomicrmw.end
+; NOLSE-NEXT:    ldr w0, [sp, #12] // 4-byte Folded Reload
+; NOLSE-NEXT:    add sp, sp, #32 // =32
+; NOLSE-NEXT:    ret
+;
+; LSE-LABEL: test_rmw_add_32:
+; LSE:       // %bb.0: // %entry
+; LSE-NEXT:    mov w8, #1
+; LSE-NEXT:    ldaddal w8, w0, [x0]
+; LSE-NEXT:    ret
+entry:
+  %res = atomicrmw add i32* %dst, i32 1 seq_cst
+  ret i32 %res
+}
+
+define i64 @test_rmw_add_64(i64* %dst)   {
+; NOLSE-LABEL: test_rmw_add_64:
+; NOLSE:       // %bb.0: // %entry
+; NOLSE-NEXT:    sub sp, sp, #32 // =32
+; NOLSE-NEXT:    .cfi_def_cfa_offset 32
+; NOLSE-NEXT:    str x0, [sp, #16] // 8-byte Folded Spill
+; NOLSE-NEXT:    ldr x8, [x0]
+; NOLSE-NEXT:    str x8, [sp, #24] // 8-byte Folded Spill
+; NOLSE-NEXT:    b .LBB3_1
+; NOLSE-NEXT:  .LBB3_1: // %atomicrmw.start
+; NOLSE-NEXT:    // =>This Loop Header: Depth=1
+; NOLSE-NEXT:    // Child Loop BB3_2 Depth 2
+; NOLSE-NEXT:    ldr x9, [sp, #24] // 8-byte Folded Reload
+; NOLSE-NEXT:    ldr x11, [sp, #16] // 8-byte Folded Reload
+; NOLSE-NEXT:    add x12, x9, #1 // =1
+; NOLSE-NEXT:  .LBB3_2: // %atomicrmw.start
+; NOLSE-NEXT:    // Parent Loop BB3_1 Depth=1
+; NOLSE-NEXT:    // => This Inner Loop Header: Depth=2
+; NOLSE-NEXT:    ldaxr x8, [x11]
+; NOLSE-NEXT:    cmp x8, x9
+; NOLSE-NEXT:    b.ne .LBB3_4
+; NOLSE-NEXT:  // %bb.3: // %atomicrmw.start
+; NOLSE-NEXT:    // in Loop: Header=BB3_2 Depth=2
+; NOLSE-NEXT:    stlxr w10, x12, [x11]
+; NOLSE-NEXT:    cbnz w10, .LBB3_2
+; NOLSE-NEXT:  .LBB3_4: // %atomicrmw.start
+; NOLSE-NEXT:    // in Loop: Header=BB3_1 Depth=1
+; NOLSE-NEXT:    subs x9, x8, x9
+; NOLSE-NEXT:    cset w9, eq
+; NOLSE-NEXT:    str x8, [sp, #8] // 8-byte Folded Spill
+; NOLSE-NEXT:    subs w9, w9, #1 // =1
+; NOLSE-NEXT:    str x8, [sp, #24] // 8-byte Folded Spill
+; NOLSE-NEXT:    b.ne .LBB3_1
+; NOLSE-NEXT:    b .LBB3_5
+; NOLSE-NEXT:  .LBB3_5: // %atomicrmw.end
+; NOLSE-NEXT:    ldr x0, [sp, #8] // 8-byte Folded Reload
+; NOLSE-NEXT:    add sp, sp, #32 // =32
+; NOLSE-NEXT:    ret
+;
+; LSE-LABEL: test_rmw_add_64:
+; LSE:       // %bb.0: // %entry
+; LSE-NEXT:    mov w8, #1
+; LSE-NEXT:    // kill: def $x8 killed $w8
+; LSE-NEXT:    ldaddal x8, x0, [x0]
+; LSE-NEXT:    ret
+entry:
+  %res = atomicrmw add i64* %dst, i64 1 seq_cst
+  ret i64 %res
+}
+
+define i128 @test_rmw_add_128(i128* %dst)   {
+; NOLSE-LABEL: test_rmw_add_128:
+; NOLSE:       // %bb.0: // %entry
+; NOLSE-NEXT:    sub sp, sp, #48 // =48
+; NOLSE-NEXT:    .cfi_def_cfa_offset 48
+; NOLSE-NEXT:    str x0, [sp, #24] // 8-byte Folded Spill
+; NOLSE-NEXT:    ldr x8, [x0, #8]
+; NOLSE-NEXT:    ldr x9, [x0]
+; NOLSE-NEXT:    str x9, [sp, #32] // 8-byte Folded Spill
+; NOLSE-NEXT:    str x8, [sp, #40] // 8-byte Folded Spill
+; NOLSE-NEXT:    b .LBB4_1
+; NOLSE-NEXT:  .LBB4_1: // %atomicrmw.start
+; NOLSE-NEXT:    // =>This Loop Header: Depth=1
+; NOLSE-NEXT:    // Child Loop BB4_2 Depth 2
+; NOLSE-NEXT:    ldr x11, [sp, #40] // 8-byte Folded Reload
+; NOLSE-NEXT:    ldr x8, [sp, #32] // 8-byte Folded Reload
+; NOLSE-NEXT:    ldr x13, [sp, #24] // 8-byte Folded Reload
+; NOLSE-NEXT:    adds x14, x8, #1 // =1
+; NOLSE-NEXT:    mov x9, xzr
+; NOLSE-NEXT:    adcs x15, x11, x9
+; NOLSE-NEXT:  .LBB4_2: // %atomicrmw.start
+; NOLSE-NEXT:    // Parent Loop BB4_1 Depth=1
+; NOLSE-NEXT:    // => This Inner Loop Header: Depth=2
+; NOLSE-NEXT:    ldaxp x10, x9, [x13]
+; NOLSE-NEXT:    cmp x10, x8
+; NOLSE-NEXT:    cset w12, ne
+; NOLSE-NEXT:    cmp x9, x11
+; NOLSE-NEXT:    cinc w12, w12, ne
+; NOLSE-NEXT:    cbnz w12, .LBB4_4
+; NOLSE-NEXT:  // %bb.3: // %atomicrmw.start
+; NOLSE-NEXT:    // in Loop: Header=BB4_2 Depth=2
+; NOLSE-NEXT:    stlxp w12, x14, x15, [x13]
+; NOLSE-NEXT:    cbnz w12, .LBB4_2
+; NOLSE-NEXT:  .LBB4_4: // %atomicrmw.start
+; NOLSE-NEXT:    // in Loop: Header=BB4_1 Depth=1
+; NOLSE-NEXT:    eor x11, x9, x11
+; NOLSE-NEXT:    eor x8, x10, x8
+; NOLSE-NEXT:    orr x8, x8, x11
+; NOLSE-NEXT:    str x9, [sp, #8] // 8-byte Folded Spill
+; NOLSE-NEXT:    str x10, [sp, #16] // 8-byte Folded Spill
+; NOLSE-NEXT:    str x10, [sp, #32] // 8-byte Folded Spill
+; NOLSE-NEXT:    str x9, [sp, #40] // 8-byte Folded Spill
+; NOLSE-NEXT:    cbnz x8, .LBB4_1
+; NOLSE-NEXT:    b .LBB4_5
+; NOLSE-NEXT:  .LBB4_5: // %atomicrmw.end
+; NOLSE-NEXT:    ldr x1, [sp, #8] // 8-byte Folded Reload
+; NOLSE-NEXT:    ldr x0, [sp, #16] // 8-byte Folded Reload
+; NOLSE-NEXT:    add sp, sp, #48 // =48
+; NOLSE-NEXT:    ret
+;
+; LSE-LABEL: test_rmw_add_128:
+; LSE:       // %bb.0: // %entry
+; LSE-NEXT:    sub sp, sp, #80 // =80
+; LSE-NEXT:    .cfi_def_cfa_offset 80
+; LSE-NEXT:    str x0, [sp, #56] // 8-byte Folded Spill
+; LSE-NEXT:    ldr x8, [x0, #8]
+; LSE-NEXT:    ldr x9, [x0]
+; LSE-NEXT:    str x9, [sp, #64] // 8-byte Folded Spill
+; LSE-NEXT:    str x8, [sp, #72] // 8-byte Folded Spill
+; LSE-NEXT:    b .LBB4_1
+; LSE-NEXT:  .LBB4_1: // %atomicrmw.start
+; LSE-NEXT:    // =>This Inner Loop Header: Depth=1
+; LSE-NEXT:    ldr x10, [sp, #72] // 8-byte Folded Reload
+; LSE-NEXT:    ldr x8, [sp, #64] // 8-byte Folded Reload
+; LSE-NEXT:    ldr x9, [sp, #56] // 8-byte Folded Reload
+; LSE-NEXT:    adds x2, x8, #1 // =1
+; LSE-NEXT:    mov x11, xzr
+; LSE-NEXT:    adcs x11, x10, x11
+; LSE-NEXT:    // kill: def $x2 killed $x2 def $x2_x3
+; LSE-NEXT:    mov x3, x11
+; LSE-NEXT:    mov x0, x8
+; LSE-NEXT:    mov x1, x10
+; LSE-NEXT:    stp x0, x1, [sp, #8] // 16-byte Folded Spill
+; LSE-NEXT:    caspal x0, x1, x2, x3, [x9]
+; LSE-NEXT:    stp x0, x1, [sp, #24] // 16-byte Folded Spill
+; LSE-NEXT:    mov x9, x1
+; LSE-NEXT:    str x9, [sp, #40] // 8-byte Folded Spill
+; LSE-NEXT:    eor x11, x9, x10
+; LSE-NEXT:    mov x10, x0
+; LSE-NEXT:    str x10, [sp, #48] // 8-byte Folded Spill
+; LSE-NEXT:    eor x8, x10, x8
+; LSE-NEXT:    orr x8, x8, x11
+; LSE-NEXT:    str x10, [sp, #64] // 8-byte Folded Spill
+; LSE-NEXT:    str x9, [sp, #72] // 8-byte Folded Spill
+; LSE-NEXT:    cbnz x8, .LBB4_1
+; LSE-NEXT:    b .LBB4_2
+; LSE-NEXT:  .LBB4_2: // %atomicrmw.end
+; LSE-NEXT:    ldr x1, [sp, #40] // 8-byte Folded Reload
+; LSE-NEXT:    ldr x0, [sp, #48] // 8-byte Folded Reload
+; LSE-NEXT:    add sp, sp, #80 // =80
+; LSE-NEXT:    ret
+entry:
+  %res = atomicrmw add i128* %dst, i128 1 seq_cst
+  ret i128 %res
+}
+define i8 @test_rmw_nand_8(i8* %dst)   {
+; NOLSE-LABEL: test_rmw_nand_8:
+; NOLSE:       // %bb.0: // %entry
+; NOLSE-NEXT:    sub sp, sp, #32 // =32
+; NOLSE-NEXT:    .cfi_def_cfa_offset 32
+; NOLSE-NEXT:    str x0, [sp, #16] // 8-byte Folded Spill
+; NOLSE-NEXT:    ldrb w8, [x0]
+; NOLSE-NEXT:    str w8, [sp, #28] // 4-byte Folded Spill
+; NOLSE-NEXT:    b .LBB5_1
+; NOLSE-NEXT:  .LBB5_1: // %atomicrmw.start
+; NOLSE-NEXT:    // =>This Loop Header: Depth=1
+; NOLSE-NEXT:    // Child Loop BB5_2 Depth 2
+; NOLSE-NEXT:    ldr w9, [sp, #28] // 4-byte Folded Reload
+; NOLSE-NEXT:    ldr x11, [sp, #16] // 8-byte Folded Reload
+; NOLSE-NEXT:    mvn w8, w9
+; NOLSE-NEXT:    orr w12, w8, #0xfffffffe
+; NOLSE-NEXT:  .LBB5_2: // %atomicrmw.start
+; NOLSE-NEXT:    // Parent Loop BB5_1 Depth=1
+; NOLSE-NEXT:    // => This Inner Loop Header: Depth=2
+; NOLSE-NEXT:    ldaxrb w8, [x11]
+; NOLSE-NEXT:    cmp w8, w9, uxtb
+; NOLSE-NEXT:    b.ne .LBB5_4
+; NOLSE-NEXT:  // %bb.3: // %atomicrmw.start
+; NOLSE-NEXT:    // in Loop: Header=BB5_2 Depth=2
+; NOLSE-NEXT:    stlxrb w10, w12, [x11]
+; NOLSE-NEXT:    cbnz w10, .LBB5_2
+; NOLSE-NEXT:  .LBB5_4: // %atomicrmw.start
+; NOLSE-NEXT:    // in Loop: Header=BB5_1 Depth=1
+; NOLSE-NEXT:    subs w9, w8, w9, uxtb
+; NOLSE-NEXT:    cset w9, eq
+; NOLSE-NEXT:    str w8, [sp, #12] // 4-byte Folded Spill
+; NOLSE-NEXT:    subs w9, w9, #1 // =1
+; NOLSE-NEXT:    str w8, [sp, #28] // 4-byte Folded Spill
+; NOLSE-NEXT:    b.ne .LBB5_1
+; NOLSE-NEXT:    b .LBB5_5
+; NOLSE-NEXT:  .LBB5_5: // %atomicrmw.end
+; NOLSE-NEXT:    ldr w0, [sp, #12] // 4-byte Folded Reload
+; NOLSE-NEXT:    add sp, sp, #32 // =32
+; NOLSE-NEXT:    ret
+;
+; LSE-LABEL: test_rmw_nand_8:
+; LSE:       // %bb.0: // %entry
+; LSE-NEXT:    sub sp, sp, #32 // =32
+; LSE-NEXT:    .cfi_def_cfa_offset 32
+; LSE-NEXT:    str x0, [sp, #16] // 8-byte Folded Spill
+; LSE-NEXT:    ldrb w8, [x0]
+; LSE-NEXT:    str w8, [sp, #28] // 4-byte Folded Spill
+; LSE-NEXT:    b .LBB5_1
+; LSE-NEXT:  .LBB5_1: // %atomicrmw.start
+; LSE-NEXT:    // =>This Inner Loop Header: Depth=1
+; LSE-NEXT:    ldr w9, [sp, #28] // 4-byte Folded Reload
+; LSE-NEXT:    ldr x11, [sp, #16] // 8-byte Folded Reload
+; LSE-NEXT:    mvn w8, w9
+; LSE-NEXT:    orr w10, w8, #0xfffffffe
+; LSE-NEXT:    mov w8, w9
+; LSE-NEXT:    casalb w8, w10, [x11]
+; LSE-NEXT:    str w8, [sp, #8] // 4-byte Folded Spill
+; LSE-NEXT:    subs w9, w8, w9, uxtb
+; LSE-NEXT:    cset w9, eq
+; LSE-NEXT:    str w8, [sp, #12] // 4-byte Folded Spill
+; LSE-NEXT:    subs w9, w9, #1 // =1
+; LSE-NEXT:    str w8, [sp, #28] // 4-byte Folded Spill
+; LSE-NEXT:    b.ne .LBB5_1
+; LSE-NEXT:    b .LBB5_2
+; LSE-NEXT:  .LBB5_2: // %atomicrmw.end
+; LSE-NEXT:    ldr w0, [sp, #12] // 4-byte Folded Reload
+; LSE-NEXT:    add sp, sp, #32 // =32
+; LSE-NEXT:    ret
+entry:
+  %res = atomicrmw nand i8* %dst, i8 1 seq_cst
+  ret i8 %res
+}
+
+define i16 @test_rmw_nand_16(i16* %dst)   {
+; NOLSE-LABEL: test_rmw_nand_16:
+; NOLSE:       // %bb.0: // %entry
+; NOLSE-NEXT:    sub sp, sp, #32 // =32
+; NOLSE-NEXT:    .cfi_def_cfa_offset 32
+; NOLSE-NEXT:    str x0, [sp, #16] // 8-byte Folded Spill
+; NOLSE-NEXT:    ldrh w8, [x0]
+; NOLSE-NEXT:    str w8, [sp, #28] // 4-byte Folded Spill
+; NOLSE-NEXT:    b .LBB6_1
+; NOLSE-NEXT:  .LBB6_1: // %atomicrmw.start
+; NOLSE-NEXT:    // =>This Loop Header: Depth=1
+; NOLSE-NEXT:    // Child Loop BB6_2 Depth 2
+; NOLSE-NEXT:    ldr w9, [sp, #28] // 4-byte Folded Reload
+; NOLSE-NEXT:    ldr x11, [sp, #16] // 8-byte Folded Reload
+; NOLSE-NEXT:    mvn w8, w9
+; NOLSE-NEXT:    orr w12, w8, #0xfffffffe
+; NOLSE-NEXT:  .LBB6_2: // %atomicrmw.start
+; NOLSE-NEXT:    // Parent Loop BB6_1 Depth=1
+; NOLSE-NEXT:    // => This Inner Loop Header: Depth=2
+; NOLSE-NEXT:    ldaxrh w8, [x11]
+; NOLSE-NEXT:    cmp w8, w9, uxth
+; NOLSE-NEXT:    b.ne .LBB6_4
+; NOLSE-NEXT:  // %bb.3: // %atomicrmw.start
+; NOLSE-NEXT:    // in Loop: Header=BB6_2 Depth=2
+; NOLSE-NEXT:    stlxrh w10, w12, [x11]
+; NOLSE-NEXT:    cbnz w10, .LBB6_2
+; NOLSE-NEXT:  .LBB6_4: // %atomicrmw.start
+; NOLSE-NEXT:    // in Loop: Header=BB6_1 Depth=1
+; NOLSE-NEXT:    subs w9, w8, w9, uxth
+; NOLSE-NEXT:    cset w9, eq
+; NOLSE-NEXT:    str w8, [sp, #12] // 4-byte Folded Spill
+; NOLSE-NEXT:    subs w9, w9, #1 // =1
+; NOLSE-NEXT:    str w8, [sp, #28] // 4-byte Folded Spill
+; NOLSE-NEXT:    b.ne .LBB6_1
+; NOLSE-NEXT:    b .LBB6_5
+; NOLSE-NEXT:  .LBB6_5: // %atomicrmw.end
+; NOLSE-NEXT:    ldr w0, [sp, #12] // 4-byte Folded Reload
+; NOLSE-NEXT:    add sp, sp, #32 // =32
+; NOLSE-NEXT:    ret
+;
+; LSE-LABEL: test_rmw_nand_16:
+; LSE:       // %bb.0: // %entry
+; LSE-NEXT:    sub sp, sp, #32 // =32
+; LSE-NEXT:    .cfi_def_cfa_offset 32
+; LSE-NEXT:    str x0, [sp, #16] // 8-byte Folded Spill
+; LSE-NEXT:    ldrh w8, [x0]
+; LSE-NEXT:    str w8, [sp, #28] // 4-byte Folded Spill
+; LSE-NEXT:    b .LBB6_1
+; LSE-NEXT:  .LBB6_1: // %atomicrmw.start
+; LSE-NEXT:    // =>This Inner Loop Header: Depth=1
+; LSE-NEXT:    ldr w9, [sp, #28] // 4-byte Folded Reload
+; LSE-NEXT:    ldr x11, [sp, #16] // 8-byte Folded Reload
+; LSE-NEXT:    mvn w8, w9
+; LSE-NEXT:    orr w10, w8, #0xfffffffe
+; LSE-NEXT:    mov w8, w9
+; LSE-NEXT:    casalh w8, w10, [x11]
+; LSE-NEXT:    str w8, [sp, #8] // 4-byte Folded Spill
+; LSE-NEXT:    subs w9, w8, w9, uxth
+; LSE-NEXT:    cset w9, eq
+; LSE-NEXT:    str w8, [sp, #12] // 4-byte Folded Spill
+; LSE-NEXT:    subs w9, w9, #1 // =1
+; LSE-NEXT:    str w8, [sp, #28] // 4-byte Folded Spill
+; LSE-NEXT:    b.ne .LBB6_1
+; LSE-NEXT:    b .LBB6_2
+; LSE-NEXT:  .LBB6_2: // %atomicrmw.end
+; LSE-NEXT:    ldr w0, [sp, #12] // 4-byte Folded Reload
+; LSE-NEXT:    add sp, sp, #32 // =32
+; LSE-NEXT:    ret
+entry:
+  %res = atomicrmw nand i16* %dst, i16 1 seq_cst
+  ret i16 %res
+}
+
+define i32 @test_rmw_nand_32(i32* %dst)   {
+; NOLSE-LABEL: test_rmw_nand_32:
+; NOLSE:       // %bb.0: // %entry
+; NOLSE-NEXT:    sub sp, sp, #32 // =32
+; NOLSE-NEXT:    .cfi_def_cfa_offset 32
+; NOLSE-NEXT:    str x0, [sp, #16] // 8-byte Folded Spill
+; NOLSE-NEXT:    ldr w8, [x0]
+; NOLSE-NEXT:    str w8, [sp, #28] // 4-byte Folded Spill
+; NOLSE-NEXT:    b .LBB7_1
+; NOLSE-NEXT:  .LBB7_1: // %atomicrmw.start
+; NOLSE-NEXT:    // =>This Loop Header: Depth=1
+; NOLSE-NEXT:    // Child Loop BB7_2 Depth 2
+; NOLSE-NEXT:    ldr w9, [sp, #28] // 4-byte Folded Reload
+; NOLSE-NEXT:    ldr x11, [sp, #16] // 8-byte Folded Reload
+; NOLSE-NEXT:    mvn w8, w9
+; NOLSE-NEXT:    orr w12, w8, #0xfffffffe
+; NOLSE-NEXT:  .LBB7_2: // %atomicrmw.start
+; NOLSE-NEXT:    // Parent Loop BB7_1 Depth=1
+; NOLSE-NEXT:    // => This Inner Loop Header: Depth=2
+; NOLSE-NEXT:    ldaxr w8, [x11]
+; NOLSE-NEXT:    cmp w8, w9
+; NOLSE-NEXT:    b.ne .LBB7_4
+; NOLSE-NEXT:  // %bb.3: // %atomicrmw.start
+; NOLSE-NEXT:    // in Loop: Header=BB7_2 Depth=2
+; NOLSE-NEXT:    stlxr w10, w12, [x11]
+; NOLSE-NEXT:    cbnz w10, .LBB7_2
+; NOLSE-NEXT:  .LBB7_4: // %atomicrmw.start
+; NOLSE-NEXT:    // in Loop: Header=BB7_1 Depth=1
+; NOLSE-NEXT:    subs w9, w8, w9
+; NOLSE-NEXT:    cset w9, eq
+; NOLSE-NEXT:    str w8, [sp, #12] // 4-byte Folded Spill
+; NOLSE-NEXT:    subs w9, w9, #1 // =1
+; NOLSE-NEXT:    str w8, [sp, #28] // 4-byte Folded Spill
+; NOLSE-NEXT:    b.ne .LBB7_1
+; NOLSE-NEXT:    b .LBB7_5
+; NOLSE-NEXT:  .LBB7_5: // %atomicrmw.end
+; NOLSE-NEXT:    ldr w0, [sp, #12] // 4-byte Folded Reload
+; NOLSE-NEXT:    add sp, sp, #32 // =32
+; NOLSE-NEXT:    ret
+;
+; LSE-LABEL: test_rmw_nand_32:
+; LSE:       // %bb.0: // %entry
+; LSE-NEXT:    sub sp, sp, #32 // =32
+; LSE-NEXT:    .cfi_def_cfa_offset 32
+; LSE-NEXT:    str x0, [sp, #16] // 8-byte Folded Spill
+; LSE-NEXT:    ldr w8, [x0]
+; LSE-NEXT:    str w8, [sp, #28] // 4-byte Folded Spill
+; LSE-NEXT:    b .LBB7_1
+; LSE-NEXT:  .LBB7_1: // %atomicrmw.start
+; LSE-NEXT:    // =>This Inner Loop Header: Depth=1
+; LSE-NEXT:    ldr w9, [sp, #28] // 4-byte Folded Reload
+; LSE-NEXT:    ldr x11, [sp, #16] // 8-byte Folded Reload
+; LSE-NEXT:    mvn w8, w9
+; LSE-NEXT:    orr w10, w8, #0xfffffffe
+; LSE-NEXT:    mov w8, w9
+; LSE-NEXT:    casal w8, w10, [x11]
+; LSE-NEXT:    str w8, [sp, #8] // 4-byte Folded Spill
+; LSE-NEXT:    subs w9, w8, w9
+; LSE-NEXT:    cset w9, eq
+; LSE-NEXT:    str w8, [sp, #12] // 4-byte Folded Spill
+; LSE-NEXT:    subs w9, w9, #1 // =1
+; LSE-NEXT:    str w8, [sp, #28] // 4-byte Folded Spill
+; LSE-NEXT:    b.ne .LBB7_1
+; LSE-NEXT:    b .LBB7_2
+; LSE-NEXT:  .LBB7_2: // %atomicrmw.end
+; LSE-NEXT:    ldr w0, [sp, #12] // 4-byte Folded Reload
+; LSE-NEXT:    add sp, sp, #32 // =32
+; LSE-NEXT:    ret
+entry:
+  %res = atomicrmw nand i32* %dst, i32 1 seq_cst
+  ret i32 %res
+}
+
+define i64 @test_rmw_nand_64(i64* %dst)   {
+; NOLSE-LABEL: test_rmw_nand_64:
+; NOLSE:       // %bb.0: // %entry
+; NOLSE-NEXT:    sub sp, sp, #32 // =32
+; NOLSE-NEXT:    .cfi_def_cfa_offset 32
+; NOLSE-NEXT:    str x0, [sp, #16] // 8-byte Folded Spill
+; NOLSE-NEXT:    ldr x8, [x0]
+; NOLSE-NEXT:    str x8, [sp, #24] // 8-byte Folded Spill
+; NOLSE-NEXT:    b .LBB8_1
+; NOLSE-NEXT:  .LBB8_1: // %atomicrmw.start
+; NOLSE-NEXT:    // =>This Loop Header: Depth=1
+; NOLSE-NEXT:    // Child Loop BB8_2 Depth 2
+; NOLSE-NEXT:    ldr x9, [sp, #24] // 8-byte Folded Reload
+; NOLSE-NEXT:    ldr x11, [sp, #16] // 8-byte Folded Reload
+; NOLSE-NEXT:    mov w8, w9
+; NOLSE-NEXT:    mvn w10, w8
+; NOLSE-NEXT:    // implicit-def: $x8
+; NOLSE-NEXT:    mov w8, w10
+; NOLSE-NEXT:    orr x12, x8, #0xfffffffffffffffe
+; NOLSE-NEXT:  .LBB8_2: // %atomicrmw.start
+; NOLSE-NEXT:    // Parent Loop BB8_1 Depth=1
+; NOLSE-NEXT:    // => This Inner Loop Header: Depth=2
+; NOLSE-NEXT:    ldaxr x8, [x11]
+; NOLSE-NEXT:    cmp x8, x9
+; NOLSE-NEXT:    b.ne .LBB8_4
+; NOLSE-NEXT:  // %bb.3: // %atomicrmw.start
+; NOLSE-NEXT:    // in Loop: Header=BB8_2 Depth=2
+; NOLSE-NEXT:    stlxr w10, x12, [x11]
+; NOLSE-NEXT:    cbnz w10, .LBB8_2
+; NOLSE-NEXT:  .LBB8_4: // %atomicrmw.start
+; NOLSE-NEXT:    // in Loop: Header=BB8_1 Depth=1
+; NOLSE-NEXT:    subs x9, x8, x9
+; NOLSE-NEXT:    cset w9, eq
+; NOLSE-NEXT:    str x8, [sp, #8] // 8-byte Folded Spill
+; NOLSE-NEXT:    subs w9, w9, #1 // =1
+; NOLSE-NEXT:    str x8, [sp, #24] // 8-byte Folded Spill
+; NOLSE-NEXT:    b.ne .LBB8_1
+; NOLSE-NEXT:    b .LBB8_5
+; NOLSE-NEXT:  .LBB8_5: // %atomicrmw.end
+; NOLSE-NEXT:    ldr x0, [sp, #8] // 8-byte Folded Reload
+; NOLSE-NEXT:    add sp, sp, #32 // =32
+; NOLSE-NEXT:    ret
+;
+; LSE-LABEL: test_rmw_nand_64:
+; LSE:       // %bb.0: // %entry
+; LSE-NEXT:    sub sp, sp, #32 // =32
+; LSE-NEXT:    .cfi_def_cfa_offset 32
+; LSE-NEXT:    str x0, [sp, #16] // 8-byte Folded Spill
+; LSE-NEXT:    ldr x8, [x0]
+; LSE-NEXT:    str x8, [sp, #24] // 8-byte Folded Spill
+; LSE-NEXT:    b .LBB8_1
+; LSE-NEXT:  .LBB8_1: // %atomicrmw.start
+; LSE-NEXT:    // =>This Inner Loop Header: Depth=1
+; LSE-NEXT:    ldr x9, [sp, #24] // 8-byte Folded Reload
+; LSE-NEXT:    ldr x11, [sp, #16] // 8-byte Folded Reload
+; LSE-NEXT:    mov w8, w9
+; LSE-NEXT:    mvn w10, w8
+; LSE-NEXT:    // implicit-def: $x8
+; LSE-NEXT:    mov w8, w10
+; LSE-NEXT:    orr x10, x8, #0xfffffffffffffffe
+; LSE-NEXT:    mov x8, x9
+; LSE-NEXT:    casal x8, x10, [x11]
+; LSE-NEXT:    str x8, [sp] // 8-byte Folded Spill
+; LSE-NEXT:    subs x9, x8, x9
+; LSE-NEXT:    cset w9, eq
+; LSE-NEXT:    str x8, [sp, #8] // 8-byte Folded Spill
+; LSE-NEXT:    subs w9, w9, #1 // =1
+; LSE-NEXT:    str x8, [sp, #24] // 8-byte Folded Spill
+; LSE-NEXT:    b.ne .LBB8_1
+; LSE-NEXT:    b .LBB8_2
+; LSE-NEXT:  .LBB8_2: // %atomicrmw.end
+; LSE-NEXT:    ldr x0, [sp, #8] // 8-byte Folded Reload
+; LSE-NEXT:    add sp, sp, #32 // =32
+; LSE-NEXT:    ret
+entry:
+  %res = atomicrmw nand i64* %dst, i64 1 seq_cst
+  ret i64 %res
+}
+
+define i128 @test_rmw_nand_128(i128* %dst)   {
+; NOLSE-LABEL: test_rmw_nand_128:
+; NOLSE:       // %bb.0: // %entry
+; NOLSE-NEXT:    sub sp, sp, #48 // =48
+; NOLSE-NEXT:    .cfi_def_cfa_offset 48
+; NOLSE-NEXT:    str x0, [sp, #24] // 8-byte Folded Spill
+; NOLSE-NEXT:    ldr x8, [x0, #8]
+; NOLSE-NEXT:    ldr x9, [x0]
+; NOLSE-NEXT:    str x9, [sp, #32] // 8-byte Folded Spill
+; NOLSE-NEXT:    str x8, [sp, #40] // 8-byte Folded Spill
+; NOLSE-NEXT:    b .LBB9_1
+; NOLSE-NEXT:  .LBB9_1: // %atomicrmw.start
+; NOLSE-NEXT:    // =>This Loop Header: Depth=1
+; NOLSE-NEXT:    // Child Loop BB9_2 Depth 2
+; NOLSE-NEXT:    ldr x11, [sp, #40] // 8-byte Folded Reload
+; NOLSE-NEXT:    ldr x8, [sp, #32] // 8-byte Folded Reload
+; NOLSE-NEXT:    ldr x13, [sp, #24] // 8-byte Folded Reload
+; NOLSE-NEXT:    mov w9, w8
+; NOLSE-NEXT:    mvn w10, w9
+; NOLSE-NEXT:    // implicit-def: $x9
+; NOLSE-NEXT:    mov w9, w10
+; NOLSE-NEXT:    orr x14, x9, #0xfffffffffffffffe
+; NOLSE-NEXT:    mov x15, #-1
+; NOLSE-NEXT:  .LBB9_2: // %atomicrmw.start
+; NOLSE-NEXT:    // Parent Loop BB9_1 Depth=1
+; NOLSE-NEXT:    // => This Inner Loop Header: Depth=2
+; NOLSE-NEXT:    ldaxp x10, x9, [x13]
+; NOLSE-NEXT:    cmp x10, x8
+; NOLSE-NEXT:    cset w12, ne
+; NOLSE-NEXT:    cmp x9, x11
+; NOLSE-NEXT:    cinc w12, w12, ne
+; NOLSE-NEXT:    cbnz w12, .LBB9_4
+; NOLSE-NEXT:  // %bb.3: // %atomicrmw.start
+; NOLSE-NEXT:    // in Loop: Header=BB9_2 Depth=2
+; NOLSE-NEXT:    stlxp w12, x14, x15, [x13]
+; NOLSE-NEXT:    cbnz w12, .LBB9_2
+; NOLSE-NEXT:  .LBB9_4: // %atomicrmw.start
+; NOLSE-NEXT:    // in Loop: Header=BB9_1 Depth=1
+; NOLSE-NEXT:    eor x11, x9, x11
+; NOLSE-NEXT:    eor x8, x10, x8
+; NOLSE-NEXT:    orr x8, x8, x11
+; NOLSE-NEXT:    str x9, [sp, #8] // 8-byte Folded Spill
+; NOLSE-NEXT:    str x10, [sp, #16] // 8-byte Folded Spill
+; NOLSE-NEXT:    str x10, [sp, #32] // 8-byte Folded Spill
+; NOLSE-NEXT:    str x9, [sp, #40] // 8-byte Folded Spill
+; NOLSE-NEXT:    cbnz x8, .LBB9_1
+; NOLSE-NEXT:    b .LBB9_5
+; NOLSE-NEXT:  .LBB9_5: // %atomicrmw.end
+; NOLSE-NEXT:    ldr x1, [sp, #8] // 8-byte Folded Reload
+; NOLSE-NEXT:    ldr x0, [sp, #16] // 8-byte Folded Reload
+; NOLSE-NEXT:    add sp, sp, #48 // =48
+; NOLSE-NEXT:    ret
+;
+; LSE-LABEL: test_rmw_nand_128:
+; LSE:       // %bb.0: // %entry
+; LSE-NEXT:    sub sp, sp, #80 // =80
+; LSE-NEXT:    .cfi_def_cfa_offset 80
+; LSE-NEXT:    str x0, [sp, #56] // 8-byte Folded Spill
+; LSE-NEXT:    ldr x8, [x0, #8]
+; LSE-NEXT:    ldr x9, [x0]
+; LSE-NEXT:    str x9, [sp, #64] // 8-byte Folded Spill
+; LSE-NEXT:    str x8, [sp, #72] // 8-byte Folded Spill
+; LSE-NEXT:    b .LBB9_1
+; LSE-NEXT:  .LBB9_1: // %atomicrmw.start
+; LSE-NEXT:    // =>This Inner Loop Header: Depth=1
+; LSE-NEXT:    ldr x10, [sp, #72] // 8-byte Folded Reload
+; LSE-NEXT:    ldr x8, [sp, #64] // 8-byte Folded Reload
+; LSE-NEXT:    ldr x9, [sp, #56] // 8-byte Folded Reload
+; LSE-NEXT:    mov x0, x8
+; LSE-NEXT:    mov x1, x10
+; LSE-NEXT:    stp x0, x1, [sp, #8] // 16-byte Folded Spill
+; LSE-NEXT:    mov w11, w8
+; LSE-NEXT:    mvn w12, w11
+; LSE-NEXT:    // implicit-def: $x11
+; LSE-NEXT:    mov w11, w12
+; LSE-NEXT:    orr x2, x11, #0xfffffffffffffffe
+; LSE-NEXT:    mov x11, #-1
+; LSE-NEXT:    // kill: def $x2 killed $x2 def $x2_x3
+; LSE-NEXT:    mov x3, x11
+; LSE-NEXT:    caspal x0, x1, x2, x3, [x9]
+; LSE-NEXT:    stp x0, x1, [sp, #24] // 16-byte Folded Spill
+; LSE-NEXT:    mov x9, x1
+; LSE-NEXT:    str x9, [sp, #40] // 8-byte Folded Spill
+; LSE-NEXT:    eor x11, x9, x10
+; LSE-NEXT:    mov x10, x0
+; LSE-NEXT:    str x10, [sp, #48] // 8-byte Folded Spill
+; LSE-NEXT:    eor x8, x10, x8
+; LSE-NEXT:    orr x8, x8, x11
+; LSE-NEXT:    str x10, [sp, #64] // 8-byte Folded Spill
+; LSE-NEXT:    str x9, [sp, #72] // 8-byte Folded Spill
+; LSE-NEXT:    cbnz x8, .LBB9_1
+; LSE-NEXT:    b .LBB9_2
+; LSE-NEXT:  .LBB9_2: // %atomicrmw.end
+; LSE-NEXT:    ldr x1, [sp, #40] // 8-byte Folded Reload
+; LSE-NEXT:    ldr x0, [sp, #48] // 8-byte Folded Reload
+; LSE-NEXT:    add sp, sp, #80 // =80
+; LSE-NEXT:    ret
+entry:
+  %res = atomicrmw nand i128* %dst, i128 1 seq_cst
+  ret i128 %res
+}
diff --git a/llvm/test/Transforms/AtomicExpand/AArch64/expand-atomicrmw-xchg-fp.ll b/llvm/test/Transforms/AtomicExpand/AArch64/expand-atomicrmw-xchg-fp.ll
index ece757f215a0..86b4cff46980 100644
--- a/llvm/test/Transforms/AtomicExpand/AArch64/expand-atomicrmw-xchg-fp.ll
+++ b/llvm/test/Transforms/AtomicExpand/AArch64/expand-atomicrmw-xchg-fp.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -mtriple=aarch64-- -atomic-expand %s | FileCheck %s
-; RUN: opt -S -mtriple=aarch64-- -mattr=+outline-atomics -atomic-expand %s | FileCheck %s --check-prefix=OUTLINE-ATOMICS
+; RUN: opt -O1 -S -mtriple=aarch64-- -atomic-expand %s | FileCheck %s
+; RUN: opt -O1 -S -mtriple=aarch64-- -mattr=+outline-atomics -atomic-expand %s | FileCheck %s --check-prefix=OUTLINE-ATOMICS
 
 define void @atomic_swap_f16(half* %ptr, half %val) nounwind {
 ; CHECK-LABEL: @atomic_swap_f16(

From 84e8b1cf07b9845ea8e1e07ed5ccc3c5a70d975b Mon Sep 17 00:00:00 2001
From: Sam McCall <sam.mccall@gmail.com>
Date: Thu, 15 Apr 2021 12:52:58 +0200
Subject: [PATCH 267/318] [clangd] Only allow remote index to be enabled from
 user config.

Differential Revision: https://reviews.llvm.org/D100542

(cherry picked from commit ecf93a716c9ecf2e38898547df90323e239a623c)
---
 clang-tools-extra/clangd/ConfigCompile.cpp    | 13 +++++++--
 clang-tools-extra/clangd/ConfigFragment.h     |  3 +++
 clang-tools-extra/clangd/ConfigProvider.cpp   | 27 +++++++++++--------
 clang-tools-extra/clangd/ConfigProvider.h     |  6 +++--
 clang-tools-extra/clangd/tool/ClangdMain.cpp  |  4 +--
 .../clangd/unittests/ConfigCompileTests.cpp   | 14 ++++++++++
 6 files changed, 50 insertions(+), 17 deletions(-)

diff --git a/clang-tools-extra/clangd/ConfigCompile.cpp b/clang-tools-extra/clangd/ConfigCompile.cpp
index b4f0d6186886..7d5466778a81 100644
--- a/clang-tools-extra/clangd/ConfigCompile.cpp
+++ b/clang-tools-extra/clangd/ConfigCompile.cpp
@@ -101,6 +101,7 @@ struct FragmentCompiler {
   llvm::SourceMgr *SourceMgr;
   // Normalized Fragment::SourceInfo::Directory.
   std::string FragmentDirectory;
+  bool Trusted = false;
 
   llvm::Optional<llvm::Regex>
   compileRegex(const Located<std::string> &Text,
@@ -183,6 +184,7 @@ struct FragmentCompiler {
   }
 
   void compile(Fragment &&F) {
+    Trusted = F.Source.Trusted;
     if (!F.Source.Directory.empty()) {
       FragmentDirectory = llvm::sys::path::convert_to_slash(F.Source.Directory);
       if (FragmentDirectory.back() != '/')
@@ -319,6 +321,13 @@ struct FragmentCompiler {
 
   void compile(Fragment::IndexBlock::ExternalBlock &&External,
                llvm::SMRange BlockRange) {
+    if (External.Server && !Trusted) {
+      diag(Error,
+           "Remote index may not be specified by untrusted configuration. "
+           "Copy this into user config to use it.",
+           External.Server->Range);
+      return;
+    }
 #ifndef CLANGD_ENABLE_REMOTE
     if (External.Server) {
       elog("Clangd isn't compiled with remote index support, ignoring Server: "
@@ -489,8 +498,8 @@ CompiledFragment Fragment::compile(DiagnosticCallback D) && {
   trace::Span Tracer("ConfigCompile");
   SPAN_ATTACH(Tracer, "ConfigFile", ConfigFile);
   auto Result = std::make_shared<CompiledFragmentImpl>();
-  vlog("Config fragment: compiling {0}:{1} -> {2}", ConfigFile, LineCol.first,
-       Result.get());
+  vlog("Config fragment: compiling {0}:{1} -> {2} (trusted={3})", ConfigFile,
+       LineCol.first, Result.get(), Source.Trusted);
 
   FragmentCompiler{*Result, D, Source.Manager.get()}.compile(std::move(*this));
   // Return as cheaply-copyable wrapper.
diff --git a/clang-tools-extra/clangd/ConfigFragment.h b/clang-tools-extra/clangd/ConfigFragment.h
index c36b07f5e8e2..c98ca3a2dd52 100644
--- a/clang-tools-extra/clangd/ConfigFragment.h
+++ b/clang-tools-extra/clangd/ConfigFragment.h
@@ -94,6 +94,9 @@ struct Fragment {
     /// Absolute path to directory the fragment is associated with. Relative
     /// paths mentioned in the fragment are resolved against this.
     std::string Directory;
+    /// Whether this fragment is allowed to make critical security/privacy
+    /// decisions.
+    bool Trusted = false;
   };
   SourceInfo Source;
 
diff --git a/clang-tools-extra/clangd/ConfigProvider.cpp b/clang-tools-extra/clangd/ConfigProvider.cpp
index 05b2ba50566d..6dfb00b14fc6 100644
--- a/clang-tools-extra/clangd/ConfigProvider.cpp
+++ b/clang-tools-extra/clangd/ConfigProvider.cpp
@@ -34,7 +34,7 @@ class FileConfigCache : public FileCache {
       : FileCache(Path), Directory(Directory) {}
 
   void get(const ThreadsafeFS &TFS, DiagnosticCallback DC,
-           std::chrono::steady_clock::time_point FreshTime,
+           std::chrono::steady_clock::time_point FreshTime, bool Trusted,
            std::vector<CompiledFragment> &Out) const {
     read(
         TFS, FreshTime,
@@ -43,6 +43,7 @@ class FileConfigCache : public FileCache {
           if (Data)
             for (auto &Fragment : Fragment::parseYAML(*Data, path(), DC)) {
               Fragment.Source.Directory = Directory;
+              Fragment.Source.Trusted = Trusted;
               CachedValue.push_back(std::move(Fragment).compile(DC));
             }
         },
@@ -52,35 +53,38 @@ class FileConfigCache : public FileCache {
 
 std::unique_ptr<Provider> Provider::fromYAMLFile(llvm::StringRef AbsPath,
                                                  llvm::StringRef Directory,
-                                                 const ThreadsafeFS &FS) {
+                                                 const ThreadsafeFS &FS,
+                                                 bool Trusted) {
   class AbsFileProvider : public Provider {
     mutable FileConfigCache Cache; // threadsafe
     const ThreadsafeFS &FS;
+    bool Trusted;
 
     std::vector<CompiledFragment>
     getFragments(const Params &P, DiagnosticCallback DC) const override {
       std::vector<CompiledFragment> Result;
-      Cache.get(FS, DC, P.FreshTime, Result);
+      Cache.get(FS, DC, P.FreshTime, Trusted, Result);
       return Result;
     };
 
   public:
     AbsFileProvider(llvm::StringRef Path, llvm::StringRef Directory,
-                    const ThreadsafeFS &FS)
-        : Cache(Path, Directory), FS(FS) {
+                    const ThreadsafeFS &FS, bool Trusted)
+        : Cache(Path, Directory), FS(FS), Trusted(Trusted) {
       assert(llvm::sys::path::is_absolute(Path));
     }
   };
 
-  return std::make_unique<AbsFileProvider>(AbsPath, Directory, FS);
+  return std::make_unique<AbsFileProvider>(AbsPath, Directory, FS, Trusted);
 }
 
 std::unique_ptr<Provider>
 Provider::fromAncestorRelativeYAMLFiles(llvm::StringRef RelPath,
-                                        const ThreadsafeFS &FS) {
+                                        const ThreadsafeFS &FS, bool Trusted) {
   class RelFileProvider : public Provider {
     std::string RelPath;
     const ThreadsafeFS &FS;
+    bool Trusted;
 
     mutable std::mutex Mu;
     // Keys are the (posix-style) ancestor directory, not the config within it.
@@ -128,18 +132,19 @@ Provider::fromAncestorRelativeYAMLFiles(llvm::StringRef RelPath,
       // This will take a (per-file) lock for each file that actually exists.
       std::vector<CompiledFragment> Result;
       for (FileConfigCache *Cache : Caches)
-        Cache->get(FS, DC, P.FreshTime, Result);
+        Cache->get(FS, DC, P.FreshTime, Trusted, Result);
       return Result;
     };
 
   public:
-    RelFileProvider(llvm::StringRef RelPath, const ThreadsafeFS &FS)
-        : RelPath(RelPath), FS(FS) {
+    RelFileProvider(llvm::StringRef RelPath, const ThreadsafeFS &FS,
+                    bool Trusted)
+        : RelPath(RelPath), FS(FS), Trusted(Trusted) {
       assert(llvm::sys::path::is_relative(RelPath));
     }
   };
 
-  return std::make_unique<RelFileProvider>(RelPath, FS);
+  return std::make_unique<RelFileProvider>(RelPath, FS, Trusted);
 }
 
 std::unique_ptr<Provider>
diff --git a/clang-tools-extra/clangd/ConfigProvider.h b/clang-tools-extra/clangd/ConfigProvider.h
index 25d9450f28a7..428438b67f14 100644
--- a/clang-tools-extra/clangd/ConfigProvider.h
+++ b/clang-tools-extra/clangd/ConfigProvider.h
@@ -69,7 +69,8 @@ class Provider {
   /// Directory will be used to resolve relative paths in the fragments.
   static std::unique_ptr<Provider> fromYAMLFile(llvm::StringRef AbsPath,
                                                 llvm::StringRef Directory,
-                                                const ThreadsafeFS &);
+                                                const ThreadsafeFS &,
+                                                bool Trusted = false);
   // Reads fragments from YAML files found relative to ancestors of Params.Path.
   //
   // All fragments that exist are returned, starting from distant ancestors.
@@ -78,7 +79,8 @@ class Provider {
   //
   // If Params does not specify a path, no fragments are returned.
   static std::unique_ptr<Provider>
-  fromAncestorRelativeYAMLFiles(llvm::StringRef RelPath, const ThreadsafeFS &);
+  fromAncestorRelativeYAMLFiles(llvm::StringRef RelPath, const ThreadsafeFS &,
+                                bool Trusted = false);
 
   /// A provider that includes fragments from all the supplied providers.
   /// Order is preserved; later providers take precedence over earlier ones.
diff --git a/clang-tools-extra/clangd/tool/ClangdMain.cpp b/clang-tools-extra/clangd/tool/ClangdMain.cpp
index fe69079bfe67..99c3d97ce35d 100644
--- a/clang-tools-extra/clangd/tool/ClangdMain.cpp
+++ b/clang-tools-extra/clangd/tool/ClangdMain.cpp
@@ -831,8 +831,8 @@ clangd accepts flags on the commandline, and in the CLANGD_FLAGS environment var
     if (llvm::sys::path::user_config_directory(UserConfig)) {
       llvm::sys::path::append(UserConfig, "clangd", "config.yaml");
       vlog("User config file is {0}", UserConfig);
-      ProviderStack.push_back(
-          config::Provider::fromYAMLFile(UserConfig, /*Directory=*/"", TFS));
+      ProviderStack.push_back(config::Provider::fromYAMLFile(
+          UserConfig, /*Directory=*/"", TFS, /*Trusted=*/true));
     } else {
       elog("Couldn't determine user config file, not loading");
     }
diff --git a/clang-tools-extra/clangd/unittests/ConfigCompileTests.cpp b/clang-tools-extra/clangd/unittests/ConfigCompileTests.cpp
index 4961d3474fd9..a3738681bec5 100644
--- a/clang-tools-extra/clangd/unittests/ConfigCompileTests.cpp
+++ b/clang-tools-extra/clangd/unittests/ConfigCompileTests.cpp
@@ -318,7 +318,21 @@ TEST_F(ConfigCompileTests, TidyBadChecks) {
               DiagKind(llvm::SourceMgr::DK_Warning))));
 }
 
+TEST_F(ConfigCompileTests, ExternalServerNeedsTrusted) {
+  Fragment::IndexBlock::ExternalBlock External;
+  External.Server.emplace("xxx");
+  Frag.Index.External = std::move(External);
+  compileAndApply();
+  EXPECT_THAT(
+      Diags.Diagnostics,
+      ElementsAre(DiagMessage(
+          "Remote index may not be specified by untrusted configuration. "
+          "Copy this into user config to use it.")));
+  EXPECT_FALSE(Conf.Index.External.hasValue());
+}
+
 TEST_F(ConfigCompileTests, ExternalBlockWarnOnMultipleSource) {
+  Frag.Source.Trusted = true;
   Fragment::IndexBlock::ExternalBlock External;
   External.File.emplace("");
   External.Server.emplace("");

From 42326932eca6d49fc59f8f560917be16764e087e Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Wed, 9 Jun 2021 08:57:38 -0400
Subject: [PATCH 268/318] [x86] add tests for store merging miscompile
 (PR50623); NFC

(cherry picked from commit 2ef81cb297954cdbc2eca2f204a5ecba4ec1ccd8)
---
 llvm/test/CodeGen/X86/stores-merging.ll | 95 +++++++++++++++++++++++--
 1 file changed, 90 insertions(+), 5 deletions(-)

diff --git a/llvm/test/CodeGen/X86/stores-merging.ll b/llvm/test/CodeGen/X86/stores-merging.ll
index d92342ab7fa8..d4857a6645af 100644
--- a/llvm/test/CodeGen/X86/stores-merging.ll
+++ b/llvm/test/CodeGen/X86/stores-merging.ll
@@ -14,7 +14,7 @@ define dso_local void @redundant_stores_merging() {
 ; CHECK-LABEL: redundant_stores_merging:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movabsq $1958505086977, %rax # imm = 0x1C800000001
-; CHECK-NEXT:    movq %rax, e+{{.*}}(%rip)
+; CHECK-NEXT:    movq %rax, e+4(%rip)
 ; CHECK-NEXT:    retq
   store i32 1, i32* getelementptr inbounds (%structTy, %structTy* @e, i64 0, i32 1), align 4
   store i32 123, i32* getelementptr inbounds (%structTy, %structTy* @e, i64 0, i32 2), align 4
@@ -27,8 +27,8 @@ define dso_local void @redundant_stores_merging_reverse() {
 ; CHECK-LABEL: redundant_stores_merging_reverse:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movabsq $528280977409, %rax # imm = 0x7B00000001
-; CHECK-NEXT:    movq %rax, e+{{.*}}(%rip)
-; CHECK-NEXT:    movl $456, e+{{.*}}(%rip) # imm = 0x1C8
+; CHECK-NEXT:    movq %rax, e+4(%rip)
+; CHECK-NEXT:    movl $456, e+8(%rip) # imm = 0x1C8
 ; CHECK-NEXT:    retq
   store i32 123, i32* getelementptr inbounds (%structTy, %structTy* @e, i64 0, i32 2), align 4
   store i32 456, i32* getelementptr inbounds (%structTy, %structTy* @e, i64 0, i32 2), align 4
@@ -46,8 +46,8 @@ define dso_local void @redundant_stores_merging_reverse() {
 define dso_local void @overlapping_stores_merging() {
 ; CHECK-LABEL: overlapping_stores_merging:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movl $1, {{.*}}(%rip)
-; CHECK-NEXT:    movw $2, b+{{.*}}(%rip)
+; CHECK-NEXT:    movl $1, b(%rip)
+; CHECK-NEXT:    movw $2, b+3(%rip)
 ; CHECK-NEXT:    retq
   store i16 0, i16* bitcast (i8* getelementptr inbounds ([8 x i8], [8 x i8]* @b, i64 0, i64 2) to i16*), align 2
   store i16 2, i16* bitcast (i8* getelementptr inbounds ([8 x i8], [8 x i8]* @b, i64 0, i64 3) to i16*), align 1
@@ -612,3 +612,88 @@ define dso_local void @be_i64_to_i32_order(i64 %x, i32* %p0) {
   store i32 %t0, i32* %p1, align 4
   ret void
 }
+
+; https://llvm.org/PR50623
+; FIXME:
+; It is a miscompile to merge the stores if we are not
+; writing all of the bytes from the source value.
+
+define void @merge_hole(i32 %x, i8* %p) {
+; CHECK-LABEL: merge_hole:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl %edi, (%rsi)
+; CHECK-NEXT:    retq
+  %pcast = bitcast i8* %p to i16*
+  %p2 = getelementptr inbounds i16, i16* %pcast, i64 1
+  %x3 = trunc i32 %x to i8
+  store i8 %x3, i8* %p, align 1
+  %sh = lshr i32 %x, 16
+  %x01 = trunc i32 %sh to i16
+  store i16 %x01, i16* %p2, align 1
+  ret void
+}
+
+; Change the order of the stores.
+; It is a miscompile to merge the stores if we are not
+; writing all of the bytes from the source value.
+
+define void @merge_hole2(i32 %x, i8* %p) {
+; CHECK-LABEL: merge_hole2:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    shrl $16, %eax
+; CHECK-NEXT:    movw %ax, 2(%rsi)
+; CHECK-NEXT:    movb %dil, (%rsi)
+; CHECK-NEXT:    retq
+  %pcast = bitcast i8* %p to i16*
+  %p2 = getelementptr inbounds i16, i16* %pcast, i64 1
+  %sh = lshr i32 %x, 16
+  %x01 = trunc i32 %sh to i16
+  store i16 %x01, i16* %p2, align 1
+  %x3 = trunc i32 %x to i8
+  store i8 %x3, i8* %p, align 1
+  ret void
+}
+
+; Change offset.
+; It is a miscompile to merge the stores if we are not
+; writing all of the bytes from the source value.
+
+define void @merge_hole3(i32 %x, i8* %p) {
+; CHECK-LABEL: merge_hole3:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movb %dil, 1(%rsi)
+; CHECK-NEXT:    shrl $16, %edi
+; CHECK-NEXT:    movw %di, 2(%rsi)
+; CHECK-NEXT:    retq
+  %p1 = getelementptr inbounds i8, i8* %p, i64 1
+  %pcast = bitcast i8* %p to i16*
+  %p2 = getelementptr inbounds i16, i16* %pcast, i64 1
+  %x3 = trunc i32 %x to i8
+  store i8 %x3, i8* %p1, align 1
+  %sh = lshr i32 %x, 16
+  %x01 = trunc i32 %sh to i16
+  store i16 %x01, i16* %p2, align 1
+  ret void
+}
+
+; Change offset.
+; FIXME:
+; It is a miscompile to merge the stores if we are not
+; writing all of the bytes from the source value.
+
+define void @merge_hole4(i32 %x, i8* %p) {
+; CHECK-LABEL: merge_hole4:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    rorl $16, %edi
+; CHECK-NEXT:    movl %edi, (%rsi)
+; CHECK-NEXT:    retq
+  %pcast = bitcast i8* %p to i16*
+  %p2 = getelementptr inbounds i8, i8* %p, i64 2
+  %x3 = trunc i32 %x to i8
+  store i8 %x3, i8* %p2, align 1
+  %sh = lshr i32 %x, 16
+  %x01 = trunc i32 %sh to i16
+  store i16 %x01, i16* %pcast, align 1
+  ret void
+}

From b54ccef144d2753a9742a3c0e75bcf377120fc6c Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Wed, 9 Jun 2021 09:44:58 -0400
Subject: [PATCH 269/318] [SDAG] fix miscompile from merging stores of
 different sizes

As shown in:
https://llvm.org/PR50623
...and the similar tests here, we were not accounting for
store merging of different sizes that do not cover the
entire range of the wide value to be stored.

This is the easy fix: just make sure that all of the
original stores are the same size, so when we calculate
the wide width, it's a simple N * M check.

This still allows all of the motivating optimizations from:
D86420 / 54a5dd485c4d
D87112 / 7a06b166b1af

We could enhance this code to track individual bytes and
allow merging multiple sizes.

(cherry picked from commit dd763ac79196b3d3bc0370b9dbd35e0c083e52a4)
---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 24 ++++++++++++-------
 llvm/test/CodeGen/X86/stores-merging.ll       | 11 +++++----
 2 files changed, 22 insertions(+), 13 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 6a6f83827f72..7f2add81e80d 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -7105,14 +7105,22 @@ SDValue DAGCombiner::mergeTruncStores(StoreSDNode *N) {
   if (LegalOperations)
     return SDValue();
 
-  // Collect all the stores in the chain.
-  SDValue Chain;
-  SmallVector<StoreSDNode *, 8> Stores;
-  for (StoreSDNode *Store = N; Store; Store = dyn_cast<StoreSDNode>(Chain)) {
-    // TODO: Allow unordered atomics when wider type is legal (see D66309)
-    EVT MemVT = Store->getMemoryVT();
-    if (!(MemVT == MVT::i8 || MemVT == MVT::i16 || MemVT == MVT::i32) ||
-        !Store->isSimple() || Store->isIndexed())
+  // We only handle merging simple stores of 1-4 bytes.
+  // TODO: Allow unordered atomics when wider type is legal (see D66309)
+  EVT MemVT = N->getMemoryVT();
+  if (!(MemVT == MVT::i8 || MemVT == MVT::i16 || MemVT == MVT::i32) ||
+      !N->isSimple() || N->isIndexed())
+    return SDValue();
+
+  // Collect all of the stores in the chain.
+  SDValue Chain = N->getChain();
+  SmallVector<StoreSDNode *, 8> Stores = {N};
+  while (auto *Store = dyn_cast<StoreSDNode>(Chain)) {
+    // All stores must be the same size to ensure that we are writing all of the
+    // bytes in the wide value.
+    // TODO: We could allow multiple sizes by tracking each stored byte.
+    if (Store->getMemoryVT() != MemVT || !Store->isSimple() ||
+        Store->isIndexed())
       return SDValue();
     Stores.push_back(Store);
     Chain = Store->getChain();
diff --git a/llvm/test/CodeGen/X86/stores-merging.ll b/llvm/test/CodeGen/X86/stores-merging.ll
index d4857a6645af..f738710ab8f3 100644
--- a/llvm/test/CodeGen/X86/stores-merging.ll
+++ b/llvm/test/CodeGen/X86/stores-merging.ll
@@ -614,14 +614,15 @@ define dso_local void @be_i64_to_i32_order(i64 %x, i32* %p0) {
 }
 
 ; https://llvm.org/PR50623
-; FIXME:
 ; It is a miscompile to merge the stores if we are not
 ; writing all of the bytes from the source value.
 
 define void @merge_hole(i32 %x, i8* %p) {
 ; CHECK-LABEL: merge_hole:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movl %edi, (%rsi)
+; CHECK-NEXT:    movb %dil, (%rsi)
+; CHECK-NEXT:    shrl $16, %edi
+; CHECK-NEXT:    movw %di, 2(%rsi)
 ; CHECK-NEXT:    retq
   %pcast = bitcast i8* %p to i16*
   %p2 = getelementptr inbounds i16, i16* %pcast, i64 1
@@ -678,15 +679,15 @@ define void @merge_hole3(i32 %x, i8* %p) {
 }
 
 ; Change offset.
-; FIXME:
 ; It is a miscompile to merge the stores if we are not
 ; writing all of the bytes from the source value.
 
 define void @merge_hole4(i32 %x, i8* %p) {
 ; CHECK-LABEL: merge_hole4:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    rorl $16, %edi
-; CHECK-NEXT:    movl %edi, (%rsi)
+; CHECK-NEXT:    movb %dil, 2(%rsi)
+; CHECK-NEXT:    shrl $16, %edi
+; CHECK-NEXT:    movw %di, (%rsi)
 ; CHECK-NEXT:    retq
   %pcast = bitcast i8* %p to i16*
   %p2 = getelementptr inbounds i8, i8* %p, i64 2

From 319a27b4211f49f6c8808be2b193208b62fb53c0 Mon Sep 17 00:00:00 2001
From: Ilya Leoshkevich <iii@linux.ibm.com>
Date: Fri, 5 Mar 2021 14:29:57 +0100
Subject: [PATCH 270/318] [BPF] Add support for floats and doubles

Some BPF programs compiled on s390 fail to load, because s390
arch-specific linux headers contain float and double types. At the
moment there is no BTF_KIND for floats and doubles, so the release
version of LLVM ends up emitting type id 0 for them, which the
in-kernel verifier does not accept.

Introduce support for such types to libbpf by representing them using
the new BTF_KIND_FLOAT.

Reviewed By: yonghong-song

Differential Revision: https://reviews.llvm.org/D83289

(cherry picked from commit a7137b238a07d9399d3ae96c0b461571bd5aa8b2)
---
 llvm/lib/Target/BPF/BTF.def         |  1 +
 llvm/lib/Target/BPF/BTFDebug.cpp    | 43 ++++++++++++++++-----
 llvm/lib/Target/BPF/BTFDebug.h      |  9 +++++
 llvm/test/CodeGen/BPF/BTF/double.ll | 58 +++++++++++++++++++++++++++++
 llvm/test/CodeGen/BPF/BTF/float.ll  | 58 +++++++++++++++++++++++++++++
 5 files changed, 160 insertions(+), 9 deletions(-)
 create mode 100644 llvm/test/CodeGen/BPF/BTF/double.ll
 create mode 100644 llvm/test/CodeGen/BPF/BTF/float.ll

diff --git a/llvm/lib/Target/BPF/BTF.def b/llvm/lib/Target/BPF/BTF.def
index 2d2e9a04aa6d..66cf2c90ead4 100644
--- a/llvm/lib/Target/BPF/BTF.def
+++ b/llvm/lib/Target/BPF/BTF.def
@@ -30,5 +30,6 @@ HANDLE_BTF_KIND(12, FUNC)
 HANDLE_BTF_KIND(13, FUNC_PROTO)
 HANDLE_BTF_KIND(14, VAR)
 HANDLE_BTF_KIND(15, DATASEC)
+HANDLE_BTF_KIND(16, FLOAT)
 
 #undef HANDLE_BTF_KIND
diff --git a/llvm/lib/Target/BPF/BTFDebug.cpp b/llvm/lib/Target/BPF/BTFDebug.cpp
index f9bdffe7cbae..da7ec32703a5 100644
--- a/llvm/lib/Target/BPF/BTFDebug.cpp
+++ b/llvm/lib/Target/BPF/BTFDebug.cpp
@@ -371,6 +371,21 @@ void BTFKindDataSec::emitType(MCStreamer &OS) {
   }
 }
 
+BTFTypeFloat::BTFTypeFloat(uint32_t SizeInBits, StringRef TypeName)
+    : Name(TypeName) {
+  Kind = BTF::BTF_KIND_FLOAT;
+  BTFType.Info = Kind << 24;
+  BTFType.Size = roundupToBytes(SizeInBits);
+}
+
+void BTFTypeFloat::completeType(BTFDebug &BDebug) {
+  if (IsCompleted)
+    return;
+  IsCompleted = true;
+
+  BTFType.NameOff = BDebug.addString(Name);
+}
+
 uint32_t BTFStringTable::addString(StringRef S) {
   // Check whether the string already exists.
   for (auto &OffsetM : OffsetToIdMap) {
@@ -409,18 +424,28 @@ uint32_t BTFDebug::addType(std::unique_ptr<BTFTypeBase> TypeEntry) {
 }
 
 void BTFDebug::visitBasicType(const DIBasicType *BTy, uint32_t &TypeId) {
-  // Only int types are supported in BTF.
+  // Only int and binary floating point types are supported in BTF.
   uint32_t Encoding = BTy->getEncoding();
-  if (Encoding != dwarf::DW_ATE_boolean && Encoding != dwarf::DW_ATE_signed &&
-      Encoding != dwarf::DW_ATE_signed_char &&
-      Encoding != dwarf::DW_ATE_unsigned &&
-      Encoding != dwarf::DW_ATE_unsigned_char)
+  std::unique_ptr<BTFTypeBase> TypeEntry;
+  switch (Encoding) {
+  case dwarf::DW_ATE_boolean:
+  case dwarf::DW_ATE_signed:
+  case dwarf::DW_ATE_signed_char:
+  case dwarf::DW_ATE_unsigned:
+  case dwarf::DW_ATE_unsigned_char:
+    // Create a BTF type instance for this DIBasicType and put it into
+    // DIToIdMap for cross-type reference check.
+    TypeEntry = std::make_unique<BTFTypeInt>(
+        Encoding, BTy->getSizeInBits(), BTy->getOffsetInBits(), BTy->getName());
+    break;
+  case dwarf::DW_ATE_float:
+    TypeEntry =
+        std::make_unique<BTFTypeFloat>(BTy->getSizeInBits(), BTy->getName());
+    break;
+  default:
     return;
+  }
 
-  // Create a BTF type instance for this DIBasicType and put it into
-  // DIToIdMap for cross-type reference check.
-  auto TypeEntry = std::make_unique<BTFTypeInt>(
-      Encoding, BTy->getSizeInBits(), BTy->getOffsetInBits(), BTy->getName());
   TypeId = addType(std::move(TypeEntry), BTy);
 }
 
diff --git a/llvm/lib/Target/BPF/BTFDebug.h b/llvm/lib/Target/BPF/BTFDebug.h
index 1bad0d11fee4..fb20ec59574b 100644
--- a/llvm/lib/Target/BPF/BTFDebug.h
+++ b/llvm/lib/Target/BPF/BTFDebug.h
@@ -195,6 +195,15 @@ class BTFKindDataSec : public BTFTypeBase {
   void emitType(MCStreamer &OS) override;
 };
 
+/// Handle binary floating point type.
+class BTFTypeFloat : public BTFTypeBase {
+  StringRef Name;
+
+public:
+  BTFTypeFloat(uint32_t SizeInBits, StringRef TypeName);
+  void completeType(BTFDebug &BDebug) override;
+};
+
 /// String table.
 class BTFStringTable {
   /// String table size in bytes.
diff --git a/llvm/test/CodeGen/BPF/BTF/double.ll b/llvm/test/CodeGen/BPF/BTF/double.ll
new file mode 100644
index 000000000000..655c3710d597
--- /dev/null
+++ b/llvm/test/CodeGen/BPF/BTF/double.ll
@@ -0,0 +1,58 @@
+; RUN: llc -march=bpfel -filetype=asm -o - %s | FileCheck -check-prefixes=CHECK %s
+; RUN: llc -march=bpfeb -filetype=asm -o - %s | FileCheck -check-prefixes=CHECK %s
+
+; Source code:
+;   double a;
+; Compilation flag:
+;   clang -target bpf -O2 -g -S -emit-llvm t.c
+
+@a = dso_local local_unnamed_addr global double 0.000000e+00, align 8, !dbg !0
+
+!llvm.dbg.cu = !{!2}
+!llvm.module.flags = !{!7, !8, !9}
+!llvm.ident = !{!10}
+
+; CHECK:             .section .BTF,"",@progbits
+; CHECK-NEXT:        .short 60319 # 0xeb9f
+; CHECK-NEXT:        .byte 1
+; CHECK-NEXT:        .byte 0
+; CHECK-NEXT:        .long 24
+; CHECK-NEXT:        .long 0
+; CHECK-NEXT:        .long 52
+; CHECK-NEXT:        .long 52
+; CHECK-NEXT:        .long 15
+; [1] double, size=8 bytes (64 bits)
+; CHECK-NEXT:        .long 1 # BTF_KIND_FLOAT(id = 1)
+; CHECK-NEXT:        .long 268435456 # 0x10000000
+; CHECK-NEXT:        .long 8
+; [2] a, type=double (1), global
+; CHECK-NEXT:        .long 8 # BTF_KIND_VAR(id = 2)
+; CHECK-NEXT:        .long 234881024 # 0xe000000
+; CHECK-NEXT:        .long 1
+; CHECK-NEXT:        .long 1
+; [3] .bss, 1 var, {a, offset=&a, size=8 bytes}
+; CHECK-NEXT:        .long 10 # BTF_KIND_DATASEC(id = 3)
+; CHECK-NEXT:        .long 251658241 # 0xf000001
+; CHECK-NEXT:        .long 0
+; CHECK-NEXT:        .long 2
+; CHECK-NEXT:        .long a
+; CHECK-NEXT:        .long 8
+; CHECK-NEXT:        .byte 0 # string offset=0
+; CHECK-NEXT:        .ascii "double" # string offset=1
+; CHECK-NEXT:        .byte 0
+; CHECK-NEXT:        .byte 97 # string offset=8
+; CHECK-NEXT:        .byte 0
+; CHECK-NEXT:        .ascii ".bss" # string offset=10
+; CHECK-NEXT:        .byte 0
+
+!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression())
+!1 = distinct !DIGlobalVariable(name: "a", scope: !2, file: !3, line: 1, type: !6, isLocal: false, isDefinition: true)
+!2 = distinct !DICompileUnit(language: DW_LANG_C99, file: !3, producer: "clang version 11.0.0 ", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !4, globals: !5, splitDebugInlining: false, nameTableKind: None)
+!3 = !DIFile(filename: "t.c", directory: "/home/yhs/tmp")
+!4 = !{}
+!5 = !{!0}
+!6 = !DIBasicType(name: "double", size: 64, encoding: DW_ATE_float)
+!7 = !{i32 7, !"Dwarf Version", i32 4}
+!8 = !{i32 2, !"Debug Info Version", i32 3}
+!9 = !{i32 1, !"wchar_size", i32 4}
+!10 = !{!"clang version 11.0.0 "}
diff --git a/llvm/test/CodeGen/BPF/BTF/float.ll b/llvm/test/CodeGen/BPF/BTF/float.ll
new file mode 100644
index 000000000000..a061263eed7d
--- /dev/null
+++ b/llvm/test/CodeGen/BPF/BTF/float.ll
@@ -0,0 +1,58 @@
+; RUN: llc -march=bpfel -filetype=asm -o - %s | FileCheck -check-prefixes=CHECK %s
+; RUN: llc -march=bpfeb -filetype=asm -o - %s | FileCheck -check-prefixes=CHECK %s
+
+; Source code:
+;   float a;
+; Compilation flag:
+;   clang -target bpf -O2 -g -S -emit-llvm t.c
+
+@a = dso_local local_unnamed_addr global float 0.000000e+00, align 4, !dbg !0
+
+!llvm.dbg.cu = !{!2}
+!llvm.module.flags = !{!7, !8, !9}
+!llvm.ident = !{!10}
+
+; CHECK:             .section .BTF,"",@progbits
+; CHECK-NEXT:        .short 60319 # 0xeb9f
+; CHECK-NEXT:        .byte 1
+; CHECK-NEXT:        .byte 0
+; CHECK-NEXT:        .long 24
+; CHECK-NEXT:        .long 0
+; CHECK-NEXT:        .long 52
+; CHECK-NEXT:        .long 52
+; CHECK-NEXT:        .long 14
+; [1] float, size=4 bytes (32 bits)
+; CHECK-NEXT:        .long 1 # BTF_KIND_FLOAT(id = 1)
+; CHECK-NEXT:        .long 268435456 # 0x10000000
+; CHECK-NEXT:        .long 4
+; [2] a, type=float (1), global
+; CHECK-NEXT:        .long 7 # BTF_KIND_VAR(id = 2)
+; CHECK-NEXT:        .long 234881024 # 0xe000000
+; CHECK-NEXT:        .long 1
+; CHECK-NEXT:        .long 1
+; [3] .bss, 1 var, {a, offset=&a, size=4 bytes}
+; CHECK-NEXT:        .long 9 # BTF_KIND_DATASEC(id = 3)
+; CHECK-NEXT:        .long 251658241 # 0xf000001
+; CHECK-NEXT:        .long 0
+; CHECK-NEXT:        .long 2
+; CHECK-NEXT:        .long a
+; CHECK-NEXT:        .long 4
+; CHECK-NEXT:        .byte 0 # string offset=0
+; CHECK-NEXT:        .ascii "float" # string offset=1
+; CHECK-NEXT:        .byte 0
+; CHECK-NEXT:        .byte 97 # string offset=7
+; CHECK-NEXT:        .byte 0
+; CHECK-NEXT:        .ascii ".bss" # string offset=9
+; CHECK-NEXT:        .byte 0
+
+!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression())
+!1 = distinct !DIGlobalVariable(name: "a", scope: !2, file: !3, line: 1, type: !6, isLocal: false, isDefinition: true)
+!2 = distinct !DICompileUnit(language: DW_LANG_C99, file: !3, producer: "clang version 11.0.0 ", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !4, globals: !5, splitDebugInlining: false, nameTableKind: None)
+!3 = !DIFile(filename: "t.c", directory: "/home/yhs/tmp")
+!4 = !{}
+!5 = !{!0}
+!6 = !DIBasicType(name: "float", size: 32, encoding: DW_ATE_float)
+!7 = !{i32 7, !"Dwarf Version", i32 4}
+!8 = !{i32 2, !"Debug Info Version", i32 3}
+!9 = !{i32 1, !"wchar_size", i32 4}
+!10 = !{!"clang version 11.0.0 "}

From 7f6ceec93541633993b08cc97703c0771c1977c8 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Thu, 25 Mar 2021 14:09:19 -0700
Subject: [PATCH 271/318] BPF: add extern func to data sections if specified

This permits extern function (BTF_KIND_FUNC) be added
to BTF_KIND_DATASEC if a section name is specified.
For example,

-bash-4.4$ cat t.c
void foo(int) __attribute__((section(".kernel.funcs")));
int test(void) {
  foo(5);
  return 0;
}

The extern function foo (BTF_KIND_FUNC) will be put into
BTF_KIND_DATASEC with name ".kernel.funcs".

This will help to differentiate two kinds of external functions,
functions in kernel and functions defined in other bpf programs.

Differential Revision: https://reviews.llvm.org/D93563

(cherry picked from commit 886f9ff53155075bd5f1e994f17b85d1e1b7470c)
---
 llvm/lib/Target/BPF/BTFDebug.cpp               | 18 +++++++++++++++---
 llvm/lib/Target/BPF/BTFDebug.h                 |  2 +-
 .../BPF/BTF/extern-var-func-weak-section.ll    | 13 ++++++++++---
 .../test/CodeGen/BPF/BTF/extern-var-section.ll |  9 ++++++---
 .../CodeGen/BPF/BTF/extern-var-weak-section.ll |  9 ++++++---
 5 files changed, 38 insertions(+), 13 deletions(-)

diff --git a/llvm/lib/Target/BPF/BTFDebug.cpp b/llvm/lib/Target/BPF/BTFDebug.cpp
index da7ec32703a5..bedf159430dc 100644
--- a/llvm/lib/Target/BPF/BTFDebug.cpp
+++ b/llvm/lib/Target/BPF/BTFDebug.cpp
@@ -1224,8 +1224,8 @@ void BTFDebug::processGlobals(bool ProcessingMapDef) {
     const DataLayout &DL = Global.getParent()->getDataLayout();
     uint32_t Size = DL.getTypeAllocSize(Global.getType()->getElementType());
 
-    DataSecEntries[std::string(SecName)]->addVar(VarId, Asm->getSymbol(&Global),
-                                                 Size);
+    DataSecEntries[std::string(SecName)]->addDataSecEntry(VarId,
+        Asm->getSymbol(&Global), Size);
   }
 }
 
@@ -1303,7 +1303,19 @@ void BTFDebug::processFuncPrototypes(const Function *F) {
   uint8_t Scope = BTF::FUNC_EXTERN;
   auto FuncTypeEntry =
       std::make_unique<BTFTypeFunc>(SP->getName(), ProtoTypeId, Scope);
-  addType(std::move(FuncTypeEntry));
+  uint32_t FuncId = addType(std::move(FuncTypeEntry));
+  if (F->hasSection()) {
+    StringRef SecName = F->getSection();
+
+    if (DataSecEntries.find(std::string(SecName)) == DataSecEntries.end()) {
+      DataSecEntries[std::string(SecName)] =
+          std::make_unique<BTFKindDataSec>(Asm, std::string(SecName));
+    }
+
+    // We really don't know func size, set it to 0.
+    DataSecEntries[std::string(SecName)]->addDataSecEntry(FuncId,
+        Asm->getSymbol(F), 0);
+  }
 }
 
 void BTFDebug::endModule() {
diff --git a/llvm/lib/Target/BPF/BTFDebug.h b/llvm/lib/Target/BPF/BTFDebug.h
index fb20ec59574b..76f1901779bb 100644
--- a/llvm/lib/Target/BPF/BTFDebug.h
+++ b/llvm/lib/Target/BPF/BTFDebug.h
@@ -187,7 +187,7 @@ class BTFKindDataSec : public BTFTypeBase {
   uint32_t getSize() override {
     return BTFTypeBase::getSize() + BTF::BTFDataSecVarSize * Vars.size();
   }
-  void addVar(uint32_t Id, const MCSymbol *Sym, uint32_t Size) {
+  void addDataSecEntry(uint32_t Id, const MCSymbol *Sym, uint32_t Size) {
     Vars.push_back(std::make_tuple(Id, Sym, Size));
   }
   std::string getName() { return Name; }
diff --git a/llvm/test/CodeGen/BPF/BTF/extern-var-func-weak-section.ll b/llvm/test/CodeGen/BPF/BTF/extern-var-func-weak-section.ll
index 23332c9d9aa1..d47a9d6c504a 100644
--- a/llvm/test/CodeGen/BPF/BTF/extern-var-func-weak-section.ll
+++ b/llvm/test/CodeGen/BPF/BTF/extern-var-func-weak-section.ll
@@ -23,9 +23,9 @@ declare !dbg !4 extern_weak dso_local i32 @global_func(i8 signext) local_unnamed
 ; CHECK-NEXT:        .byte   0
 ; CHECK-NEXT:        .long   24
 ; CHECK-NEXT:        .long   0
-; CHECK-NEXT:        .long   88
-; CHECK-NEXT:        .long   88
-; CHECK-NEXT:        .long   72
+; CHECK-NEXT:        .long   112
+; CHECK-NEXT:        .long   112
+; CHECK-NEXT:        .long   76
 ; CHECK-NEXT:        .long   0                       # BTF_KIND_FUNC_PROTO(id = 1)
 ; CHECK-NEXT:        .long   218103808               # 0xd000000
 ; CHECK-NEXT:        .long   2
@@ -48,6 +48,12 @@ declare !dbg !4 extern_weak dso_local i32 @global_func(i8 signext) local_unnamed
 ; CHECK-NEXT:        .long   60                      # BTF_KIND_FUNC(id = 6)
 ; CHECK-NEXT:        .long   201326594               # 0xc000002
 ; CHECK-NEXT:        .long   4
+; CHECK-NEXT:        .long   72                      # BTF_KIND_DATASEC(id = 7)
+; CHECK-NEXT:        .long   251658241               # 0xf000001
+; CHECK-NEXT:        .long   0
+; CHECK-NEXT:        .long   6
+; CHECK-NEXT:        .long   global_func
+; CHECK-NEXT:        .long   0
 ; CHECK-NEXT:        .byte   0                       # string offset=0
 ; CHECK-NEXT:        .ascii  "int"                   # string offset=1
 ; CHECK-NEXT:        .byte   0
@@ -61,6 +67,7 @@ declare !dbg !4 extern_weak dso_local i32 @global_func(i8 signext) local_unnamed
 ; CHECK-NEXT:        .byte   0
 ; CHECK-NEXT:        .ascii  "global_func"           # string offset=60
 ; CHECK-NEXT:        .byte   0
+; CHECK-NEXT:        .ascii  "abc"                   # string offset=72
 
 attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/test/CodeGen/BPF/BTF/extern-var-section.ll b/llvm/test/CodeGen/BPF/BTF/extern-var-section.ll
index e01da7e209fd..520847449950 100644
--- a/llvm/test/CodeGen/BPF/BTF/extern-var-section.ll
+++ b/llvm/test/CodeGen/BPF/BTF/extern-var-section.ll
@@ -28,8 +28,8 @@ entry:
 ; CHECK-NEXT:        .byte   0
 ; CHECK-NEXT:        .long   24
 ; CHECK-NEXT:        .long   0
-; CHECK-NEXT:        .long   128
-; CHECK-NEXT:        .long   128
+; CHECK-NEXT:        .long   140
+; CHECK-NEXT:        .long   140
 ; CHECK-NEXT:        .long   79
 ; CHECK-NEXT:        .long   0                       # BTF_KIND_FUNC_PROTO(id = 1)
 ; CHECK-NEXT:        .long   218103808               # 0xd000000
@@ -58,7 +58,10 @@ entry:
 ; CHECK-NEXT:        .long   5
 ; CHECK-NEXT:        .long   2
 ; CHECK-NEXT:        .long   75                      # BTF_KIND_DATASEC(id = 8)
-; CHECK-NEXT:        .long   251658241               # 0xf000001
+; CHECK-NEXT:        .long   251658242               # 0xf000002
+; CHECK-NEXT:        .long   0
+; CHECK-NEXT:        .long   6
+; CHECK-NEXT:        .long   global_func
 ; CHECK-NEXT:        .long   0
 ; CHECK-NEXT:        .long   7
 ; CHECK-NEXT:        .long   ch
diff --git a/llvm/test/CodeGen/BPF/BTF/extern-var-weak-section.ll b/llvm/test/CodeGen/BPF/BTF/extern-var-weak-section.ll
index 6e64d9b4e482..bdf7fb49c560 100644
--- a/llvm/test/CodeGen/BPF/BTF/extern-var-weak-section.ll
+++ b/llvm/test/CodeGen/BPF/BTF/extern-var-weak-section.ll
@@ -28,8 +28,8 @@ declare !dbg !6 extern_weak dso_local i32 @global_func(i8 signext) local_unnamed
 ; CHECK-NEXT:        .byte   0
 ; CHECK-NEXT:        .long   24
 ; CHECK-NEXT:        .long   0
-; CHECK-NEXT:        .long   128
-; CHECK-NEXT:        .long   128
+; CHECK-NEXT:        .long   140
+; CHECK-NEXT:        .long   140
 ; CHECK-NEXT:        .long   79
 ; CHECK-NEXT:        .long   0                       # BTF_KIND_FUNC_PROTO(id = 1)
 ; CHECK-NEXT:        .long   218103808               # 0xd000000
@@ -58,7 +58,10 @@ declare !dbg !6 extern_weak dso_local i32 @global_func(i8 signext) local_unnamed
 ; CHECK-NEXT:        .long   5
 ; CHECK-NEXT:        .long   2
 ; CHECK-NEXT:        .long   75                      # BTF_KIND_DATASEC(id = 8)
-; CHECK-NEXT:        .long   251658241               # 0xf000001
+; CHECK-NEXT:        .long   251658242               # 0xf000002
+; CHECK-NEXT:        .long   0
+; CHECK-NEXT:        .long   6
+; CHECK-NEXT:        .long   global_func
 ; CHECK-NEXT:        .long   0
 ; CHECK-NEXT:        .long   7
 ; CHECK-NEXT:        .long   ch

From 5b149c437194d10877e9e45b3d8cc9252af1944b Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Mon, 12 Apr 2021 22:45:51 -0700
Subject: [PATCH 272/318] BPF: generate proper BTF for globals with
 WeakODRLinkage

For a global weak symbol defined as below:
  char g __attribute__((weak)) = 2;
LLVM generates an allocated global with WeakAnyLinkage,
for which BPF backend generates proper BTF info.

For the above example, if a modifier "const" is added like
  const char g __attribute__((weak)) = 2;
LLVM generates an allocated global with WeakODRLinkage,
for which BPF backend didn't generate any BTF as it
didn't handle WeakODRLinkage.

This patch addes support for WeakODRLinkage and proper
BTF info can be generated for weak symbol defined with
"const" modifier.

Differential Revision: https://reviews.llvm.org/D100362

(cherry picked from commit 968292cb93198442138128d850fd54dc7edc0035)
---
 llvm/lib/Target/BPF/BTFDebug.cpp           |  1 +
 llvm/test/CodeGen/BPF/BTF/weak-global-3.ll | 86 ++++++++++++++++++++++
 2 files changed, 87 insertions(+)
 create mode 100644 llvm/test/CodeGen/BPF/BTF/weak-global-3.ll

diff --git a/llvm/lib/Target/BPF/BTFDebug.cpp b/llvm/lib/Target/BPF/BTFDebug.cpp
index bedf159430dc..9249d679c7bd 100644
--- a/llvm/lib/Target/BPF/BTFDebug.cpp
+++ b/llvm/lib/Target/BPF/BTFDebug.cpp
@@ -1196,6 +1196,7 @@ void BTFDebug::processGlobals(bool ProcessingMapDef) {
     if (Linkage != GlobalValue::InternalLinkage &&
         Linkage != GlobalValue::ExternalLinkage &&
         Linkage != GlobalValue::WeakAnyLinkage &&
+        Linkage != GlobalValue::WeakODRLinkage &&
         Linkage != GlobalValue::ExternalWeakLinkage)
       continue;
 
diff --git a/llvm/test/CodeGen/BPF/BTF/weak-global-3.ll b/llvm/test/CodeGen/BPF/BTF/weak-global-3.ll
new file mode 100644
index 000000000000..dbd6380a9f1a
--- /dev/null
+++ b/llvm/test/CodeGen/BPF/BTF/weak-global-3.ll
@@ -0,0 +1,86 @@
+; RUN: llc -march=bpfel -filetype=asm -o - %s | FileCheck -check-prefixes=CHECK %s
+; RUN: llc -march=bpfeb -filetype=asm -o - %s | FileCheck -check-prefixes=CHECK %s
+;
+; Source code:
+;   const volatile char g __attribute__((weak)) = 2;
+;   int test() {
+;     return g;
+;   }
+; Compilation flag:
+;   clang -target bpf -O2 -g -S -emit-llvm test.c
+
+@g = weak_odr dso_local constant i8 2, align 1, !dbg !0
+
+; Function Attrs: nofree norecurse nounwind willreturn
+define dso_local i32 @test() local_unnamed_addr #0 !dbg !13 {
+entry:
+  %0 = load volatile i8, i8* @g, align 1, !dbg !17, !tbaa !18
+  %conv = sext i8 %0 to i32, !dbg !17
+  ret i32 %conv, !dbg !21
+}
+
+; CHECK:             .long   0                               # BTF_KIND_FUNC_PROTO(id = 1)
+; CHECK-NEXT:        .long   218103808                       # 0xd000000
+; CHECK-NEXT:        .long   2
+; CHECK-NEXT:        .long   1                               # BTF_KIND_INT(id = 2)
+; CHECK-NEXT:        .long   16777216                        # 0x1000000
+; CHECK-NEXT:        .long   4
+; CHECK-NEXT:        .long   16777248                        # 0x1000020
+; CHECK-NEXT:        .long   5                               # BTF_KIND_FUNC(id = 3)
+; CHECK-NEXT:        .long   201326593                       # 0xc000001
+; CHECK-NEXT:        .long   1
+; CHECK-NEXT:        .long   0                               # BTF_KIND_CONST(id = 4)
+; CHECK-NEXT:        .long   167772160                       # 0xa000000
+; CHECK-NEXT:        .long   5
+; CHECK-NEXT:        .long   0                               # BTF_KIND_VOLATILE(id = 5)
+; CHECK-NEXT:        .long   150994944                       # 0x9000000
+; CHECK-NEXT:        .long   6
+; CHECK-NEXT:        .long   47                              # BTF_KIND_INT(id = 6)
+; CHECK-NEXT:        .long   16777216                        # 0x1000000
+; CHECK-NEXT:        .long   1
+; CHECK-NEXT:        .long   16777224                        # 0x1000008
+; CHECK-NEXT:        .long   52                              # BTF_KIND_VAR(id = 7)
+; CHECK-NEXT:        .long   234881024                       # 0xe000000
+; CHECK-NEXT:        .long   4
+; CHECK-NEXT:        .long   1
+; CHECK-NEXT:        .long   54                              # BTF_KIND_DATASEC(id = 8)
+; CHECK-NEXT:        .long   251658241                       # 0xf000001
+; CHECK-NEXT:        .long   0
+; CHECK-NEXT:        .long   7
+; CHECK-NEXT:        .long   g
+; CHECK-NEXT:        .long   1
+
+; CHECK:             .ascii  "int"                           # string offset=1
+; CHECK:             .ascii  "test"                          # string offset=5
+; CHECK:             .ascii  "char"                          # string offset=47
+; CHECK:             .byte   103                             # string offset=52
+; CHECK:             .ascii  ".rodata"                       # string offset=54
+
+attributes #0 = { nofree norecurse nounwind willreturn "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
+
+!llvm.dbg.cu = !{!2}
+!llvm.module.flags = !{!9, !10, !11}
+!llvm.ident = !{!12}
+
+!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression())
+!1 = distinct !DIGlobalVariable(name: "g", scope: !2, file: !3, line: 1, type: !6, isLocal: false, isDefinition: true)
+!2 = distinct !DICompileUnit(language: DW_LANG_C99, file: !3, producer: "clang version 13.0.0 (https://github.com/llvm/llvm-project.git 9cc417cbca1cece0d55fa3d1e15682943a06139e)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !4, globals: !5, splitDebugInlining: false, nameTableKind: None)
+!3 = !DIFile(filename: "test.c", directory: "/tmp/home/yhs/btf/tests")
+!4 = !{}
+!5 = !{!0}
+!6 = !DIDerivedType(tag: DW_TAG_const_type, baseType: !7)
+!7 = !DIDerivedType(tag: DW_TAG_volatile_type, baseType: !8)
+!8 = !DIBasicType(name: "char", size: 8, encoding: DW_ATE_signed_char)
+!9 = !{i32 7, !"Dwarf Version", i32 4}
+!10 = !{i32 2, !"Debug Info Version", i32 3}
+!11 = !{i32 1, !"wchar_size", i32 4}
+!12 = !{!"clang version 13.0.0 (https://github.com/llvm/llvm-project.git 9cc417cbca1cece0d55fa3d1e15682943a06139e)"}
+!13 = distinct !DISubprogram(name: "test", scope: !3, file: !3, line: 2, type: !14, scopeLine: 2, flags: DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2, retainedNodes: !4)
+!14 = !DISubroutineType(types: !15)
+!15 = !{!16}
+!16 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!17 = !DILocation(line: 3, column: 10, scope: !13)
+!18 = !{!19, !19, i64 0}
+!19 = !{!"omnipotent char", !20, i64 0}
+!20 = !{!"Simple C/C++ TBAA"}
+!21 = !DILocation(line: 3, column: 3, scope: !13)

From ce779098006e9de32678f28140f22de7e4a19189 Mon Sep 17 00:00:00 2001
From: Nikita Popov <nikita.ppv@gmail.com>
Date: Fri, 30 Apr 2021 22:38:40 +0200
Subject: [PATCH 273/318] [ValueTracking] Limit scan when checking poison UB
 (PR50155)

The current code can scan an unlimited number of instructions,
if the containing basic block is very large. The test case from
PR50155 contains a basic block with approximately 100k instructions.

To avoid this, limit the number of instructions we inspect. At
the same time, drop the limit on the number of basic blocks, as
this will be implicitly limited by the number of instructions as
well.

(cherry picked from commit 2cd78686055f1badb9aa55cb95e189548ffc82f0)
---
 llvm/lib/Analysis/ValueTracking.cpp | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp
index e174c5efe424..75486d3c80e7 100644
--- a/llvm/lib/Analysis/ValueTracking.cpp
+++ b/llvm/lib/Analysis/ValueTracking.cpp
@@ -5150,6 +5150,9 @@ static bool programUndefinedIfUndefOrPoison(const Value *V,
     return false;
   }
 
+  // Limit number of instructions we look at, to avoid scanning through large
+  // blocks. The current limit is chosen arbitrarily.
+  unsigned ScanLimit = 32;
   BasicBlock::const_iterator End = BB->end();
 
   if (!PoisonOnly) {
@@ -5160,6 +5163,11 @@ static bool programUndefinedIfUndefOrPoison(const Value *V,
     // For example, 'udiv x, (undef | 1)' isn't UB.
 
     for (auto &I : make_range(Begin, End)) {
+      if (isa<DbgInfoIntrinsic>(I))
+        continue;
+      if (--ScanLimit == 0)
+        break;
+
       if (const auto *CB = dyn_cast<CallBase>(&I)) {
         for (unsigned i = 0; i < CB->arg_size(); ++i) {
           if (CB->paramHasAttr(i, Attribute::NoUndef) &&
@@ -5186,9 +5194,12 @@ static bool programUndefinedIfUndefOrPoison(const Value *V,
   for_each(V->users(), Propagate);
   Visited.insert(BB);
 
-  unsigned Iter = 0;
-  while (Iter++ < MaxAnalysisRecursionDepth) {
+  while (true) {
     for (auto &I : make_range(Begin, End)) {
+      if (isa<DbgInfoIntrinsic>(I))
+        continue;
+      if (--ScanLimit == 0)
+        return false;
       if (mustTriggerUB(&I, YieldsPoison))
         return true;
       if (!isGuaranteedToTransferExecutionToSuccessor(&I))

From 7cc5b1593554c4a589e5e57c64c28bbd85c9987f Mon Sep 17 00:00:00 2001
From: Alexey Bataev <a.bataev@outlook.com>
Date: Wed, 24 Mar 2021 09:24:24 -0700
Subject: [PATCH 274/318] [OPENMP]Fix PR48571: critical/master in outlined
 contexts cause crash.

If emit inlined region for master/critical directives, no need to clear
lambda/block context data, otherwise the variables cannot be found and
it causes a crash at compile time.

Differential Revision: https://reviews.llvm.org/D99280

(cherry picked from commit 7654bb6303d290b19cad29137be810e69a0bf917)
---
 clang/lib/CodeGen/CGOpenMPRuntime.cpp  | 32 ++++++++++++++---------
 clang/test/OpenMP/critical_codegen.cpp | 25 ++++++++++++++++++
 clang/test/OpenMP/master_codegen.cpp   | 35 ++++++++++++++++++++++++++
 3 files changed, 80 insertions(+), 12 deletions(-)

diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
index 83dfa0780547..caa5291ff6fa 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -409,6 +409,7 @@ class InlinedOpenMPRegionRAII {
   llvm::DenseMap<const VarDecl *, FieldDecl *> LambdaCaptureFields;
   FieldDecl *LambdaThisCaptureField = nullptr;
   const CodeGen::CGBlockInfo *BlockInfo = nullptr;
+  bool NoInheritance = false;
 
 public:
   /// Constructs region for combined constructs.
@@ -416,16 +417,19 @@ class InlinedOpenMPRegionRAII {
   /// a list of functions used for code generation of implicitly inlined
   /// regions.
   InlinedOpenMPRegionRAII(CodeGenFunction &CGF, const RegionCodeGenTy &CodeGen,
-                          OpenMPDirectiveKind Kind, bool HasCancel)
-      : CGF(CGF) {
+                          OpenMPDirectiveKind Kind, bool HasCancel,
+                          bool NoInheritance = true)
+      : CGF(CGF), NoInheritance(NoInheritance) {
     // Start emission for the construct.
     CGF.CapturedStmtInfo = new CGOpenMPInlinedRegionInfo(
         CGF.CapturedStmtInfo, CodeGen, Kind, HasCancel);
-    std::swap(CGF.LambdaCaptureFields, LambdaCaptureFields);
-    LambdaThisCaptureField = CGF.LambdaThisCaptureField;
-    CGF.LambdaThisCaptureField = nullptr;
-    BlockInfo = CGF.BlockInfo;
-    CGF.BlockInfo = nullptr;
+    if (NoInheritance) {
+      std::swap(CGF.LambdaCaptureFields, LambdaCaptureFields);
+      LambdaThisCaptureField = CGF.LambdaThisCaptureField;
+      CGF.LambdaThisCaptureField = nullptr;
+      BlockInfo = CGF.BlockInfo;
+      CGF.BlockInfo = nullptr;
+    }
   }
 
   ~InlinedOpenMPRegionRAII() {
@@ -434,9 +438,11 @@ class InlinedOpenMPRegionRAII {
         cast<CGOpenMPInlinedRegionInfo>(CGF.CapturedStmtInfo)->getOldCSI();
     delete CGF.CapturedStmtInfo;
     CGF.CapturedStmtInfo = OldCSI;
-    std::swap(CGF.LambdaCaptureFields, LambdaCaptureFields);
-    CGF.LambdaThisCaptureField = LambdaThisCaptureField;
-    CGF.BlockInfo = BlockInfo;
+    if (NoInheritance) {
+      std::swap(CGF.LambdaCaptureFields, LambdaCaptureFields);
+      CGF.LambdaThisCaptureField = LambdaThisCaptureField;
+      CGF.BlockInfo = BlockInfo;
+    }
   }
 };
 
@@ -3853,7 +3859,7 @@ static void emitPrivatesInit(CodeGenFunction &CGF,
           // Processing for implicitly captured variables.
           InlinedOpenMPRegionRAII Region(
               CGF, [](CodeGenFunction &, PrePostActionTy &) {}, OMPD_unknown,
-              /*HasCancel=*/false);
+              /*HasCancel=*/false, /*NoInheritance=*/true);
           SharedRefLValue = CGF.EmitLValue(Pair.second.OriginalRef);
         }
         if (Type->isArrayType()) {
@@ -6214,7 +6220,9 @@ void CGOpenMPRuntime::emitInlinedDirective(CodeGenFunction &CGF,
                                            bool HasCancel) {
   if (!CGF.HaveInsertPoint())
     return;
-  InlinedOpenMPRegionRAII Region(CGF, CodeGen, InnerKind, HasCancel);
+  InlinedOpenMPRegionRAII Region(CGF, CodeGen, InnerKind, HasCancel,
+                                 InnerKind != OMPD_critical &&
+                                     InnerKind != OMPD_master);
   CGF.CapturedStmtInfo->EmitBody(CGF, /*S=*/nullptr);
 }
 
diff --git a/clang/test/OpenMP/critical_codegen.cpp b/clang/test/OpenMP/critical_codegen.cpp
index 46fad63b3bd8..d84f2b2af22b 100644
--- a/clang/test/OpenMP/critical_codegen.cpp
+++ b/clang/test/OpenMP/critical_codegen.cpp
@@ -68,6 +68,31 @@ int main() {
   return a;
 }
 
+// ALL-LABEL:        lambda_critical
+// TERM_DEBUG-LABEL: lambda_critical
+void lambda_critical(int a, int b) {
+  auto l = [=]() {
+#pragma omp critical
+    {
+      // ALL: call void @__kmpc_critical(
+      int c = a + b;
+    }
+  };
+
+  l();
+
+  auto l1 = [=]() {
+#pragma omp parallel
+#pragma omp critical
+    {
+      // ALL: call void @__kmpc_critical(
+      int c = a + b;
+    }
+  };
+
+  l1();
+}
+
 struct S {
   int a;
 };
diff --git a/clang/test/OpenMP/master_codegen.cpp b/clang/test/OpenMP/master_codegen.cpp
index 8554ad8e7dec..353284ea8541 100644
--- a/clang/test/OpenMP/master_codegen.cpp
+++ b/clang/test/OpenMP/master_codegen.cpp
@@ -55,6 +55,41 @@ int main() {
   return a;
 }
 
+// ALL-LABEL:        lambda_master
+// TERM_DEBUG-LABEL: lambda_master
+void lambda_master(int a, int b) {
+  auto l = [=]() {
+#pragma omp master
+    {
+      // ALL: call i32 @__kmpc_master(
+      int c = a + b;
+    }
+  };
+
+  l();
+
+  auto l1 = [=]() {
+#pragma omp parallel
+#pragma omp master
+    {
+      // ALL: call i32 @__kmpc_master(
+      int c = a + b;
+    }
+  };
+
+  l1();
+
+  auto l2 = [=]() {
+#pragma omp parallel master
+    {
+      // ALL: call i32 @__kmpc_master(
+      int c = a + b;
+    }
+  };
+
+  l2();
+}
+
 // ALL-LABEL:      parallel_master
 // TERM_DEBUG-LABEL: parallel_master
 void parallel_master() {

From 3be5dbbc32dd348dddcec9e3baec96b7dcef8a35 Mon Sep 17 00:00:00 2001
From: serge-sans-paille <sguelton@redhat.com>
Date: Mon, 22 Mar 2021 10:05:25 +0100
Subject: [PATCH 275/318] Make clangd CompletionModel usable even with
 non-standard (but supported) layout

llvm supports specifying a non-standard layout where each project lies in its
own place. Do not assume a fixed layout and use the appropriate cmake variable
instead.

Differential Revision: https://reviews.llvm.org/D96787

(cherry picked from commit f51ab1871655a9a96134c2636c37dcb5a6b01ac3)
---
 clang-tools-extra/clangd/quality/CompletionModel.cmake | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/clang-tools-extra/clangd/quality/CompletionModel.cmake b/clang-tools-extra/clangd/quality/CompletionModel.cmake
index 60c6d2aa8433..41bc2ed1890b 100644
--- a/clang-tools-extra/clangd/quality/CompletionModel.cmake
+++ b/clang-tools-extra/clangd/quality/CompletionModel.cmake
@@ -5,8 +5,8 @@
 # will define a C++ class called ${cpp_class} - which may be a
 # namespace-qualified class name.
 function(gen_decision_forest model filename cpp_class)
-  set(model_compiler ${CMAKE_SOURCE_DIR}/../clang-tools-extra/clangd/quality/CompletionModelCodegen.py)
-  
+  set(model_compiler ${LLVM_EXTERNAL_CLANG_TOOLS_EXTRA_SOURCE_DIR}/clangd/quality/CompletionModelCodegen.py)
+
   set(output_dir ${CMAKE_CURRENT_BINARY_DIR})
   set(header_file ${output_dir}/${filename}.h)
   set(cpp_file ${output_dir}/${filename}.cpp)

From adae17728bad62fa9ce1635905d0b0bde30e3eba Mon Sep 17 00:00:00 2001
From: Harald van Dijk <harald@gigawatt.nl>
Date: Wed, 5 May 2021 19:25:34 +0100
Subject: [PATCH 276/318] Make clangd CompletionModel not depend on directory
 layout.

The current code accounts for two possible layouts, but there is at
least a third supported layout: clang-tools-extra may also be checked
out as clang/tools/extra with the releases, which was not yet handled.
Rather than treating that as a special case, use the location of
CompletionModel.cmake to handle all three cases. This should address the
problems that prompted D96787 and the problems that prompted the
proposed revert D100625.

Reviewed By: usaxena95

Differential Revision: https://reviews.llvm.org/D101851

(cherry picked from commit 7907c46fe6195728fafd843b8c0fb19a3e68e9ad)
---
 clang-tools-extra/clangd/quality/CompletionModel.cmake | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/clang-tools-extra/clangd/quality/CompletionModel.cmake b/clang-tools-extra/clangd/quality/CompletionModel.cmake
index 41bc2ed1890b..dc0c0cde4dab 100644
--- a/clang-tools-extra/clangd/quality/CompletionModel.cmake
+++ b/clang-tools-extra/clangd/quality/CompletionModel.cmake
@@ -4,8 +4,9 @@
 # ${CMAKE_CURRENT_BINARY_DIR}. The generated header
 # will define a C++ class called ${cpp_class} - which may be a
 # namespace-qualified class name.
+set(CLANGD_COMPLETION_MODEL_COMPILER ${CMAKE_CURRENT_LIST_DIR}/CompletionModelCodegen.py)
 function(gen_decision_forest model filename cpp_class)
-  set(model_compiler ${LLVM_EXTERNAL_CLANG_TOOLS_EXTRA_SOURCE_DIR}/clangd/quality/CompletionModelCodegen.py)
+  set(model_compiler ${CLANGD_COMPLETION_MODEL_COMPILER})
 
   set(output_dir ${CMAKE_CURRENT_BINARY_DIR})
   set(header_file ${output_dir}/${filename}.h)

From a066f4eb679488cfeed2fec24574adcb55f367a1 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@sifive.com>
Date: Sat, 12 Jun 2021 09:49:32 -0700
Subject: [PATCH 277/318] [X86] Add ISD::FREEZE and ISD::AssertAlign to the
 list of opcodes that don't guarantee upper 32 bits are zero.

The freeze issue was reported here
https://llvm.discourse.group/t/bug-or-feature-freeze-instruction/3639

I don't have a test for AssertAlign. I just noticed it was missing
and assume it should be similar to the other two Asserts.

Reviewed By: RKSimon

Differential Revision: https://reviews.llvm.org/D104178

(cherry picked from commit c997867dc084a1bcf631816f964b3ff49a297ba3)
---
 llvm/lib/Target/X86/X86InstrCompiler.td | 11 +++++++----
 llvm/test/CodeGen/X86/freeze.ll         | 23 +++++++++++++++++++++++
 2 files changed, 30 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td b/llvm/lib/Target/X86/X86InstrCompiler.td
index 7a2facf226d8..dc6361aecc60 100644
--- a/llvm/lib/Target/X86/X86InstrCompiler.td
+++ b/llvm/lib/Target/X86/X86InstrCompiler.td
@@ -1344,15 +1344,18 @@ def : Pat<(i32 (anyext_sdiv GR8:$src)), (MOVSX32rr8 GR8:$src)>;
 
 // Any instruction that defines a 32-bit result leaves the high half of the
 // register. Truncate can be lowered to EXTRACT_SUBREG. CopyFromReg may
-// be copying from a truncate. Any other 32-bit operation will zero-extend
-// up to 64 bits. AssertSext/AssertZext aren't saying anything about the upper
-// 32 bits, they're probably just qualifying a CopyFromReg.
+// be copying from a truncate. AssertSext/AssertZext/AssertAlign aren't saying
+// anything about the upper 32 bits, they're probably just qualifying a
+// CopyFromReg. FREEZE may be coming from a a truncate. Any other 32-bit
+// operation will zero-extend up to 64 bits.
 def def32 : PatLeaf<(i32 GR32:$src), [{
   return N->getOpcode() != ISD::TRUNCATE &&
          N->getOpcode() != TargetOpcode::EXTRACT_SUBREG &&
          N->getOpcode() != ISD::CopyFromReg &&
          N->getOpcode() != ISD::AssertSext &&
-         N->getOpcode() != ISD::AssertZext;
+         N->getOpcode() != ISD::AssertZext &&
+         N->getOpcode() != ISD::AssertAlign &&
+         N->getOpcode() != ISD::FREEZE;
 }]>;
 
 // In the case of a 32-bit def that is known to implicitly zero-extend,
diff --git a/llvm/test/CodeGen/X86/freeze.ll b/llvm/test/CodeGen/X86/freeze.ll
index cf015d3c892c..7131f3a5bc4f 100644
--- a/llvm/test/CodeGen/X86/freeze.ll
+++ b/llvm/test/CodeGen/X86/freeze.ll
@@ -122,3 +122,26 @@ define i64 @freeze_array() {
   %t1 = add i64 %v1, %v2
   ret i64 %t1
 }
+
+; Make sure we emit a movl to zext the input before the imulq. This previously
+; failed because freeze was not listed in the instructions that don't zext their
+; result in the def32 pattern X86InstrCompiler.td.
+define i32 @freeze_zext(i64 %a) nounwind {
+; X86ASM-LABEL: freeze_zext:
+; X86ASM:       # %bb.0: # %entry
+; X86ASM-NEXT:    movq %rdi, %rax
+; X86ASM-NEXT:    movl %eax, %ecx
+; X86ASM-NEXT:    movl $3435973837, %edx # imm = 0xCCCCCCCD
+; X86ASM-NEXT:    imulq %rcx, %rdx
+; X86ASM-NEXT:    shrq $35, %rdx
+; X86ASM-NEXT:    addl %edx, %edx
+; X86ASM-NEXT:    leal (%rdx,%rdx,4), %ecx
+; X86ASM-NEXT:    subl %ecx, %eax
+; X86ASM-NEXT:    # kill: def $eax killed $eax killed $rax
+; X86ASM-NEXT:    retq
+entry:
+  %x = trunc i64 %a to i32
+  %y = freeze i32 %x
+  %z = urem i32 %y, 10
+  ret i32 %z
+}

From a95bf588bd727fa71486098d76b8a1bc00650361 Mon Sep 17 00:00:00 2001
From: Yvan Roux <yvan.roux@linaro.org>
Date: Wed, 9 Jun 2021 15:30:50 +0200
Subject: [PATCH 278/318] [ARM] Fix Machine Outliner LDRD/STRD handling in
 Thumb mode.

This is a fix for PR50481

Immediate values for AddrModeT2_i8s4 are already scaled in MCinst operand.
This patch changes the number of bits and scale factor to reflect that
state when checking stack offset status. AddrModeT2_i7s[2|4] also have
this particularity but since MVE instructions are not outlined, just move
these cases to the unhandled ones.

Differential Revision: https://reviews.llvm.org/D103167

(cherry picked from commit 6c78dbd4ca1f2c25cdc276d646c7920afe856ca3)
---
 llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp      | 22 ++++++++-----------
 .../machine-outliner-stack-fixup-thumb.mir    | 20 ++++++++---------
 2 files changed, 19 insertions(+), 23 deletions(-)

diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
index 112eb59e173d..e418d53b56a4 100644
--- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -5934,6 +5934,9 @@ bool ARMBaseInstrInfo::checkAndUpdateStackOffset(MachineInstr *MI,
       || AddrMode == ARMII::AddrModeT2_so // SP can't be used as based register
       || AddrMode == ARMII::AddrModeT2_pc // PCrel access
       || AddrMode == ARMII::AddrMode2     // Used by PRE and POST indexed LD/ST
+      || AddrMode == ARMII::AddrModeT2_i7 // v8.1-M MVE
+      || AddrMode == ARMII::AddrModeT2_i7s2 // v8.1-M MVE
+      || AddrMode == ARMII::AddrModeT2_i7s4 // v8.1-M sys regs VLDR/VSTR
       || AddrMode == ARMII::AddrModeNone)
     return false;
 
@@ -5976,6 +5979,10 @@ bool ARMBaseInstrInfo::checkAndUpdateStackOffset(MachineInstr *MI,
     NumBits = 8;
     break;
   case ARMII::AddrModeT2_i8s4:
+    // FIXME: Values are already scaled in this addressing mode.
+    assert((Fixup & 3) == 0 && "Can't encode this offset!");
+    NumBits = 10;
+    break;
   case ARMII::AddrModeT2_ldrex:
     NumBits = 8;
     Scale = 4;
@@ -5984,17 +5991,6 @@ bool ARMBaseInstrInfo::checkAndUpdateStackOffset(MachineInstr *MI,
   case ARMII::AddrMode_i12:
     NumBits = 12;
     break;
-  case ARMII::AddrModeT2_i7:
-    NumBits = 7;
-    break;
-  case ARMII::AddrModeT2_i7s2:
-    NumBits = 7;
-    Scale = 2;
-    break;
-  case ARMII::AddrModeT2_i7s4:
-    NumBits = 7;
-    Scale = 4;
-    break;
   case ARMII::AddrModeT1_s: // SP-relative LD/ST
     NumBits = 8;
     Scale = 4;
@@ -6004,8 +6000,8 @@ bool ARMBaseInstrInfo::checkAndUpdateStackOffset(MachineInstr *MI,
   }
   // Make sure the offset is encodable for instructions that scale the
   // immediate.
-  if (((OffVal * Scale + Fixup) & (Scale - 1)) != 0)
-    return false;
+  assert(((OffVal * Scale + Fixup) & (Scale - 1)) == 0 &&
+         "Can't encode this offset!");
   OffVal += Fixup / Scale;
 
   unsigned Mask = (1 << NumBits) - 1;
diff --git a/llvm/test/CodeGen/ARM/machine-outliner-stack-fixup-thumb.mir b/llvm/test/CodeGen/ARM/machine-outliner-stack-fixup-thumb.mir
index 7d9b19553b08..6c940f15eba6 100644
--- a/llvm/test/CodeGen/ARM/machine-outliner-stack-fixup-thumb.mir
+++ b/llvm/test/CodeGen/ARM/machine-outliner-stack-fixup-thumb.mir
@@ -81,23 +81,23 @@ body:             |
     ;CHECK-LABEL: name:           CheckAddrModeT2_i8s4
     ;CHECK: $r0 = tMOVr $r1, 14 /* CC::al */, $noreg
     ;CHECK-NEXT: tBL 14 /* CC::al */, $noreg, @OUTLINED_FUNCTION_[[I8S4:[0-9]+]]
-    ;CHECK-NEXT: t2STRDi8 $r0, $r1, $sp, 254, 14 /* CC::al */, $noreg
+    ;CHECK-NEXT: t2STRDi8 $r0, $r1, $sp, 1020, 14 /* CC::al */, $noreg
     $r0 = tMOVr $r1, 14, $noreg
     tBL 14, $noreg, @foo, implicit-def dead $lr, implicit $sp
     t2STRDi8 $r0, $r1, $sp, 0, 14, $noreg
     t2STRDi8 $r0, $r1, $sp, 8, 14, $noreg
-    t2STRDi8 $r0, $r1, $sp, 253, 14, $noreg
-    t2STRDi8 $r0, $r1, $sp, 254, 14, $noreg
+    t2STRDi8 $r0, $r1, $sp, 1012, 14, $noreg
+    t2STRDi8 $r0, $r1, $sp, 1020, 14, $noreg
     tBL 14, $noreg, @foo, implicit-def dead $lr, implicit $sp
     t2STRDi8 $r0, $r1, $sp, 0, 14, $noreg
     t2STRDi8 $r0, $r1, $sp, 8, 14, $noreg
-    t2STRDi8 $r0, $r1, $sp, 253, 14, $noreg
-    t2STRDi8 $r0, $r1, $sp, 254, 14, $noreg
+    t2STRDi8 $r0, $r1, $sp, 1012, 14, $noreg
+    t2STRDi8 $r0, $r1, $sp, 1020, 14, $noreg
     tBL 14, $noreg, @foo, implicit-def dead $lr, implicit $sp
     t2STRDi8 $r0, $r1, $sp, 0, 14, $noreg
     t2STRDi8 $r0, $r1, $sp, 8, 14, $noreg
-    t2STRDi8 $r0, $r1, $sp, 253, 14, $noreg
-    t2STRDi8 $r0, $r1, $sp, 254, 14, $noreg
+    t2STRDi8 $r0, $r1, $sp, 1012, 14, $noreg
+    t2STRDi8 $r0, $r1, $sp, 1020, 14, $noreg
     BX_RET 14, $noreg
 ...
 ---
@@ -205,9 +205,9 @@ body:             |
     ;CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 8
     ;CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $lr, -8
     ;CHECK-NEXT: tBL 14 /* CC::al */, $noreg, @foo, implicit-def dead $lr, implicit $sp
-    ;CHECK-NEXT: t2STRDi8 $r0, $r1, $sp, 2, 14 /* CC::al */, $noreg
-    ;CHECK-NEXT: t2STRDi8 $r0, $r1, $sp, 10, 14 /* CC::al */, $noreg
-    ;CHECK-NEXT: t2STRDi8 $r0, $r1, $sp, 255, 14 /* CC::al */, $noreg
+    ;CHECK-NEXT: t2STRDi8 $r0, $r1, $sp, 8, 14 /* CC::al */, $noreg
+    ;CHECK-NEXT: t2STRDi8 $r0, $r1, $sp, 16, 14 /* CC::al */, $noreg
+    ;CHECK-NEXT: t2STRDi8 $r0, $r1, $sp, 1020, 14 /* CC::al */, $noreg
     ;CHECK-NEXT: $lr, $sp = t2LDR_POST $sp, 8, 14 /* CC::al */, $noreg
 
     ;CHECK: name:           OUTLINED_FUNCTION_[[I12]]

From a37dff2bbf68b1a98ce6c48c6e4e5b32f7df0d9c Mon Sep 17 00:00:00 2001
From: Kai Luo <lkail@cn.ibm.com>
Date: Wed, 9 Jun 2021 02:22:48 +0000
Subject: [PATCH 279/318] [PowerPC][Dwarf] Assign MMA register's dwarf register
 number to negative value

According to ELF V2 ABI, `0` should be the dwarf number of `r0`. Currently MMA's register also uses `0` as its dwarf number, this confuses `RegisterInfoEmitter` and generates wrong dwarf -> llvm mapping.
```
extern const MCRegisterInfo::DwarfLLVMRegPair PPCDwarfFlavour1Dwarf2L[] = {
  { 0U, PPC::VSRp31 },
```
This leads to wrong cfi output in https://reviews.llvm.org/D100290.

Reviewed By: jsji

Differential Revision: https://reviews.llvm.org/D103761

(cherry picked from commit c87c294397ea4c3dae31f5a7fd6e38602338fd57)
---
 llvm/lib/Target/PowerPC/PPCRegisterInfo.td | 36 +++++++++++-----------
 1 file changed, 18 insertions(+), 18 deletions(-)

diff --git a/llvm/lib/Target/PowerPC/PPCRegisterInfo.td b/llvm/lib/Target/PowerPC/PPCRegisterInfo.td
index 45d60369018b..551735c85b51 100644
--- a/llvm/lib/Target/PowerPC/PPCRegisterInfo.td
+++ b/llvm/lib/Target/PowerPC/PPCRegisterInfo.td
@@ -173,7 +173,7 @@ let SubRegIndices = [sub_vsx0, sub_vsx1] in {
   foreach Index = { 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 } in {
     def VSRp#!srl(Index, 1) : VSRPair<!srl(Index, 1), "vsp"#Index,
                                       [!cast<VSRL>("VSL"#Index), !cast<VSRL>("VSL"#!add(Index, 1))]>,
-                              DwarfRegNum<[0, 0]>;
+                              DwarfRegNum<[-1, -1]>;
   }
 
   // VSR pairs 16 - 31 (corresponding to VSRs 32 - 62 paired with 33 - 63).
@@ -181,7 +181,7 @@ let SubRegIndices = [sub_vsx0, sub_vsx1] in {
     def VSRp#!add(!srl(Index, 1), 16) :
       VSRPair<!add(!srl(Index, 1), 16), "vsp"#!add(Index, 32),
               [!cast<VR>("V"#Index), !cast<VR>("V"#!add(Index, 1))]>,
-      DwarfRegNum<[0, 0]>;
+      DwarfRegNum<[-1, -1]>;
   }
 }
 
@@ -422,14 +422,14 @@ def CARRYRC : RegisterClass<"PPC", [i32], 32, (add CARRY, XER)> {
 }
 
 let SubRegIndices = [sub_pair0, sub_pair1] in {
-  def ACC0 : ACC<0, "acc0", [VSRp0, VSRp1]>, DwarfRegNum<[0, 0]>;
-  def ACC1 : ACC<1, "acc1", [VSRp2, VSRp3]>, DwarfRegNum<[0, 0]>;
-  def ACC2 : ACC<2, "acc2", [VSRp4, VSRp5]>, DwarfRegNum<[0, 0]>;
-  def ACC3 : ACC<3, "acc3", [VSRp6, VSRp7]>, DwarfRegNum<[0, 0]>;
-  def ACC4 : ACC<4, "acc4", [VSRp8, VSRp9]>, DwarfRegNum<[0, 0]>;
-  def ACC5 : ACC<5, "acc5", [VSRp10, VSRp11]>, DwarfRegNum<[0, 0]>;
-  def ACC6 : ACC<6, "acc6", [VSRp12, VSRp13]>, DwarfRegNum<[0, 0]>;
-  def ACC7 : ACC<7, "acc7", [VSRp14, VSRp15]>, DwarfRegNum<[0, 0]>;
+  def ACC0 : ACC<0, "acc0", [VSRp0, VSRp1]>, DwarfRegNum<[-1, -1]>;
+  def ACC1 : ACC<1, "acc1", [VSRp2, VSRp3]>, DwarfRegNum<[-1, -1]>;
+  def ACC2 : ACC<2, "acc2", [VSRp4, VSRp5]>, DwarfRegNum<[-1, -1]>;
+  def ACC3 : ACC<3, "acc3", [VSRp6, VSRp7]>, DwarfRegNum<[-1, -1]>;
+  def ACC4 : ACC<4, "acc4", [VSRp8, VSRp9]>, DwarfRegNum<[-1, -1]>;
+  def ACC5 : ACC<5, "acc5", [VSRp10, VSRp11]>, DwarfRegNum<[-1, -1]>;
+  def ACC6 : ACC<6, "acc6", [VSRp12, VSRp13]>, DwarfRegNum<[-1, -1]>;
+  def ACC7 : ACC<7, "acc7", [VSRp14, VSRp15]>, DwarfRegNum<[-1, -1]>;
 }
 def ACCRC : RegisterClass<"PPC", [v512i1], 128, (add ACC0, ACC1, ACC2, ACC3,
                                                       ACC4, ACC5, ACC6, ACC7)> {
@@ -437,14 +437,14 @@ def ACCRC : RegisterClass<"PPC", [v512i1], 128, (add ACC0, ACC1, ACC2, ACC3,
 }
 
 let SubRegIndices = [sub_pair0, sub_pair1] in {
-  def UACC0 : UACC<0, "acc0", [VSRp0, VSRp1]>, DwarfRegNum<[0, 0]>;
-  def UACC1 : UACC<1, "acc1", [VSRp2, VSRp3]>, DwarfRegNum<[0, 0]>;
-  def UACC2 : UACC<2, "acc2", [VSRp4, VSRp5]>, DwarfRegNum<[0, 0]>;
-  def UACC3 : UACC<3, "acc3", [VSRp6, VSRp7]>, DwarfRegNum<[0, 0]>;
-  def UACC4 : UACC<4, "acc4", [VSRp8, VSRp9]>, DwarfRegNum<[0, 0]>;
-  def UACC5 : UACC<5, "acc5", [VSRp10, VSRp11]>, DwarfRegNum<[0, 0]>;
-  def UACC6 : UACC<6, "acc6", [VSRp12, VSRp13]>, DwarfRegNum<[0, 0]>;
-  def UACC7 : UACC<7, "acc7", [VSRp14, VSRp15]>, DwarfRegNum<[0, 0]>;
+  def UACC0 : UACC<0, "acc0", [VSRp0, VSRp1]>, DwarfRegNum<[-1, -1]>;
+  def UACC1 : UACC<1, "acc1", [VSRp2, VSRp3]>, DwarfRegNum<[-1, -1]>;
+  def UACC2 : UACC<2, "acc2", [VSRp4, VSRp5]>, DwarfRegNum<[-1, -1]>;
+  def UACC3 : UACC<3, "acc3", [VSRp6, VSRp7]>, DwarfRegNum<[-1, -1]>;
+  def UACC4 : UACC<4, "acc4", [VSRp8, VSRp9]>, DwarfRegNum<[-1, -1]>;
+  def UACC5 : UACC<5, "acc5", [VSRp10, VSRp11]>, DwarfRegNum<[-1, -1]>;
+  def UACC6 : UACC<6, "acc6", [VSRp12, VSRp13]>, DwarfRegNum<[-1, -1]>;
+  def UACC7 : UACC<7, "acc7", [VSRp14, VSRp15]>, DwarfRegNum<[-1, -1]>;
 }
 def UACCRC : RegisterClass<"PPC", [v512i1], 128,
                            (add UACC0, UACC1, UACC2, UACC3,

From 04a68288ded459c7e76135a9ee4b7e9d4bf4cdc2 Mon Sep 17 00:00:00 2001
From: Kai Luo <lkail@cn.ibm.com>
Date: Wed, 9 Jun 2021 06:24:14 +0000
Subject: [PATCH 280/318] [PowerPC] Make sure the first probe is full size or
 is the last probe when stack is realigned

When `-fstack-clash-protection` is enabled and stack has to be realigned, some parts of redzone is written prior the probe, so probe might overwrite content already written in redzone. To avoid it, we have to make sure the first probe is at full probe size or is the last probe so that we can skip redzone.

It also fixes violation of ABI under PPC where `r1` isn't updated atomically.

This fixes https://bugs.llvm.org/show_bug.cgi?id=49903.

Reviewed By: jsji

Differential Revision: https://reviews.llvm.org/D100290

(cherry picked from commit bf58600badb1138a501ad81b07298207a7a64b2a)
---
 llvm/lib/Target/PowerPC/PPCFrameLowering.cpp  | 365 +++++------
 llvm/test/CodeGen/PowerPC/pr46759.ll          |  46 +-
 .../PowerPC/stack-clash-prologue-nounwind.ll  | 222 +++----
 .../CodeGen/PowerPC/stack-clash-prologue.ll   | 577 +++++++++---------
 4 files changed, 596 insertions(+), 614 deletions(-)

diff --git a/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp b/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp
index 50ce11b8374f..16536bf23deb 100644
--- a/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp
@@ -859,15 +859,15 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF,
     BuildMI(MBB, MBBI, dl,
             TII.get(isPPC64 ? PPC::PROBED_STACKALLOC_64
                             : PPC::PROBED_STACKALLOC_32))
-        .addDef(ScratchReg)
-        .addDef(TempReg) // TempReg stores the old sp.
+        .addDef(TempReg)
+        .addDef(ScratchReg) // ScratchReg stores the old sp.
         .addImm(NegFrameSize);
     // FIXME: HasSTUX is only read if HasRedZone is not set, in such case, we
     // update the ScratchReg to meet the assumption that ScratchReg contains
     // the NegFrameSize. This solution is rather tricky.
     if (!HasRedZone) {
       BuildMI(MBB, MBBI, dl, TII.get(PPC::SUBF), ScratchReg)
-          .addReg(TempReg)
+          .addReg(ScratchReg)
           .addReg(SPReg);
       HasSTUX = true;
     }
@@ -1187,7 +1187,6 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF,
 
 void PPCFrameLowering::inlineStackProbe(MachineFunction &MF,
                                         MachineBasicBlock &PrologMBB) const {
-  // TODO: Generate CFI instructions.
   bool isPPC64 = Subtarget.isPPC64();
   const PPCTargetLowering &TLI = *Subtarget.getTargetLowering();
   const PPCInstrInfo &TII = *Subtarget.getInstrInfo();
@@ -1219,6 +1218,7 @@ void PPCFrameLowering::inlineStackProbe(MachineFunction &MF,
   bool HasBP = RegInfo->hasBasePointer(MF);
   Register BPReg = RegInfo->getBaseRegister(MF);
   Align MaxAlign = MFI.getMaxAlign();
+  bool HasRedZone = Subtarget.isPPC64() || !Subtarget.isSVR4ABI();
   const MCInstrDesc &CopyInst = TII.get(isPPC64 ? PPC::OR8 : PPC::OR);
   // Subroutines to generate .cfi_* directives.
   auto buildDefCFAReg = [&](MachineBasicBlock &MBB,
@@ -1272,212 +1272,221 @@ void PPCFrameLowering::inlineStackProbe(MachineFunction &MF,
           .addReg(SPReg)
           .addReg(NegSizeReg);
   };
-  // Used to probe realignment gap [stackptr - (stackptr % align), stackptr)
-  // when HasBP && isPPC64. In such scenario, normally we have r0, r1, r12, r30
-  // available and r1 is already copied to r30 which is BPReg. So BPReg stores
-  // the value of stackptr.
-  // First we have to probe tail interval whose size is less than probesize,
-  // i.e., [stackptr - (stackptr % align) % probesize, stackptr). At this stage,
-  // ScratchReg stores the value of ((stackptr % align) % probesize). Then we
-  // probe each block sized probesize until stackptr meets
-  // (stackptr - (stackptr % align)). At this stage, ScratchReg is materialized
-  // as negprobesize. At both stages, TempReg stores the value of
-  // (stackptr - (stackptr % align)).
-  auto dynamicProbe = [&](MachineBasicBlock &MBB,
-                          MachineBasicBlock::iterator MBBI, Register ScratchReg,
-                          Register TempReg) {
-    assert(HasBP && isPPC64 && "Probe alignment part not available");
+  // Used to probe stack when realignment is required.
+  // Note that, according to ABI's requirement, *sp must always equals the
+  // value of back-chain pointer, only st(w|d)u(x) can be used to update sp.
+  // Following is pseudo code:
+  // final_sp = (sp & align) + negframesize;
+  // neg_gap = final_sp - sp;
+  // while (neg_gap < negprobesize) {
+  //   stdu fp, negprobesize(sp);
+  //   neg_gap -= negprobesize;
+  // }
+  // stdux fp, sp, neg_gap
+  //
+  // When HasBP & HasRedzone, back-chain pointer is already saved in BPReg
+  // before probe code, we don't need to save it, so we get one additional reg
+  // that can be used to materialize the probeside if needed to use xform.
+  // Otherwise, we can NOT materialize probeside, so we can only use Dform for
+  // now.
+  //
+  // The allocations are:
+  // if (HasBP && HasRedzone) {
+  //   r0: materialize the probesize if needed so that we can use xform.
+  //   r12: `neg_gap`
+  // } else {
+  //   r0: back-chain pointer
+  //   r12: `neg_gap`.
+  // }
+  auto probeRealignedStack = [&](MachineBasicBlock &MBB,
+                                 MachineBasicBlock::iterator MBBI,
+                                 Register ScratchReg, Register TempReg) {
+    assert(HasBP && "The function is supposed to have base pointer when its "
+                    "stack is realigned.");
     assert(isPowerOf2_64(ProbeSize) && "Probe size should be power of 2");
-    // ScratchReg = stackptr % align
-    BuildMI(MBB, MBBI, DL, TII.get(PPC::RLDICL), ScratchReg)
-        .addReg(BPReg)
-        .addImm(0)
-        .addImm(64 - Log2(MaxAlign));
-    // TempReg = stackptr - (stackptr % align)
-    BuildMI(MBB, MBBI, DL, TII.get(PPC::SUBFC8), TempReg)
-        .addReg(ScratchReg)
-        .addReg(BPReg);
-    // ScratchReg = (stackptr % align) % probesize
-    BuildMI(MBB, MBBI, DL, TII.get(PPC::RLDICL), ScratchReg)
-        .addReg(ScratchReg)
-        .addImm(0)
-        .addImm(64 - Log2(ProbeSize));
+
+    // FIXME: We can eliminate this limitation if we get more infomation about
+    // which part of redzone are already used. Used redzone can be treated
+    // probed. But there might be `holes' in redzone probed, this could
+    // complicate the implementation.
+    assert(ProbeSize >= Subtarget.getRedZoneSize() &&
+           "Probe size should be larger or equal to the size of red-zone so "
+           "that red-zone is not clobbered by probing.");
+
+    Register &FinalStackPtr = TempReg;
+    // FIXME: We only support NegProbeSize materializable by DForm currently.
+    // When HasBP && HasRedzone, we can use xform if we have an additional idle
+    // register.
+    NegProbeSize = std::max(NegProbeSize, -((int64_t)1 << 15));
+    assert(isInt<16>(NegProbeSize) &&
+           "NegProbeSize should be materializable by DForm");
     Register CRReg = PPC::CR0;
-    // If (stackptr % align) % probesize == 0, we should not generate probe
-    // code. Layout of output assembly kinda like:
+    // Layout of output assembly kinda like:
     // bb.0:
     //   ...
-    //   cmpldi $scratchreg, 0
-    //   beq bb.2
-    // bb.1: # Probe tail interval
-    //   neg $scratchreg, $scratchreg
-    //   stdux $bpreg, r1, $scratchreg
+    //   sub $scratchreg, $finalsp, r1
+    //   cmpdi $scratchreg, <negprobesize>
+    //   bge bb.2
+    // bb.1:
+    //   stdu <backchain>, <negprobesize>(r1)
+    //   sub $scratchreg, $scratchreg, negprobesize
+    //   cmpdi $scratchreg, <negprobesize>
+    //   blt bb.1
     // bb.2:
-    //   <materialize negprobesize into $scratchreg>
-    //   cmpd r1, $tempreg
-    //   beq bb.4
-    // bb.3: # Loop to probe each block
-    //   stdux $bpreg, r1, $scratchreg
-    //   cmpd r1, $tempreg
-    //   bne bb.3
-    // bb.4:
-    //   ...
+    //   stdux <backchain>, r1, $scratchreg
     MachineFunction::iterator MBBInsertPoint = std::next(MBB.getIterator());
-    MachineBasicBlock *ProbeResidualMBB = MF.CreateMachineBasicBlock(ProbedBB);
-    MF.insert(MBBInsertPoint, ProbeResidualMBB);
-    MachineBasicBlock *ProbeLoopPreHeaderMBB =
-        MF.CreateMachineBasicBlock(ProbedBB);
-    MF.insert(MBBInsertPoint, ProbeLoopPreHeaderMBB);
     MachineBasicBlock *ProbeLoopBodyMBB = MF.CreateMachineBasicBlock(ProbedBB);
     MF.insert(MBBInsertPoint, ProbeLoopBodyMBB);
     MachineBasicBlock *ProbeExitMBB = MF.CreateMachineBasicBlock(ProbedBB);
     MF.insert(MBBInsertPoint, ProbeExitMBB);
-    // bb.4
-    ProbeExitMBB->splice(ProbeExitMBB->end(), &MBB, MBBI, MBB.end());
-    ProbeExitMBB->transferSuccessorsAndUpdatePHIs(&MBB);
+    // bb.2
+    {
+      Register BackChainPointer = HasRedZone ? BPReg : TempReg;
+      allocateAndProbe(*ProbeExitMBB, ProbeExitMBB->end(), 0, ScratchReg, false,
+                       BackChainPointer);
+      if (HasRedZone)
+        // PROBED_STACKALLOC_64 assumes Operand(1) stores the old sp, copy BPReg
+        // to TempReg to satisfy it.
+        BuildMI(*ProbeExitMBB, ProbeExitMBB->end(), DL, CopyInst, TempReg)
+            .addReg(BPReg)
+            .addReg(BPReg);
+      ProbeExitMBB->splice(ProbeExitMBB->end(), &MBB, MBBI, MBB.end());
+      ProbeExitMBB->transferSuccessorsAndUpdatePHIs(&MBB);
+    }
     // bb.0
-    BuildMI(&MBB, DL, TII.get(PPC::CMPDI), CRReg).addReg(ScratchReg).addImm(0);
-    BuildMI(&MBB, DL, TII.get(PPC::BCC))
-        .addImm(PPC::PRED_EQ)
-        .addReg(CRReg)
-        .addMBB(ProbeLoopPreHeaderMBB);
-    MBB.addSuccessor(ProbeResidualMBB);
-    MBB.addSuccessor(ProbeLoopPreHeaderMBB);
+    {
+      BuildMI(&MBB, DL, TII.get(isPPC64 ? PPC::SUBF8 : PPC::SUBF), ScratchReg)
+          .addReg(SPReg)
+          .addReg(FinalStackPtr);
+      if (!HasRedZone)
+        BuildMI(&MBB, DL, CopyInst, TempReg).addReg(SPReg).addReg(SPReg);
+      BuildMI(&MBB, DL, TII.get(isPPC64 ? PPC::CMPDI : PPC::CMPWI), CRReg)
+          .addReg(ScratchReg)
+          .addImm(NegProbeSize);
+      BuildMI(&MBB, DL, TII.get(PPC::BCC))
+          .addImm(PPC::PRED_GE)
+          .addReg(CRReg)
+          .addMBB(ProbeExitMBB);
+      MBB.addSuccessor(ProbeLoopBodyMBB);
+      MBB.addSuccessor(ProbeExitMBB);
+    }
     // bb.1
-    BuildMI(ProbeResidualMBB, DL, TII.get(PPC::NEG8), ScratchReg)
-        .addReg(ScratchReg);
-    allocateAndProbe(*ProbeResidualMBB, ProbeResidualMBB->end(), 0, ScratchReg,
-                     false, BPReg);
-    ProbeResidualMBB->addSuccessor(ProbeLoopPreHeaderMBB);
-    // bb.2
-    MaterializeImm(*ProbeLoopPreHeaderMBB, ProbeLoopPreHeaderMBB->end(),
-                   NegProbeSize, ScratchReg);
-    BuildMI(ProbeLoopPreHeaderMBB, DL, TII.get(PPC::CMPD), CRReg)
-        .addReg(SPReg)
-        .addReg(TempReg);
-    BuildMI(ProbeLoopPreHeaderMBB, DL, TII.get(PPC::BCC))
-        .addImm(PPC::PRED_EQ)
-        .addReg(CRReg)
-        .addMBB(ProbeExitMBB);
-    ProbeLoopPreHeaderMBB->addSuccessor(ProbeLoopBodyMBB);
-    ProbeLoopPreHeaderMBB->addSuccessor(ProbeExitMBB);
-    // bb.3
-    allocateAndProbe(*ProbeLoopBodyMBB, ProbeLoopBodyMBB->end(), 0, ScratchReg,
-                     false, BPReg);
-    BuildMI(ProbeLoopBodyMBB, DL, TII.get(PPC::CMPD), CRReg)
-        .addReg(SPReg)
-        .addReg(TempReg);
-    BuildMI(ProbeLoopBodyMBB, DL, TII.get(PPC::BCC))
-        .addImm(PPC::PRED_NE)
-        .addReg(CRReg)
-        .addMBB(ProbeLoopBodyMBB);
-    ProbeLoopBodyMBB->addSuccessor(ProbeExitMBB);
-    ProbeLoopBodyMBB->addSuccessor(ProbeLoopBodyMBB);
+    {
+      Register BackChainPointer = HasRedZone ? BPReg : TempReg;
+      allocateAndProbe(*ProbeLoopBodyMBB, ProbeLoopBodyMBB->end(), NegProbeSize,
+                       0, true /*UseDForm*/, BackChainPointer);
+      BuildMI(ProbeLoopBodyMBB, DL, TII.get(isPPC64 ? PPC::ADDI8 : PPC::ADDI),
+              ScratchReg)
+          .addReg(ScratchReg)
+          .addImm(-NegProbeSize);
+      BuildMI(ProbeLoopBodyMBB, DL, TII.get(isPPC64 ? PPC::CMPDI : PPC::CMPWI),
+              CRReg)
+          .addReg(ScratchReg)
+          .addImm(NegProbeSize);
+      BuildMI(ProbeLoopBodyMBB, DL, TII.get(PPC::BCC))
+          .addImm(PPC::PRED_LT)
+          .addReg(CRReg)
+          .addMBB(ProbeLoopBodyMBB);
+      ProbeLoopBodyMBB->addSuccessor(ProbeExitMBB);
+      ProbeLoopBodyMBB->addSuccessor(ProbeLoopBodyMBB);
+    }
     // Update liveins.
-    recomputeLiveIns(*ProbeResidualMBB);
-    recomputeLiveIns(*ProbeLoopPreHeaderMBB);
     recomputeLiveIns(*ProbeLoopBodyMBB);
     recomputeLiveIns(*ProbeExitMBB);
     return ProbeExitMBB;
   };
   // For case HasBP && MaxAlign > 1, we have to realign the SP by performing
-  // SP = SP - SP % MaxAlign.
+  // SP = SP - SP % MaxAlign, thus make the probe more like dynamic probe since
+  // the offset subtracted from SP is determined by SP's runtime value.
   if (HasBP && MaxAlign > 1) {
-    // FIXME: Currently only probe the gap [stackptr & alignmask, stackptr) in
-    // 64-bit mode.
-    if (isPPC64) {
-      // Use BPReg to calculate CFA.
-      if (needsCFI)
-        buildDefCFA(*CurrentMBB, {MI}, BPReg, 0);
-      // Since we have SPReg copied to BPReg at the moment, FPReg can be used as
-      // TempReg.
-      Register TempReg = FPReg;
-      CurrentMBB = dynamicProbe(*CurrentMBB, {MI}, ScratchReg, TempReg);
-      // Copy BPReg to FPReg to meet the definition of PROBED_STACKALLOC_64.
-      BuildMI(*CurrentMBB, {MI}, DL, CopyInst, FPReg)
-          .addReg(BPReg)
-          .addReg(BPReg);
-    } else {
-      // Initialize current frame pointer.
-      BuildMI(*CurrentMBB, {MI}, DL, CopyInst, FPReg)
+    // Calculate final stack pointer.
+    if (isPPC64)
+      BuildMI(*CurrentMBB, {MI}, DL, TII.get(PPC::RLDICL), ScratchReg)
           .addReg(SPReg)
-          .addReg(SPReg);
-      // Use FPReg to calculate CFA.
-      if (needsCFI)
-        buildDefCFA(*CurrentMBB, {MI}, FPReg, 0);
+          .addImm(0)
+          .addImm(64 - Log2(MaxAlign));
+    else
       BuildMI(*CurrentMBB, {MI}, DL, TII.get(PPC::RLWINM), ScratchReg)
-          .addReg(FPReg)
+          .addReg(SPReg)
           .addImm(0)
           .addImm(32 - Log2(MaxAlign))
           .addImm(31);
-      BuildMI(*CurrentMBB, {MI}, DL, TII.get(PPC::SUBFC), SPReg)
-          .addReg(ScratchReg)
-          .addReg(SPReg);
-    }
+    BuildMI(*CurrentMBB, {MI}, DL, TII.get(isPPC64 ? PPC::SUBF8 : PPC::SUBF),
+            FPReg)
+        .addReg(ScratchReg)
+        .addReg(SPReg);
+    MaterializeImm(*CurrentMBB, {MI}, NegFrameSize, ScratchReg);
+    BuildMI(*CurrentMBB, {MI}, DL, TII.get(isPPC64 ? PPC::ADD8 : PPC::ADD4),
+            FPReg)
+        .addReg(ScratchReg)
+        .addReg(FPReg);
+    CurrentMBB = probeRealignedStack(*CurrentMBB, {MI}, ScratchReg, FPReg);
+    if (needsCFI)
+      buildDefCFAReg(*CurrentMBB, {MI}, FPReg);
   } else {
     // Initialize current frame pointer.
     BuildMI(*CurrentMBB, {MI}, DL, CopyInst, FPReg).addReg(SPReg).addReg(SPReg);
     // Use FPReg to calculate CFA.
     if (needsCFI)
       buildDefCFA(*CurrentMBB, {MI}, FPReg, 0);
-  }
-  // Probe residual part.
-  if (NegResidualSize) {
-    bool ResidualUseDForm = CanUseDForm(NegResidualSize);
-    if (!ResidualUseDForm)
-      MaterializeImm(*CurrentMBB, {MI}, NegResidualSize, ScratchReg);
-    allocateAndProbe(*CurrentMBB, {MI}, NegResidualSize, ScratchReg,
-                     ResidualUseDForm, FPReg);
-  }
-  bool UseDForm = CanUseDForm(NegProbeSize);
-  // If number of blocks is small, just probe them directly.
-  if (NumBlocks < 3) {
-    if (!UseDForm)
-      MaterializeImm(*CurrentMBB, {MI}, NegProbeSize, ScratchReg);
-    for (int i = 0; i < NumBlocks; ++i)
-      allocateAndProbe(*CurrentMBB, {MI}, NegProbeSize, ScratchReg, UseDForm,
-                       FPReg);
-    if (needsCFI) {
-      // Restore using SPReg to calculate CFA.
-      buildDefCFAReg(*CurrentMBB, {MI}, SPReg);
+    // Probe residual part.
+    if (NegResidualSize) {
+      bool ResidualUseDForm = CanUseDForm(NegResidualSize);
+      if (!ResidualUseDForm)
+        MaterializeImm(*CurrentMBB, {MI}, NegResidualSize, ScratchReg);
+      allocateAndProbe(*CurrentMBB, {MI}, NegResidualSize, ScratchReg,
+                       ResidualUseDForm, FPReg);
     }
-  } else {
-    // Since CTR is a volatile register and current shrinkwrap implementation
-    // won't choose an MBB in a loop as the PrologMBB, it's safe to synthesize a
-    // CTR loop to probe.
-    // Calculate trip count and stores it in CTRReg.
-    MaterializeImm(*CurrentMBB, {MI}, NumBlocks, ScratchReg);
-    BuildMI(*CurrentMBB, {MI}, DL, TII.get(isPPC64 ? PPC::MTCTR8 : PPC::MTCTR))
-        .addReg(ScratchReg, RegState::Kill);
-    if (!UseDForm)
-      MaterializeImm(*CurrentMBB, {MI}, NegProbeSize, ScratchReg);
-    // Create MBBs of the loop.
-    MachineFunction::iterator MBBInsertPoint =
-        std::next(CurrentMBB->getIterator());
-    MachineBasicBlock *LoopMBB = MF.CreateMachineBasicBlock(ProbedBB);
-    MF.insert(MBBInsertPoint, LoopMBB);
-    MachineBasicBlock *ExitMBB = MF.CreateMachineBasicBlock(ProbedBB);
-    MF.insert(MBBInsertPoint, ExitMBB);
-    // Synthesize the loop body.
-    allocateAndProbe(*LoopMBB, LoopMBB->end(), NegProbeSize, ScratchReg,
-                     UseDForm, FPReg);
-    BuildMI(LoopMBB, DL, TII.get(isPPC64 ? PPC::BDNZ8 : PPC::BDNZ))
-        .addMBB(LoopMBB);
-    LoopMBB->addSuccessor(ExitMBB);
-    LoopMBB->addSuccessor(LoopMBB);
-    // Synthesize the exit MBB.
-    ExitMBB->splice(ExitMBB->end(), CurrentMBB,
-                    std::next(MachineBasicBlock::iterator(MI)),
-                    CurrentMBB->end());
-    ExitMBB->transferSuccessorsAndUpdatePHIs(CurrentMBB);
-    CurrentMBB->addSuccessor(LoopMBB);
-    if (needsCFI) {
-      // Restore using SPReg to calculate CFA.
-      buildDefCFAReg(*ExitMBB, ExitMBB->begin(), SPReg);
+    bool UseDForm = CanUseDForm(NegProbeSize);
+    // If number of blocks is small, just probe them directly.
+    if (NumBlocks < 3) {
+      if (!UseDForm)
+        MaterializeImm(*CurrentMBB, {MI}, NegProbeSize, ScratchReg);
+      for (int i = 0; i < NumBlocks; ++i)
+        allocateAndProbe(*CurrentMBB, {MI}, NegProbeSize, ScratchReg, UseDForm,
+                         FPReg);
+      if (needsCFI) {
+        // Restore using SPReg to calculate CFA.
+        buildDefCFAReg(*CurrentMBB, {MI}, SPReg);
+      }
+    } else {
+      // Since CTR is a volatile register and current shrinkwrap implementation
+      // won't choose an MBB in a loop as the PrologMBB, it's safe to synthesize a
+      // CTR loop to probe.
+      // Calculate trip count and stores it in CTRReg.
+      MaterializeImm(*CurrentMBB, {MI}, NumBlocks, ScratchReg);
+      BuildMI(*CurrentMBB, {MI}, DL, TII.get(isPPC64 ? PPC::MTCTR8 : PPC::MTCTR))
+          .addReg(ScratchReg, RegState::Kill);
+      if (!UseDForm)
+        MaterializeImm(*CurrentMBB, {MI}, NegProbeSize, ScratchReg);
+      // Create MBBs of the loop.
+      MachineFunction::iterator MBBInsertPoint =
+          std::next(CurrentMBB->getIterator());
+      MachineBasicBlock *LoopMBB = MF.CreateMachineBasicBlock(ProbedBB);
+      MF.insert(MBBInsertPoint, LoopMBB);
+      MachineBasicBlock *ExitMBB = MF.CreateMachineBasicBlock(ProbedBB);
+      MF.insert(MBBInsertPoint, ExitMBB);
+      // Synthesize the loop body.
+      allocateAndProbe(*LoopMBB, LoopMBB->end(), NegProbeSize, ScratchReg,
+                       UseDForm, FPReg);
+      BuildMI(LoopMBB, DL, TII.get(isPPC64 ? PPC::BDNZ8 : PPC::BDNZ))
+          .addMBB(LoopMBB);
+      LoopMBB->addSuccessor(ExitMBB);
+      LoopMBB->addSuccessor(LoopMBB);
+      // Synthesize the exit MBB.
+      ExitMBB->splice(ExitMBB->end(), CurrentMBB,
+                      std::next(MachineBasicBlock::iterator(MI)),
+                      CurrentMBB->end());
+      ExitMBB->transferSuccessorsAndUpdatePHIs(CurrentMBB);
+      CurrentMBB->addSuccessor(LoopMBB);
+      if (needsCFI) {
+        // Restore using SPReg to calculate CFA.
+        buildDefCFAReg(*ExitMBB, ExitMBB->begin(), SPReg);
+      }
+      // Update liveins.
+      recomputeLiveIns(*LoopMBB);
+      recomputeLiveIns(*ExitMBB);
     }
-    // Update liveins.
-    recomputeLiveIns(*LoopMBB);
-    recomputeLiveIns(*ExitMBB);
   }
   ++NumPrologProbed;
   MI.eraseFromParent();
diff --git a/llvm/test/CodeGen/PowerPC/pr46759.ll b/llvm/test/CodeGen/PowerPC/pr46759.ll
index 33b44b720b6e..d6d02921efca 100644
--- a/llvm/test/CodeGen/PowerPC/pr46759.ll
+++ b/llvm/test/CodeGen/PowerPC/pr46759.ll
@@ -6,32 +6,26 @@
 define void @foo(i32 %vla_size) #0 {
 ; CHECK-LE-LABEL: foo:
 ; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    clrldi r12, r1, 53
 ; CHECK-LE-NEXT:    std r31, -8(r1)
 ; CHECK-LE-NEXT:    std r30, -16(r1)
 ; CHECK-LE-NEXT:    mr r30, r1
-; CHECK-LE-NEXT:    .cfi_def_cfa r30, 0
-; CHECK-LE-NEXT:    clrldi r0, r30, 53
-; CHECK-LE-NEXT:    subc r12, r30, r0
-; CHECK-LE-NEXT:    clrldi r0, r0, 52
-; CHECK-LE-NEXT:    cmpdi r0, 0
-; CHECK-LE-NEXT:    beq cr0, .LBB0_2
-; CHECK-LE-NEXT:  # %bb.1: # %entry
-; CHECK-LE-NEXT:    neg r0, r0
-; CHECK-LE-NEXT:    stdux r30, r1, r0
-; CHECK-LE-NEXT:  .LBB0_2: # %entry
-; CHECK-LE-NEXT:    li r0, -4096
-; CHECK-LE-NEXT:    cmpd r1, r12
-; CHECK-LE-NEXT:    beq cr0, .LBB0_4
-; CHECK-LE-NEXT:  .LBB0_3: # %entry
+; CHECK-LE-NEXT:    sub r0, r1, r12
+; CHECK-LE-NEXT:    li r12, -6144
+; CHECK-LE-NEXT:    add r0, r12, r0
+; CHECK-LE-NEXT:    sub r12, r0, r1
+; CHECK-LE-NEXT:    cmpdi r12, -4096
+; CHECK-LE-NEXT:    bge cr0, .LBB0_2
+; CHECK-LE-NEXT:  .LBB0_1: # %entry
 ; CHECK-LE-NEXT:    #
-; CHECK-LE-NEXT:    stdux r30, r1, r0
-; CHECK-LE-NEXT:    cmpd r1, r12
-; CHECK-LE-NEXT:    bne cr0, .LBB0_3
-; CHECK-LE-NEXT:  .LBB0_4: # %entry
-; CHECK-LE-NEXT:    mr r12, r30
-; CHECK-LE-NEXT:    stdu r12, -2048(r1)
-; CHECK-LE-NEXT:    stdu r12, -4096(r1)
-; CHECK-LE-NEXT:    .cfi_def_cfa_register r1
+; CHECK-LE-NEXT:    stdu r30, -4096(r1)
+; CHECK-LE-NEXT:    addi r12, r12, 4096
+; CHECK-LE-NEXT:    cmpdi r12, -4096
+; CHECK-LE-NEXT:    blt cr0, .LBB0_1
+; CHECK-LE-NEXT:  .LBB0_2: # %entry
+; CHECK-LE-NEXT:    stdux r30, r1, r12
+; CHECK-LE-NEXT:    mr r0, r30
+; CHECK-LE-NEXT:    .cfi_def_cfa_register r0
 ; CHECK-LE-NEXT:    .cfi_def_cfa_register r30
 ; CHECK-LE-NEXT:    .cfi_offset r31, -8
 ; CHECK-LE-NEXT:    .cfi_offset r30, -16
@@ -52,13 +46,13 @@ define void @foo(i32 %vla_size) #0 {
 ; CHECK-LE-NEXT:    add r4, r1, r4
 ; CHECK-LE-NEXT:    stdux r3, r1, r5
 ; CHECK-LE-NEXT:    cmpd r1, r4
-; CHECK-LE-NEXT:    beq cr0, .LBB0_6
-; CHECK-LE-NEXT:  .LBB0_5: # %entry
+; CHECK-LE-NEXT:    beq cr0, .LBB0_4
+; CHECK-LE-NEXT:  .LBB0_3: # %entry
 ; CHECK-LE-NEXT:    #
 ; CHECK-LE-NEXT:    stdu r3, -4096(r1)
 ; CHECK-LE-NEXT:    cmpd r1, r4
-; CHECK-LE-NEXT:    bne cr0, .LBB0_5
-; CHECK-LE-NEXT:  .LBB0_6: # %entry
+; CHECK-LE-NEXT:    bne cr0, .LBB0_3
+; CHECK-LE-NEXT:  .LBB0_4: # %entry
 ; CHECK-LE-NEXT:    addi r3, r1, 2048
 ; CHECK-LE-NEXT:    lbz r3, 0(r3)
 ; CHECK-LE-NEXT:    mr r1, r30
diff --git a/llvm/test/CodeGen/PowerPC/stack-clash-prologue-nounwind.ll b/llvm/test/CodeGen/PowerPC/stack-clash-prologue-nounwind.ll
index e595d8a732a5..4a8de768d82a 100644
--- a/llvm/test/CodeGen/PowerPC/stack-clash-prologue-nounwind.ll
+++ b/llvm/test/CodeGen/PowerPC/stack-clash-prologue-nounwind.ll
@@ -44,12 +44,12 @@ entry:
 define i8 @f1() #0 "stack-probe-size"="0" nounwind {
 ; CHECK-LE-LABEL: f1:
 ; CHECK-LE:       # %bb.0: # %entry
-; CHECK-LE-NEXT:    mr r12, r1
-; CHECK-LE-NEXT:    li r0, 259
-; CHECK-LE-NEXT:    mtctr r0
+; CHECK-LE-NEXT:    mr r0, r1
+; CHECK-LE-NEXT:    li r12, 259
+; CHECK-LE-NEXT:    mtctr r12
 ; CHECK-LE-NEXT:  .LBB1_1: # %entry
 ; CHECK-LE-NEXT:    #
-; CHECK-LE-NEXT:    stdu r12, -16(r1)
+; CHECK-LE-NEXT:    stdu r0, -16(r1)
 ; CHECK-LE-NEXT:    bdnz .LBB1_1
 ; CHECK-LE-NEXT:  # %bb.2: # %entry
 ; CHECK-LE-NEXT:    li r3, 3
@@ -60,12 +60,12 @@ define i8 @f1() #0 "stack-probe-size"="0" nounwind {
 ;
 ; CHECK-BE-LABEL: f1:
 ; CHECK-BE:       # %bb.0: # %entry
-; CHECK-BE-NEXT:    mr r12, r1
-; CHECK-BE-NEXT:    li r0, 260
-; CHECK-BE-NEXT:    mtctr r0
+; CHECK-BE-NEXT:    mr r0, r1
+; CHECK-BE-NEXT:    li r12, 260
+; CHECK-BE-NEXT:    mtctr r12
 ; CHECK-BE-NEXT:  .LBB1_1: # %entry
 ; CHECK-BE-NEXT:    #
-; CHECK-BE-NEXT:    stdu r12, -16(r1)
+; CHECK-BE-NEXT:    stdu r0, -16(r1)
 ; CHECK-BE-NEXT:    bdnz .LBB1_1
 ; CHECK-BE-NEXT:  # %bb.2: # %entry
 ; CHECK-BE-NEXT:    li r3, 3
@@ -76,16 +76,16 @@ define i8 @f1() #0 "stack-probe-size"="0" nounwind {
 ;
 ; CHECK-32-LABEL: f1:
 ; CHECK-32:       # %bb.0: # %entry
-; CHECK-32-NEXT:    mr r12, r1
-; CHECK-32-NEXT:    li r0, 257
-; CHECK-32-NEXT:    mtctr r0
+; CHECK-32-NEXT:    mr r0, r1
+; CHECK-32-NEXT:    li r12, 257
+; CHECK-32-NEXT:    mtctr r12
 ; CHECK-32-NEXT:  .LBB1_1: # %entry
 ; CHECK-32-NEXT:    #
-; CHECK-32-NEXT:    stwu r12, -16(r1)
+; CHECK-32-NEXT:    stwu r0, -16(r1)
 ; CHECK-32-NEXT:    bdnz .LBB1_1
 ; CHECK-32-NEXT:  # %bb.2: # %entry
 ; CHECK-32-NEXT:    li r3, 3
-; CHECK-32-NEXT:    sub r0, r1, r12
+; CHECK-32-NEXT:    sub r0, r1, r0
 ; CHECK-32-NEXT:    stb r3, 16(r1)
 ; CHECK-32-NEXT:    sub r0, r1, r0
 ; CHECK-32-NEXT:    lbz r3, 16(r1)
@@ -102,13 +102,13 @@ entry:
 define i8 @f2() #0 nounwind {
 ; CHECK-LE-LABEL: f2:
 ; CHECK-LE:       # %bb.0: # %entry
-; CHECK-LE-NEXT:    mr r12, r1
-; CHECK-LE-NEXT:    stdu r12, -48(r1)
-; CHECK-LE-NEXT:    li r0, 16
-; CHECK-LE-NEXT:    mtctr r0
+; CHECK-LE-NEXT:    mr r0, r1
+; CHECK-LE-NEXT:    stdu r0, -48(r1)
+; CHECK-LE-NEXT:    li r12, 16
+; CHECK-LE-NEXT:    mtctr r12
 ; CHECK-LE-NEXT:  .LBB2_1: # %entry
 ; CHECK-LE-NEXT:    #
-; CHECK-LE-NEXT:    stdu r12, -4096(r1)
+; CHECK-LE-NEXT:    stdu r0, -4096(r1)
 ; CHECK-LE-NEXT:    bdnz .LBB2_1
 ; CHECK-LE-NEXT:  # %bb.2: # %entry
 ; CHECK-LE-NEXT:    li r3, 3
@@ -119,13 +119,13 @@ define i8 @f2() #0 nounwind {
 ;
 ; CHECK-BE-LABEL: f2:
 ; CHECK-BE:       # %bb.0: # %entry
-; CHECK-BE-NEXT:    mr r12, r1
-; CHECK-BE-NEXT:    stdu r12, -64(r1)
-; CHECK-BE-NEXT:    li r0, 16
-; CHECK-BE-NEXT:    mtctr r0
+; CHECK-BE-NEXT:    mr r0, r1
+; CHECK-BE-NEXT:    stdu r0, -64(r1)
+; CHECK-BE-NEXT:    li r12, 16
+; CHECK-BE-NEXT:    mtctr r12
 ; CHECK-BE-NEXT:  .LBB2_1: # %entry
 ; CHECK-BE-NEXT:    #
-; CHECK-BE-NEXT:    stdu r12, -4096(r1)
+; CHECK-BE-NEXT:    stdu r0, -4096(r1)
 ; CHECK-BE-NEXT:    bdnz .LBB2_1
 ; CHECK-BE-NEXT:  # %bb.2: # %entry
 ; CHECK-BE-NEXT:    li r3, 3
@@ -136,16 +136,16 @@ define i8 @f2() #0 nounwind {
 ;
 ; CHECK-32-LABEL: f2:
 ; CHECK-32:       # %bb.0: # %entry
-; CHECK-32-NEXT:    mr r12, r1
-; CHECK-32-NEXT:    stwu r12, -16(r1)
-; CHECK-32-NEXT:    li r0, 16
-; CHECK-32-NEXT:    mtctr r0
+; CHECK-32-NEXT:    mr r0, r1
+; CHECK-32-NEXT:    stwu r0, -16(r1)
+; CHECK-32-NEXT:    li r12, 16
+; CHECK-32-NEXT:    mtctr r12
 ; CHECK-32-NEXT:  .LBB2_1: # %entry
 ; CHECK-32-NEXT:    #
-; CHECK-32-NEXT:    stwu r12, -4096(r1)
+; CHECK-32-NEXT:    stwu r0, -4096(r1)
 ; CHECK-32-NEXT:    bdnz .LBB2_1
 ; CHECK-32-NEXT:  # %bb.2: # %entry
-; CHECK-32-NEXT:    sub r0, r1, r12
+; CHECK-32-NEXT:    sub r0, r1, r0
 ; CHECK-32-NEXT:    li r3, 3
 ; CHECK-32-NEXT:    sub r0, r1, r0
 ; CHECK-32-NEXT:    stb r3, 16(r1)
@@ -166,10 +166,10 @@ entry:
 define i8 @f3() #0 "stack-probe-size"="32768" nounwind {
 ; CHECK-LE-LABEL: f3:
 ; CHECK-LE:       # %bb.0: # %entry
-; CHECK-LE-NEXT:    mr r12, r1
-; CHECK-LE-NEXT:    stdu r12, -48(r1)
-; CHECK-LE-NEXT:    stdu r12, -32768(r1)
-; CHECK-LE-NEXT:    stdu r12, -32768(r1)
+; CHECK-LE-NEXT:    mr r0, r1
+; CHECK-LE-NEXT:    stdu r0, -48(r1)
+; CHECK-LE-NEXT:    stdu r0, -32768(r1)
+; CHECK-LE-NEXT:    stdu r0, -32768(r1)
 ; CHECK-LE-NEXT:    li r3, 3
 ; CHECK-LE-NEXT:    stb r3, 48(r1)
 ; CHECK-LE-NEXT:    lbz r3, 48(r1)
@@ -178,10 +178,10 @@ define i8 @f3() #0 "stack-probe-size"="32768" nounwind {
 ;
 ; CHECK-BE-LABEL: f3:
 ; CHECK-BE:       # %bb.0: # %entry
-; CHECK-BE-NEXT:    mr r12, r1
-; CHECK-BE-NEXT:    stdu r12, -64(r1)
-; CHECK-BE-NEXT:    stdu r12, -32768(r1)
-; CHECK-BE-NEXT:    stdu r12, -32768(r1)
+; CHECK-BE-NEXT:    mr r0, r1
+; CHECK-BE-NEXT:    stdu r0, -64(r1)
+; CHECK-BE-NEXT:    stdu r0, -32768(r1)
+; CHECK-BE-NEXT:    stdu r0, -32768(r1)
 ; CHECK-BE-NEXT:    li r3, 3
 ; CHECK-BE-NEXT:    stb r3, 64(r1)
 ; CHECK-BE-NEXT:    lbz r3, 64(r1)
@@ -190,11 +190,11 @@ define i8 @f3() #0 "stack-probe-size"="32768" nounwind {
 ;
 ; CHECK-32-LABEL: f3:
 ; CHECK-32:       # %bb.0: # %entry
-; CHECK-32-NEXT:    mr r12, r1
-; CHECK-32-NEXT:    stwu r12, -16(r1)
-; CHECK-32-NEXT:    stwu r12, -32768(r1)
-; CHECK-32-NEXT:    stwu r12, -32768(r1)
-; CHECK-32-NEXT:    sub r0, r1, r12
+; CHECK-32-NEXT:    mr r0, r1
+; CHECK-32-NEXT:    stwu r0, -16(r1)
+; CHECK-32-NEXT:    stwu r0, -32768(r1)
+; CHECK-32-NEXT:    stwu r0, -32768(r1)
+; CHECK-32-NEXT:    sub r0, r1, r0
 ; CHECK-32-NEXT:    li r3, 3
 ; CHECK-32-NEXT:    sub r0, r1, r0
 ; CHECK-32-NEXT:    stb r3, 16(r1)
@@ -261,15 +261,15 @@ entry:
 define i8 @f5() #0 "stack-probe-size"="65536" nounwind {
 ; CHECK-LE-LABEL: f5:
 ; CHECK-LE:       # %bb.0: # %entry
-; CHECK-LE-NEXT:    mr r12, r1
-; CHECK-LE-NEXT:    stdu r12, -48(r1)
-; CHECK-LE-NEXT:    li r0, 16
-; CHECK-LE-NEXT:    mtctr r0
-; CHECK-LE-NEXT:    lis r0, -1
-; CHECK-LE-NEXT:    nop
+; CHECK-LE-NEXT:    mr r0, r1
+; CHECK-LE-NEXT:    stdu r0, -48(r1)
+; CHECK-LE-NEXT:    li r12, 16
+; CHECK-LE-NEXT:    mtctr r12
+; CHECK-LE-NEXT:    lis r12, -1
+; CHECK-LE-NEXT:    ori r12, r12, 0
 ; CHECK-LE-NEXT:  .LBB5_1: # %entry
 ; CHECK-LE-NEXT:    #
-; CHECK-LE-NEXT:    stdux r12, r1, r0
+; CHECK-LE-NEXT:    stdux r0, r1, r12
 ; CHECK-LE-NEXT:    bdnz .LBB5_1
 ; CHECK-LE-NEXT:  # %bb.2: # %entry
 ; CHECK-LE-NEXT:    li r3, 3
@@ -280,15 +280,15 @@ define i8 @f5() #0 "stack-probe-size"="65536" nounwind {
 ;
 ; CHECK-BE-LABEL: f5:
 ; CHECK-BE:       # %bb.0: # %entry
-; CHECK-BE-NEXT:    mr r12, r1
-; CHECK-BE-NEXT:    stdu r12, -64(r1)
-; CHECK-BE-NEXT:    li r0, 16
-; CHECK-BE-NEXT:    mtctr r0
-; CHECK-BE-NEXT:    lis r0, -1
-; CHECK-BE-NEXT:    nop
+; CHECK-BE-NEXT:    mr r0, r1
+; CHECK-BE-NEXT:    stdu r0, -64(r1)
+; CHECK-BE-NEXT:    li r12, 16
+; CHECK-BE-NEXT:    mtctr r12
+; CHECK-BE-NEXT:    lis r12, -1
+; CHECK-BE-NEXT:    ori r12, r12, 0
 ; CHECK-BE-NEXT:  .LBB5_1: # %entry
 ; CHECK-BE-NEXT:    #
-; CHECK-BE-NEXT:    stdux r12, r1, r0
+; CHECK-BE-NEXT:    stdux r0, r1, r12
 ; CHECK-BE-NEXT:    bdnz .LBB5_1
 ; CHECK-BE-NEXT:  # %bb.2: # %entry
 ; CHECK-BE-NEXT:    li r3, 3
@@ -299,18 +299,18 @@ define i8 @f5() #0 "stack-probe-size"="65536" nounwind {
 ;
 ; CHECK-32-LABEL: f5:
 ; CHECK-32:       # %bb.0: # %entry
-; CHECK-32-NEXT:    mr r12, r1
-; CHECK-32-NEXT:    stwu r12, -16(r1)
-; CHECK-32-NEXT:    li r0, 16
-; CHECK-32-NEXT:    mtctr r0
-; CHECK-32-NEXT:    lis r0, -1
-; CHECK-32-NEXT:    nop
+; CHECK-32-NEXT:    mr r0, r1
+; CHECK-32-NEXT:    stwu r0, -16(r1)
+; CHECK-32-NEXT:    li r12, 16
+; CHECK-32-NEXT:    mtctr r12
+; CHECK-32-NEXT:    lis r12, -1
+; CHECK-32-NEXT:    ori r12, r12, 0
 ; CHECK-32-NEXT:  .LBB5_1: # %entry
 ; CHECK-32-NEXT:    #
-; CHECK-32-NEXT:    stwux r12, r1, r0
+; CHECK-32-NEXT:    stwux r0, r1, r12
 ; CHECK-32-NEXT:    bdnz .LBB5_1
 ; CHECK-32-NEXT:  # %bb.2: # %entry
-; CHECK-32-NEXT:    sub r0, r1, r12
+; CHECK-32-NEXT:    sub r0, r1, r0
 ; CHECK-32-NEXT:    li r3, 3
 ; CHECK-32-NEXT:    sub r0, r1, r0
 ; CHECK-32-NEXT:    stb r3, 16(r1)
@@ -331,14 +331,14 @@ entry:
 define i8 @f6() #0 nounwind {
 ; CHECK-LE-LABEL: f6:
 ; CHECK-LE:       # %bb.0: # %entry
-; CHECK-LE-NEXT:    mr r12, r1
-; CHECK-LE-NEXT:    stdu r12, -48(r1)
-; CHECK-LE-NEXT:    lis r0, 4
-; CHECK-LE-NEXT:    nop
-; CHECK-LE-NEXT:    mtctr r0
+; CHECK-LE-NEXT:    mr r0, r1
+; CHECK-LE-NEXT:    stdu r0, -48(r1)
+; CHECK-LE-NEXT:    lis r12, 4
+; CHECK-LE-NEXT:    ori r12, r12, 0
+; CHECK-LE-NEXT:    mtctr r12
 ; CHECK-LE-NEXT:  .LBB6_1: # %entry
 ; CHECK-LE-NEXT:    #
-; CHECK-LE-NEXT:    stdu r12, -4096(r1)
+; CHECK-LE-NEXT:    stdu r0, -4096(r1)
 ; CHECK-LE-NEXT:    bdnz .LBB6_1
 ; CHECK-LE-NEXT:  # %bb.2: # %entry
 ; CHECK-LE-NEXT:    li r3, 3
@@ -349,14 +349,14 @@ define i8 @f6() #0 nounwind {
 ;
 ; CHECK-BE-LABEL: f6:
 ; CHECK-BE:       # %bb.0: # %entry
-; CHECK-BE-NEXT:    mr r12, r1
-; CHECK-BE-NEXT:    stdu r12, -64(r1)
-; CHECK-BE-NEXT:    lis r0, 4
-; CHECK-BE-NEXT:    nop
-; CHECK-BE-NEXT:    mtctr r0
+; CHECK-BE-NEXT:    mr r0, r1
+; CHECK-BE-NEXT:    stdu r0, -64(r1)
+; CHECK-BE-NEXT:    lis r12, 4
+; CHECK-BE-NEXT:    ori r12, r12, 0
+; CHECK-BE-NEXT:    mtctr r12
 ; CHECK-BE-NEXT:  .LBB6_1: # %entry
 ; CHECK-BE-NEXT:    #
-; CHECK-BE-NEXT:    stdu r12, -4096(r1)
+; CHECK-BE-NEXT:    stdu r0, -4096(r1)
 ; CHECK-BE-NEXT:    bdnz .LBB6_1
 ; CHECK-BE-NEXT:  # %bb.2: # %entry
 ; CHECK-BE-NEXT:    li r3, 3
@@ -367,17 +367,17 @@ define i8 @f6() #0 nounwind {
 ;
 ; CHECK-32-LABEL: f6:
 ; CHECK-32:       # %bb.0: # %entry
-; CHECK-32-NEXT:    mr r12, r1
-; CHECK-32-NEXT:    stwu r12, -16(r1)
-; CHECK-32-NEXT:    lis r0, 4
-; CHECK-32-NEXT:    nop
-; CHECK-32-NEXT:    mtctr r0
+; CHECK-32-NEXT:    mr r0, r1
+; CHECK-32-NEXT:    stwu r0, -16(r1)
+; CHECK-32-NEXT:    lis r12, 4
+; CHECK-32-NEXT:    ori r12, r12, 0
+; CHECK-32-NEXT:    mtctr r12
 ; CHECK-32-NEXT:  .LBB6_1: # %entry
 ; CHECK-32-NEXT:    #
-; CHECK-32-NEXT:    stwu r12, -4096(r1)
+; CHECK-32-NEXT:    stwu r0, -4096(r1)
 ; CHECK-32-NEXT:    bdnz .LBB6_1
 ; CHECK-32-NEXT:  # %bb.2: # %entry
-; CHECK-32-NEXT:    sub r0, r1, r12
+; CHECK-32-NEXT:    sub r0, r1, r0
 ; CHECK-32-NEXT:    li r3, 3
 ; CHECK-32-NEXT:    sub r0, r1, r0
 ; CHECK-32-NEXT:    stb r3, 16(r1)
@@ -398,17 +398,17 @@ entry:
 define i8 @f7() #0 "stack-probe-size"="65536" nounwind {
 ; CHECK-LE-LABEL: f7:
 ; CHECK-LE:       # %bb.0: # %entry
-; CHECK-LE-NEXT:    lis r0, -1
-; CHECK-LE-NEXT:    mr r12, r1
-; CHECK-LE-NEXT:    ori r0, r0, 13776
-; CHECK-LE-NEXT:    stdux r12, r1, r0
-; CHECK-LE-NEXT:    li r0, 15258
-; CHECK-LE-NEXT:    mtctr r0
-; CHECK-LE-NEXT:    lis r0, -1
-; CHECK-LE-NEXT:    nop
+; CHECK-LE-NEXT:    lis r12, -1
+; CHECK-LE-NEXT:    mr r0, r1
+; CHECK-LE-NEXT:    ori r12, r12, 13776
+; CHECK-LE-NEXT:    stdux r0, r1, r12
+; CHECK-LE-NEXT:    li r12, 15258
+; CHECK-LE-NEXT:    mtctr r12
+; CHECK-LE-NEXT:    lis r12, -1
+; CHECK-LE-NEXT:    ori r12, r12, 0
 ; CHECK-LE-NEXT:  .LBB7_1: # %entry
 ; CHECK-LE-NEXT:    #
-; CHECK-LE-NEXT:    stdux r12, r1, r0
+; CHECK-LE-NEXT:    stdux r0, r1, r12
 ; CHECK-LE-NEXT:    bdnz .LBB7_1
 ; CHECK-LE-NEXT:  # %bb.2: # %entry
 ; CHECK-LE-NEXT:    li r3, 3
@@ -419,17 +419,17 @@ define i8 @f7() #0 "stack-probe-size"="65536" nounwind {
 ;
 ; CHECK-BE-LABEL: f7:
 ; CHECK-BE:       # %bb.0: # %entry
-; CHECK-BE-NEXT:    lis r0, -1
-; CHECK-BE-NEXT:    mr r12, r1
-; CHECK-BE-NEXT:    ori r0, r0, 13760
-; CHECK-BE-NEXT:    stdux r12, r1, r0
-; CHECK-BE-NEXT:    li r0, 15258
-; CHECK-BE-NEXT:    mtctr r0
-; CHECK-BE-NEXT:    lis r0, -1
-; CHECK-BE-NEXT:    nop
+; CHECK-BE-NEXT:    lis r12, -1
+; CHECK-BE-NEXT:    mr r0, r1
+; CHECK-BE-NEXT:    ori r12, r12, 13760
+; CHECK-BE-NEXT:    stdux r0, r1, r12
+; CHECK-BE-NEXT:    li r12, 15258
+; CHECK-BE-NEXT:    mtctr r12
+; CHECK-BE-NEXT:    lis r12, -1
+; CHECK-BE-NEXT:    ori r12, r12, 0
 ; CHECK-BE-NEXT:  .LBB7_1: # %entry
 ; CHECK-BE-NEXT:    #
-; CHECK-BE-NEXT:    stdux r12, r1, r0
+; CHECK-BE-NEXT:    stdux r0, r1, r12
 ; CHECK-BE-NEXT:    bdnz .LBB7_1
 ; CHECK-BE-NEXT:  # %bb.2: # %entry
 ; CHECK-BE-NEXT:    li r3, 3
@@ -440,20 +440,20 @@ define i8 @f7() #0 "stack-probe-size"="65536" nounwind {
 ;
 ; CHECK-32-LABEL: f7:
 ; CHECK-32:       # %bb.0: # %entry
-; CHECK-32-NEXT:    lis r0, -1
-; CHECK-32-NEXT:    mr r12, r1
-; CHECK-32-NEXT:    ori r0, r0, 13808
-; CHECK-32-NEXT:    stwux r12, r1, r0
-; CHECK-32-NEXT:    li r0, 15258
-; CHECK-32-NEXT:    mtctr r0
-; CHECK-32-NEXT:    lis r0, -1
-; CHECK-32-NEXT:    nop
+; CHECK-32-NEXT:    lis r12, -1
+; CHECK-32-NEXT:    mr r0, r1
+; CHECK-32-NEXT:    ori r12, r12, 13808
+; CHECK-32-NEXT:    stwux r0, r1, r12
+; CHECK-32-NEXT:    li r12, 15258
+; CHECK-32-NEXT:    mtctr r12
+; CHECK-32-NEXT:    lis r12, -1
+; CHECK-32-NEXT:    ori r12, r12, 0
 ; CHECK-32-NEXT:  .LBB7_1: # %entry
 ; CHECK-32-NEXT:    #
-; CHECK-32-NEXT:    stwux r12, r1, r0
+; CHECK-32-NEXT:    stwux r0, r1, r12
 ; CHECK-32-NEXT:    bdnz .LBB7_1
 ; CHECK-32-NEXT:  # %bb.2: # %entry
-; CHECK-32-NEXT:    sub r0, r1, r12
+; CHECK-32-NEXT:    sub r0, r1, r0
 ; CHECK-32-NEXT:    li r3, 3
 ; CHECK-32-NEXT:    sub r0, r1, r0
 ; CHECK-32-NEXT:    stb r3, 9(r1)
diff --git a/llvm/test/CodeGen/PowerPC/stack-clash-prologue.ll b/llvm/test/CodeGen/PowerPC/stack-clash-prologue.ll
index 6443059c9704..7e4556c59737 100644
--- a/llvm/test/CodeGen/PowerPC/stack-clash-prologue.ll
+++ b/llvm/test/CodeGen/PowerPC/stack-clash-prologue.ll
@@ -44,13 +44,13 @@ entry:
 define i8 @f1() #0 "stack-probe-size"="0" {
 ; CHECK-LE-LABEL: f1:
 ; CHECK-LE:       # %bb.0: # %entry
-; CHECK-LE-NEXT:    mr r12, r1
-; CHECK-LE-NEXT:    .cfi_def_cfa r12, 0
-; CHECK-LE-NEXT:    li r0, 259
-; CHECK-LE-NEXT:    mtctr r0
+; CHECK-LE-NEXT:    mr r0, r1
+; CHECK-LE-NEXT:    .cfi_def_cfa r0, 0
+; CHECK-LE-NEXT:    li r12, 259
+; CHECK-LE-NEXT:    mtctr r12
 ; CHECK-LE-NEXT:  .LBB1_1: # %entry
 ; CHECK-LE-NEXT:    #
-; CHECK-LE-NEXT:    stdu r12, -16(r1)
+; CHECK-LE-NEXT:    stdu r0, -16(r1)
 ; CHECK-LE-NEXT:    bdnz .LBB1_1
 ; CHECK-LE-NEXT:  # %bb.2: # %entry
 ; CHECK-LE-NEXT:    .cfi_def_cfa_register r1
@@ -63,13 +63,13 @@ define i8 @f1() #0 "stack-probe-size"="0" {
 ;
 ; CHECK-BE-LABEL: f1:
 ; CHECK-BE:       # %bb.0: # %entry
-; CHECK-BE-NEXT:    mr r12, r1
-; CHECK-BE-NEXT:    .cfi_def_cfa r12, 0
-; CHECK-BE-NEXT:    li r0, 260
-; CHECK-BE-NEXT:    mtctr r0
+; CHECK-BE-NEXT:    mr r0, r1
+; CHECK-BE-NEXT:    .cfi_def_cfa r0, 0
+; CHECK-BE-NEXT:    li r12, 260
+; CHECK-BE-NEXT:    mtctr r12
 ; CHECK-BE-NEXT:  .LBB1_1: # %entry
 ; CHECK-BE-NEXT:    #
-; CHECK-BE-NEXT:    stdu r12, -16(r1)
+; CHECK-BE-NEXT:    stdu r0, -16(r1)
 ; CHECK-BE-NEXT:    bdnz .LBB1_1
 ; CHECK-BE-NEXT:  # %bb.2: # %entry
 ; CHECK-BE-NEXT:    .cfi_def_cfa_register r1
@@ -82,17 +82,17 @@ define i8 @f1() #0 "stack-probe-size"="0" {
 ;
 ; CHECK-32-LABEL: f1:
 ; CHECK-32:       # %bb.0: # %entry
-; CHECK-32-NEXT:    mr r12, r1
-; CHECK-32-NEXT:    .cfi_def_cfa r12, 0
-; CHECK-32-NEXT:    li r0, 257
-; CHECK-32-NEXT:    mtctr r0
+; CHECK-32-NEXT:    mr r0, r1
+; CHECK-32-NEXT:    .cfi_def_cfa r0, 0
+; CHECK-32-NEXT:    li r12, 257
+; CHECK-32-NEXT:    mtctr r12
 ; CHECK-32-NEXT:  .LBB1_1: # %entry
 ; CHECK-32-NEXT:    #
-; CHECK-32-NEXT:    stwu r12, -16(r1)
+; CHECK-32-NEXT:    stwu r0, -16(r1)
 ; CHECK-32-NEXT:    bdnz .LBB1_1
 ; CHECK-32-NEXT:  # %bb.2: # %entry
 ; CHECK-32-NEXT:    .cfi_def_cfa_register r1
-; CHECK-32-NEXT:    sub r0, r1, r12
+; CHECK-32-NEXT:    sub r0, r1, r0
 ; CHECK-32-NEXT:    sub r0, r1, r0
 ; CHECK-32-NEXT:    .cfi_def_cfa_offset 4112
 ; CHECK-32-NEXT:    li r3, 3
@@ -111,14 +111,14 @@ entry:
 define i8 @f2() #0 {
 ; CHECK-LE-LABEL: f2:
 ; CHECK-LE:       # %bb.0: # %entry
-; CHECK-LE-NEXT:    mr r12, r1
-; CHECK-LE-NEXT:    .cfi_def_cfa r12, 0
-; CHECK-LE-NEXT:    stdu r12, -48(r1)
-; CHECK-LE-NEXT:    li r0, 16
-; CHECK-LE-NEXT:    mtctr r0
+; CHECK-LE-NEXT:    mr r0, r1
+; CHECK-LE-NEXT:    .cfi_def_cfa r0, 0
+; CHECK-LE-NEXT:    stdu r0, -48(r1)
+; CHECK-LE-NEXT:    li r12, 16
+; CHECK-LE-NEXT:    mtctr r12
 ; CHECK-LE-NEXT:  .LBB2_1: # %entry
 ; CHECK-LE-NEXT:    #
-; CHECK-LE-NEXT:    stdu r12, -4096(r1)
+; CHECK-LE-NEXT:    stdu r0, -4096(r1)
 ; CHECK-LE-NEXT:    bdnz .LBB2_1
 ; CHECK-LE-NEXT:  # %bb.2: # %entry
 ; CHECK-LE-NEXT:    .cfi_def_cfa_register r1
@@ -131,14 +131,14 @@ define i8 @f2() #0 {
 ;
 ; CHECK-BE-LABEL: f2:
 ; CHECK-BE:       # %bb.0: # %entry
-; CHECK-BE-NEXT:    mr r12, r1
-; CHECK-BE-NEXT:    .cfi_def_cfa r12, 0
-; CHECK-BE-NEXT:    stdu r12, -64(r1)
-; CHECK-BE-NEXT:    li r0, 16
-; CHECK-BE-NEXT:    mtctr r0
+; CHECK-BE-NEXT:    mr r0, r1
+; CHECK-BE-NEXT:    .cfi_def_cfa r0, 0
+; CHECK-BE-NEXT:    stdu r0, -64(r1)
+; CHECK-BE-NEXT:    li r12, 16
+; CHECK-BE-NEXT:    mtctr r12
 ; CHECK-BE-NEXT:  .LBB2_1: # %entry
 ; CHECK-BE-NEXT:    #
-; CHECK-BE-NEXT:    stdu r12, -4096(r1)
+; CHECK-BE-NEXT:    stdu r0, -4096(r1)
 ; CHECK-BE-NEXT:    bdnz .LBB2_1
 ; CHECK-BE-NEXT:  # %bb.2: # %entry
 ; CHECK-BE-NEXT:    .cfi_def_cfa_register r1
@@ -151,18 +151,18 @@ define i8 @f2() #0 {
 ;
 ; CHECK-32-LABEL: f2:
 ; CHECK-32:       # %bb.0: # %entry
-; CHECK-32-NEXT:    mr r12, r1
-; CHECK-32-NEXT:    .cfi_def_cfa r12, 0
-; CHECK-32-NEXT:    stwu r12, -16(r1)
-; CHECK-32-NEXT:    li r0, 16
-; CHECK-32-NEXT:    mtctr r0
+; CHECK-32-NEXT:    mr r0, r1
+; CHECK-32-NEXT:    .cfi_def_cfa r0, 0
+; CHECK-32-NEXT:    stwu r0, -16(r1)
+; CHECK-32-NEXT:    li r12, 16
+; CHECK-32-NEXT:    mtctr r12
 ; CHECK-32-NEXT:  .LBB2_1: # %entry
 ; CHECK-32-NEXT:    #
-; CHECK-32-NEXT:    stwu r12, -4096(r1)
+; CHECK-32-NEXT:    stwu r0, -4096(r1)
 ; CHECK-32-NEXT:    bdnz .LBB2_1
 ; CHECK-32-NEXT:  # %bb.2: # %entry
 ; CHECK-32-NEXT:    .cfi_def_cfa_register r1
-; CHECK-32-NEXT:    sub r0, r1, r12
+; CHECK-32-NEXT:    sub r0, r1, r0
 ; CHECK-32-NEXT:    sub r0, r1, r0
 ; CHECK-32-NEXT:    .cfi_def_cfa_offset 65552
 ; CHECK-32-NEXT:    li r3, 3
@@ -184,11 +184,11 @@ entry:
 define i8 @f3() #0 "stack-probe-size"="32768" {
 ; CHECK-LE-LABEL: f3:
 ; CHECK-LE:       # %bb.0: # %entry
-; CHECK-LE-NEXT:    mr r12, r1
-; CHECK-LE-NEXT:    .cfi_def_cfa r12, 0
-; CHECK-LE-NEXT:    stdu r12, -48(r1)
-; CHECK-LE-NEXT:    stdu r12, -32768(r1)
-; CHECK-LE-NEXT:    stdu r12, -32768(r1)
+; CHECK-LE-NEXT:    mr r0, r1
+; CHECK-LE-NEXT:    .cfi_def_cfa r0, 0
+; CHECK-LE-NEXT:    stdu r0, -48(r1)
+; CHECK-LE-NEXT:    stdu r0, -32768(r1)
+; CHECK-LE-NEXT:    stdu r0, -32768(r1)
 ; CHECK-LE-NEXT:    .cfi_def_cfa_register r1
 ; CHECK-LE-NEXT:    .cfi_def_cfa_offset 65584
 ; CHECK-LE-NEXT:    li r3, 3
@@ -199,11 +199,11 @@ define i8 @f3() #0 "stack-probe-size"="32768" {
 ;
 ; CHECK-BE-LABEL: f3:
 ; CHECK-BE:       # %bb.0: # %entry
-; CHECK-BE-NEXT:    mr r12, r1
-; CHECK-BE-NEXT:    .cfi_def_cfa r12, 0
-; CHECK-BE-NEXT:    stdu r12, -64(r1)
-; CHECK-BE-NEXT:    stdu r12, -32768(r1)
-; CHECK-BE-NEXT:    stdu r12, -32768(r1)
+; CHECK-BE-NEXT:    mr r0, r1
+; CHECK-BE-NEXT:    .cfi_def_cfa r0, 0
+; CHECK-BE-NEXT:    stdu r0, -64(r1)
+; CHECK-BE-NEXT:    stdu r0, -32768(r1)
+; CHECK-BE-NEXT:    stdu r0, -32768(r1)
 ; CHECK-BE-NEXT:    .cfi_def_cfa_register r1
 ; CHECK-BE-NEXT:    .cfi_def_cfa_offset 65600
 ; CHECK-BE-NEXT:    li r3, 3
@@ -214,13 +214,13 @@ define i8 @f3() #0 "stack-probe-size"="32768" {
 ;
 ; CHECK-32-LABEL: f3:
 ; CHECK-32:       # %bb.0: # %entry
-; CHECK-32-NEXT:    mr r12, r1
-; CHECK-32-NEXT:    .cfi_def_cfa r12, 0
-; CHECK-32-NEXT:    stwu r12, -16(r1)
-; CHECK-32-NEXT:    stwu r12, -32768(r1)
-; CHECK-32-NEXT:    stwu r12, -32768(r1)
+; CHECK-32-NEXT:    mr r0, r1
+; CHECK-32-NEXT:    .cfi_def_cfa r0, 0
+; CHECK-32-NEXT:    stwu r0, -16(r1)
+; CHECK-32-NEXT:    stwu r0, -32768(r1)
+; CHECK-32-NEXT:    stwu r0, -32768(r1)
 ; CHECK-32-NEXT:    .cfi_def_cfa_register r1
-; CHECK-32-NEXT:    sub r0, r1, r12
+; CHECK-32-NEXT:    sub r0, r1, r0
 ; CHECK-32-NEXT:    sub r0, r1, r0
 ; CHECK-32-NEXT:    .cfi_def_cfa_offset 65552
 ; CHECK-32-NEXT:    li r3, 3
@@ -291,16 +291,16 @@ entry:
 define i8 @f5() #0 "stack-probe-size"="65536" {
 ; CHECK-LE-LABEL: f5:
 ; CHECK-LE:       # %bb.0: # %entry
-; CHECK-LE-NEXT:    mr r12, r1
-; CHECK-LE-NEXT:    .cfi_def_cfa r12, 0
-; CHECK-LE-NEXT:    stdu r12, -48(r1)
-; CHECK-LE-NEXT:    li r0, 16
-; CHECK-LE-NEXT:    mtctr r0
-; CHECK-LE-NEXT:    lis r0, -1
-; CHECK-LE-NEXT:    nop
+; CHECK-LE-NEXT:    mr r0, r1
+; CHECK-LE-NEXT:    .cfi_def_cfa r0, 0
+; CHECK-LE-NEXT:    stdu r0, -48(r1)
+; CHECK-LE-NEXT:    li r12, 16
+; CHECK-LE-NEXT:    mtctr r12
+; CHECK-LE-NEXT:    lis r12, -1
+; CHECK-LE-NEXT:    ori r12, r12, 0
 ; CHECK-LE-NEXT:  .LBB5_1: # %entry
 ; CHECK-LE-NEXT:    #
-; CHECK-LE-NEXT:    stdux r12, r1, r0
+; CHECK-LE-NEXT:    stdux r0, r1, r12
 ; CHECK-LE-NEXT:    bdnz .LBB5_1
 ; CHECK-LE-NEXT:  # %bb.2: # %entry
 ; CHECK-LE-NEXT:    .cfi_def_cfa_register r1
@@ -313,16 +313,16 @@ define i8 @f5() #0 "stack-probe-size"="65536" {
 ;
 ; CHECK-BE-LABEL: f5:
 ; CHECK-BE:       # %bb.0: # %entry
-; CHECK-BE-NEXT:    mr r12, r1
-; CHECK-BE-NEXT:    .cfi_def_cfa r12, 0
-; CHECK-BE-NEXT:    stdu r12, -64(r1)
-; CHECK-BE-NEXT:    li r0, 16
-; CHECK-BE-NEXT:    mtctr r0
-; CHECK-BE-NEXT:    lis r0, -1
-; CHECK-BE-NEXT:    nop
+; CHECK-BE-NEXT:    mr r0, r1
+; CHECK-BE-NEXT:    .cfi_def_cfa r0, 0
+; CHECK-BE-NEXT:    stdu r0, -64(r1)
+; CHECK-BE-NEXT:    li r12, 16
+; CHECK-BE-NEXT:    mtctr r12
+; CHECK-BE-NEXT:    lis r12, -1
+; CHECK-BE-NEXT:    ori r12, r12, 0
 ; CHECK-BE-NEXT:  .LBB5_1: # %entry
 ; CHECK-BE-NEXT:    #
-; CHECK-BE-NEXT:    stdux r12, r1, r0
+; CHECK-BE-NEXT:    stdux r0, r1, r12
 ; CHECK-BE-NEXT:    bdnz .LBB5_1
 ; CHECK-BE-NEXT:  # %bb.2: # %entry
 ; CHECK-BE-NEXT:    .cfi_def_cfa_register r1
@@ -335,20 +335,20 @@ define i8 @f5() #0 "stack-probe-size"="65536" {
 ;
 ; CHECK-32-LABEL: f5:
 ; CHECK-32:       # %bb.0: # %entry
-; CHECK-32-NEXT:    mr r12, r1
-; CHECK-32-NEXT:    .cfi_def_cfa r12, 0
-; CHECK-32-NEXT:    stwu r12, -16(r1)
-; CHECK-32-NEXT:    li r0, 16
-; CHECK-32-NEXT:    mtctr r0
-; CHECK-32-NEXT:    lis r0, -1
-; CHECK-32-NEXT:    nop
+; CHECK-32-NEXT:    mr r0, r1
+; CHECK-32-NEXT:    .cfi_def_cfa r0, 0
+; CHECK-32-NEXT:    stwu r0, -16(r1)
+; CHECK-32-NEXT:    li r12, 16
+; CHECK-32-NEXT:    mtctr r12
+; CHECK-32-NEXT:    lis r12, -1
+; CHECK-32-NEXT:    ori r12, r12, 0
 ; CHECK-32-NEXT:  .LBB5_1: # %entry
 ; CHECK-32-NEXT:    #
-; CHECK-32-NEXT:    stwux r12, r1, r0
+; CHECK-32-NEXT:    stwux r0, r1, r12
 ; CHECK-32-NEXT:    bdnz .LBB5_1
 ; CHECK-32-NEXT:  # %bb.2: # %entry
 ; CHECK-32-NEXT:    .cfi_def_cfa_register r1
-; CHECK-32-NEXT:    sub r0, r1, r12
+; CHECK-32-NEXT:    sub r0, r1, r0
 ; CHECK-32-NEXT:    sub r0, r1, r0
 ; CHECK-32-NEXT:    .cfi_def_cfa_offset 1048592
 ; CHECK-32-NEXT:    li r3, 3
@@ -370,15 +370,15 @@ entry:
 define i8 @f6() #0 {
 ; CHECK-LE-LABEL: f6:
 ; CHECK-LE:       # %bb.0: # %entry
-; CHECK-LE-NEXT:    mr r12, r1
-; CHECK-LE-NEXT:    .cfi_def_cfa r12, 0
-; CHECK-LE-NEXT:    stdu r12, -48(r1)
-; CHECK-LE-NEXT:    lis r0, 4
-; CHECK-LE-NEXT:    nop
-; CHECK-LE-NEXT:    mtctr r0
+; CHECK-LE-NEXT:    mr r0, r1
+; CHECK-LE-NEXT:    .cfi_def_cfa r0, 0
+; CHECK-LE-NEXT:    stdu r0, -48(r1)
+; CHECK-LE-NEXT:    lis r12, 4
+; CHECK-LE-NEXT:    ori r12, r12, 0
+; CHECK-LE-NEXT:    mtctr r12
 ; CHECK-LE-NEXT:  .LBB6_1: # %entry
 ; CHECK-LE-NEXT:    #
-; CHECK-LE-NEXT:    stdu r12, -4096(r1)
+; CHECK-LE-NEXT:    stdu r0, -4096(r1)
 ; CHECK-LE-NEXT:    bdnz .LBB6_1
 ; CHECK-LE-NEXT:  # %bb.2: # %entry
 ; CHECK-LE-NEXT:    .cfi_def_cfa_register r1
@@ -391,15 +391,15 @@ define i8 @f6() #0 {
 ;
 ; CHECK-BE-LABEL: f6:
 ; CHECK-BE:       # %bb.0: # %entry
-; CHECK-BE-NEXT:    mr r12, r1
-; CHECK-BE-NEXT:    .cfi_def_cfa r12, 0
-; CHECK-BE-NEXT:    stdu r12, -64(r1)
-; CHECK-BE-NEXT:    lis r0, 4
-; CHECK-BE-NEXT:    nop
-; CHECK-BE-NEXT:    mtctr r0
+; CHECK-BE-NEXT:    mr r0, r1
+; CHECK-BE-NEXT:    .cfi_def_cfa r0, 0
+; CHECK-BE-NEXT:    stdu r0, -64(r1)
+; CHECK-BE-NEXT:    lis r12, 4
+; CHECK-BE-NEXT:    ori r12, r12, 0
+; CHECK-BE-NEXT:    mtctr r12
 ; CHECK-BE-NEXT:  .LBB6_1: # %entry
 ; CHECK-BE-NEXT:    #
-; CHECK-BE-NEXT:    stdu r12, -4096(r1)
+; CHECK-BE-NEXT:    stdu r0, -4096(r1)
 ; CHECK-BE-NEXT:    bdnz .LBB6_1
 ; CHECK-BE-NEXT:  # %bb.2: # %entry
 ; CHECK-BE-NEXT:    .cfi_def_cfa_register r1
@@ -412,19 +412,19 @@ define i8 @f6() #0 {
 ;
 ; CHECK-32-LABEL: f6:
 ; CHECK-32:       # %bb.0: # %entry
-; CHECK-32-NEXT:    mr r12, r1
-; CHECK-32-NEXT:    .cfi_def_cfa r12, 0
-; CHECK-32-NEXT:    stwu r12, -16(r1)
-; CHECK-32-NEXT:    lis r0, 4
-; CHECK-32-NEXT:    nop
-; CHECK-32-NEXT:    mtctr r0
+; CHECK-32-NEXT:    mr r0, r1
+; CHECK-32-NEXT:    .cfi_def_cfa r0, 0
+; CHECK-32-NEXT:    stwu r0, -16(r1)
+; CHECK-32-NEXT:    lis r12, 4
+; CHECK-32-NEXT:    ori r12, r12, 0
+; CHECK-32-NEXT:    mtctr r12
 ; CHECK-32-NEXT:  .LBB6_1: # %entry
 ; CHECK-32-NEXT:    #
-; CHECK-32-NEXT:    stwu r12, -4096(r1)
+; CHECK-32-NEXT:    stwu r0, -4096(r1)
 ; CHECK-32-NEXT:    bdnz .LBB6_1
 ; CHECK-32-NEXT:  # %bb.2: # %entry
 ; CHECK-32-NEXT:    .cfi_def_cfa_register r1
-; CHECK-32-NEXT:    sub r0, r1, r12
+; CHECK-32-NEXT:    sub r0, r1, r0
 ; CHECK-32-NEXT:    sub r0, r1, r0
 ; CHECK-32-NEXT:    .cfi_def_cfa_offset 1073741840
 ; CHECK-32-NEXT:    li r3, 3
@@ -446,18 +446,18 @@ entry:
 define i8 @f7() #0 "stack-probe-size"="65536" {
 ; CHECK-LE-LABEL: f7:
 ; CHECK-LE:       # %bb.0: # %entry
-; CHECK-LE-NEXT:    mr r12, r1
-; CHECK-LE-NEXT:    .cfi_def_cfa r12, 0
-; CHECK-LE-NEXT:    lis r0, -1
-; CHECK-LE-NEXT:    ori r0, r0, 13776
-; CHECK-LE-NEXT:    stdux r12, r1, r0
-; CHECK-LE-NEXT:    li r0, 15258
-; CHECK-LE-NEXT:    mtctr r0
-; CHECK-LE-NEXT:    lis r0, -1
-; CHECK-LE-NEXT:    nop
+; CHECK-LE-NEXT:    mr r0, r1
+; CHECK-LE-NEXT:    .cfi_def_cfa r0, 0
+; CHECK-LE-NEXT:    lis r12, -1
+; CHECK-LE-NEXT:    ori r12, r12, 13776
+; CHECK-LE-NEXT:    stdux r0, r1, r12
+; CHECK-LE-NEXT:    li r12, 15258
+; CHECK-LE-NEXT:    mtctr r12
+; CHECK-LE-NEXT:    lis r12, -1
+; CHECK-LE-NEXT:    ori r12, r12, 0
 ; CHECK-LE-NEXT:  .LBB7_1: # %entry
 ; CHECK-LE-NEXT:    #
-; CHECK-LE-NEXT:    stdux r12, r1, r0
+; CHECK-LE-NEXT:    stdux r0, r1, r12
 ; CHECK-LE-NEXT:    bdnz .LBB7_1
 ; CHECK-LE-NEXT:  # %bb.2: # %entry
 ; CHECK-LE-NEXT:    .cfi_def_cfa_register r1
@@ -470,18 +470,18 @@ define i8 @f7() #0 "stack-probe-size"="65536" {
 ;
 ; CHECK-BE-LABEL: f7:
 ; CHECK-BE:       # %bb.0: # %entry
-; CHECK-BE-NEXT:    mr r12, r1
-; CHECK-BE-NEXT:    .cfi_def_cfa r12, 0
-; CHECK-BE-NEXT:    lis r0, -1
-; CHECK-BE-NEXT:    ori r0, r0, 13760
-; CHECK-BE-NEXT:    stdux r12, r1, r0
-; CHECK-BE-NEXT:    li r0, 15258
-; CHECK-BE-NEXT:    mtctr r0
-; CHECK-BE-NEXT:    lis r0, -1
-; CHECK-BE-NEXT:    nop
+; CHECK-BE-NEXT:    mr r0, r1
+; CHECK-BE-NEXT:    .cfi_def_cfa r0, 0
+; CHECK-BE-NEXT:    lis r12, -1
+; CHECK-BE-NEXT:    ori r12, r12, 13760
+; CHECK-BE-NEXT:    stdux r0, r1, r12
+; CHECK-BE-NEXT:    li r12, 15258
+; CHECK-BE-NEXT:    mtctr r12
+; CHECK-BE-NEXT:    lis r12, -1
+; CHECK-BE-NEXT:    ori r12, r12, 0
 ; CHECK-BE-NEXT:  .LBB7_1: # %entry
 ; CHECK-BE-NEXT:    #
-; CHECK-BE-NEXT:    stdux r12, r1, r0
+; CHECK-BE-NEXT:    stdux r0, r1, r12
 ; CHECK-BE-NEXT:    bdnz .LBB7_1
 ; CHECK-BE-NEXT:  # %bb.2: # %entry
 ; CHECK-BE-NEXT:    .cfi_def_cfa_register r1
@@ -494,22 +494,22 @@ define i8 @f7() #0 "stack-probe-size"="65536" {
 ;
 ; CHECK-32-LABEL: f7:
 ; CHECK-32:       # %bb.0: # %entry
-; CHECK-32-NEXT:    mr r12, r1
-; CHECK-32-NEXT:    .cfi_def_cfa r12, 0
-; CHECK-32-NEXT:    lis r0, -1
-; CHECK-32-NEXT:    ori r0, r0, 13808
-; CHECK-32-NEXT:    stwux r12, r1, r0
-; CHECK-32-NEXT:    li r0, 15258
-; CHECK-32-NEXT:    mtctr r0
-; CHECK-32-NEXT:    lis r0, -1
-; CHECK-32-NEXT:    nop
+; CHECK-32-NEXT:    mr r0, r1
+; CHECK-32-NEXT:    .cfi_def_cfa r0, 0
+; CHECK-32-NEXT:    lis r12, -1
+; CHECK-32-NEXT:    ori r12, r12, 13808
+; CHECK-32-NEXT:    stwux r0, r1, r12
+; CHECK-32-NEXT:    li r12, 15258
+; CHECK-32-NEXT:    mtctr r12
+; CHECK-32-NEXT:    lis r12, -1
+; CHECK-32-NEXT:    ori r12, r12, 0
 ; CHECK-32-NEXT:  .LBB7_1: # %entry
 ; CHECK-32-NEXT:    #
-; CHECK-32-NEXT:    stwux r12, r1, r0
+; CHECK-32-NEXT:    stwux r0, r1, r12
 ; CHECK-32-NEXT:    bdnz .LBB7_1
 ; CHECK-32-NEXT:  # %bb.2: # %entry
 ; CHECK-32-NEXT:    .cfi_def_cfa_register r1
-; CHECK-32-NEXT:    sub r0, r1, r12
+; CHECK-32-NEXT:    sub r0, r1, r0
 ; CHECK-32-NEXT:    sub r0, r1, r0
 ; CHECK-32-NEXT:    .cfi_def_cfa_offset 1000000016
 ; CHECK-32-NEXT:    li r3, 3
@@ -599,31 +599,24 @@ define i32 @f8(i64 %i) local_unnamed_addr #0 {
 define i32 @f9(i64 %i) local_unnamed_addr #0 {
 ; CHECK-LE-LABEL: f9:
 ; CHECK-LE:       # %bb.0:
+; CHECK-LE-NEXT:    clrldi r12, r1, 53
 ; CHECK-LE-NEXT:    std r30, -16(r1)
 ; CHECK-LE-NEXT:    mr r30, r1
-; CHECK-LE-NEXT:    .cfi_def_cfa r30, 0
-; CHECK-LE-NEXT:    clrldi r0, r30, 53
-; CHECK-LE-NEXT:    subc r12, r30, r0
-; CHECK-LE-NEXT:    clrldi r0, r0, 52
-; CHECK-LE-NEXT:    cmpdi r0, 0
-; CHECK-LE-NEXT:    beq cr0, .LBB9_2
-; CHECK-LE-NEXT:  # %bb.1:
-; CHECK-LE-NEXT:    neg r0, r0
-; CHECK-LE-NEXT:    stdux r30, r1, r0
+; CHECK-LE-NEXT:    sub r0, r1, r12
+; CHECK-LE-NEXT:    li r12, -10240
+; CHECK-LE-NEXT:    add r0, r12, r0
+; CHECK-LE-NEXT:    sub r12, r0, r1
+; CHECK-LE-NEXT:    cmpdi r12, -4096
+; CHECK-LE-NEXT:    bge cr0, .LBB9_2
+; CHECK-LE-NEXT:  .LBB9_1:
+; CHECK-LE-NEXT:    stdu r30, -4096(r1)
+; CHECK-LE-NEXT:    addi r12, r12, 4096
+; CHECK-LE-NEXT:    cmpdi r12, -4096
+; CHECK-LE-NEXT:    blt cr0, .LBB9_1
 ; CHECK-LE-NEXT:  .LBB9_2:
-; CHECK-LE-NEXT:    li r0, -4096
-; CHECK-LE-NEXT:    cmpd r1, r12
-; CHECK-LE-NEXT:    beq cr0, .LBB9_4
-; CHECK-LE-NEXT:  .LBB9_3:
-; CHECK-LE-NEXT:    stdux r30, r1, r0
-; CHECK-LE-NEXT:    cmpd r1, r12
-; CHECK-LE-NEXT:    bne cr0, .LBB9_3
-; CHECK-LE-NEXT:  .LBB9_4:
-; CHECK-LE-NEXT:    mr r12, r30
-; CHECK-LE-NEXT:    stdu r12, -2048(r1)
-; CHECK-LE-NEXT:    stdu r12, -4096(r1)
-; CHECK-LE-NEXT:    stdu r12, -4096(r1)
-; CHECK-LE-NEXT:    .cfi_def_cfa_register r1
+; CHECK-LE-NEXT:    stdux r30, r1, r12
+; CHECK-LE-NEXT:    mr r0, r30
+; CHECK-LE-NEXT:    .cfi_def_cfa_register r0
 ; CHECK-LE-NEXT:    .cfi_def_cfa_register r30
 ; CHECK-LE-NEXT:    .cfi_offset r30, -16
 ; CHECK-LE-NEXT:    addi r4, r1, 2048
@@ -637,31 +630,24 @@ define i32 @f9(i64 %i) local_unnamed_addr #0 {
 ;
 ; CHECK-BE-LABEL: f9:
 ; CHECK-BE:       # %bb.0:
+; CHECK-BE-NEXT:    clrldi r12, r1, 53
 ; CHECK-BE-NEXT:    std r30, -16(r1)
 ; CHECK-BE-NEXT:    mr r30, r1
-; CHECK-BE-NEXT:    .cfi_def_cfa r30, 0
-; CHECK-BE-NEXT:    clrldi r0, r30, 53
-; CHECK-BE-NEXT:    subc r12, r30, r0
-; CHECK-BE-NEXT:    clrldi r0, r0, 52
-; CHECK-BE-NEXT:    cmpdi r0, 0
-; CHECK-BE-NEXT:    beq cr0, .LBB9_2
-; CHECK-BE-NEXT:  # %bb.1:
-; CHECK-BE-NEXT:    neg r0, r0
-; CHECK-BE-NEXT:    stdux r30, r1, r0
+; CHECK-BE-NEXT:    sub r0, r1, r12
+; CHECK-BE-NEXT:    li r12, -10240
+; CHECK-BE-NEXT:    add r0, r12, r0
+; CHECK-BE-NEXT:    sub r12, r0, r1
+; CHECK-BE-NEXT:    cmpdi r12, -4096
+; CHECK-BE-NEXT:    bge cr0, .LBB9_2
+; CHECK-BE-NEXT:  .LBB9_1:
+; CHECK-BE-NEXT:    stdu r30, -4096(r1)
+; CHECK-BE-NEXT:    addi r12, r12, 4096
+; CHECK-BE-NEXT:    cmpdi r12, -4096
+; CHECK-BE-NEXT:    blt cr0, .LBB9_1
 ; CHECK-BE-NEXT:  .LBB9_2:
-; CHECK-BE-NEXT:    li r0, -4096
-; CHECK-BE-NEXT:    cmpd r1, r12
-; CHECK-BE-NEXT:    beq cr0, .LBB9_4
-; CHECK-BE-NEXT:  .LBB9_3:
-; CHECK-BE-NEXT:    stdux r30, r1, r0
-; CHECK-BE-NEXT:    cmpd r1, r12
-; CHECK-BE-NEXT:    bne cr0, .LBB9_3
-; CHECK-BE-NEXT:  .LBB9_4:
-; CHECK-BE-NEXT:    mr r12, r30
-; CHECK-BE-NEXT:    stdu r12, -2048(r1)
-; CHECK-BE-NEXT:    stdu r12, -4096(r1)
-; CHECK-BE-NEXT:    stdu r12, -4096(r1)
-; CHECK-BE-NEXT:    .cfi_def_cfa_register r1
+; CHECK-BE-NEXT:    stdux r30, r1, r12
+; CHECK-BE-NEXT:    mr r0, r30
+; CHECK-BE-NEXT:    .cfi_def_cfa_register r0
 ; CHECK-BE-NEXT:    .cfi_def_cfa_register r30
 ; CHECK-BE-NEXT:    .cfi_offset r30, -16
 ; CHECK-BE-NEXT:    addi r4, r1, 2048
@@ -675,15 +661,23 @@ define i32 @f9(i64 %i) local_unnamed_addr #0 {
 ;
 ; CHECK-32-LABEL: f9:
 ; CHECK-32:       # %bb.0:
-; CHECK-32-NEXT:    mr r12, r1
-; CHECK-32-NEXT:    .cfi_def_cfa r12, 0
-; CHECK-32-NEXT:    clrlwi r0, r12, 21
-; CHECK-32-NEXT:    subc r1, r1, r0
-; CHECK-32-NEXT:    stwu r12, -2048(r1)
-; CHECK-32-NEXT:    stwu r12, -4096(r1)
-; CHECK-32-NEXT:    stwu r12, -4096(r1)
-; CHECK-32-NEXT:    .cfi_def_cfa_register r1
+; CHECK-32-NEXT:    clrlwi r12, r1, 21
 ; CHECK-32-NEXT:    sub r0, r1, r12
+; CHECK-32-NEXT:    li r12, -10240
+; CHECK-32-NEXT:    add r0, r12, r0
+; CHECK-32-NEXT:    sub r12, r0, r1
+; CHECK-32-NEXT:    mr r0, r1
+; CHECK-32-NEXT:    cmpwi r12, -4096
+; CHECK-32-NEXT:    bge cr0, .LBB9_2
+; CHECK-32-NEXT:  .LBB9_1:
+; CHECK-32-NEXT:    stwu r0, -4096(r1)
+; CHECK-32-NEXT:    addi r12, r12, 4096
+; CHECK-32-NEXT:    cmpwi r12, -4096
+; CHECK-32-NEXT:    blt cr0, .LBB9_1
+; CHECK-32-NEXT:  .LBB9_2:
+; CHECK-32-NEXT:    stwux r0, r1, r12
+; CHECK-32-NEXT:    .cfi_def_cfa_register r0
+; CHECK-32-NEXT:    sub r0, r1, r0
 ; CHECK-32-NEXT:    sub r0, r1, r0
 ; CHECK-32-NEXT:    addic r0, r0, -8
 ; CHECK-32-NEXT:    stwx r30, 0, r0
@@ -712,30 +706,24 @@ define i32 @f9(i64 %i) local_unnamed_addr #0 {
 define i32 @f10(i64 %i) local_unnamed_addr #0 {
 ; CHECK-LE-LABEL: f10:
 ; CHECK-LE:       # %bb.0:
+; CHECK-LE-NEXT:    clrldi r12, r1, 54
 ; CHECK-LE-NEXT:    std r30, -16(r1)
 ; CHECK-LE-NEXT:    mr r30, r1
-; CHECK-LE-NEXT:    .cfi_def_cfa r30, 0
-; CHECK-LE-NEXT:    clrldi r0, r30, 54
-; CHECK-LE-NEXT:    subc r12, r30, r0
-; CHECK-LE-NEXT:    clrldi r0, r0, 52
-; CHECK-LE-NEXT:    cmpdi r0, 0
-; CHECK-LE-NEXT:    beq cr0, .LBB10_2
-; CHECK-LE-NEXT:  # %bb.1:
-; CHECK-LE-NEXT:    neg r0, r0
-; CHECK-LE-NEXT:    stdux r30, r1, r0
+; CHECK-LE-NEXT:    sub r0, r1, r12
+; CHECK-LE-NEXT:    li r12, -5120
+; CHECK-LE-NEXT:    add r0, r12, r0
+; CHECK-LE-NEXT:    sub r12, r0, r1
+; CHECK-LE-NEXT:    cmpdi r12, -4096
+; CHECK-LE-NEXT:    bge cr0, .LBB10_2
+; CHECK-LE-NEXT:  .LBB10_1:
+; CHECK-LE-NEXT:    stdu r30, -4096(r1)
+; CHECK-LE-NEXT:    addi r12, r12, 4096
+; CHECK-LE-NEXT:    cmpdi r12, -4096
+; CHECK-LE-NEXT:    blt cr0, .LBB10_1
 ; CHECK-LE-NEXT:  .LBB10_2:
-; CHECK-LE-NEXT:    li r0, -4096
-; CHECK-LE-NEXT:    cmpd r1, r12
-; CHECK-LE-NEXT:    beq cr0, .LBB10_4
-; CHECK-LE-NEXT:  .LBB10_3:
-; CHECK-LE-NEXT:    stdux r30, r1, r0
-; CHECK-LE-NEXT:    cmpd r1, r12
-; CHECK-LE-NEXT:    bne cr0, .LBB10_3
-; CHECK-LE-NEXT:  .LBB10_4:
-; CHECK-LE-NEXT:    mr r12, r30
-; CHECK-LE-NEXT:    stdu r12, -1024(r1)
-; CHECK-LE-NEXT:    stdu r12, -4096(r1)
-; CHECK-LE-NEXT:    .cfi_def_cfa_register r1
+; CHECK-LE-NEXT:    stdux r30, r1, r12
+; CHECK-LE-NEXT:    mr r0, r30
+; CHECK-LE-NEXT:    .cfi_def_cfa_register r0
 ; CHECK-LE-NEXT:    .cfi_def_cfa_register r30
 ; CHECK-LE-NEXT:    .cfi_offset r30, -16
 ; CHECK-LE-NEXT:    addi r4, r1, 1024
@@ -749,30 +737,24 @@ define i32 @f10(i64 %i) local_unnamed_addr #0 {
 ;
 ; CHECK-BE-LABEL: f10:
 ; CHECK-BE:       # %bb.0:
+; CHECK-BE-NEXT:    clrldi r12, r1, 54
 ; CHECK-BE-NEXT:    std r30, -16(r1)
 ; CHECK-BE-NEXT:    mr r30, r1
-; CHECK-BE-NEXT:    .cfi_def_cfa r30, 0
-; CHECK-BE-NEXT:    clrldi r0, r30, 54
-; CHECK-BE-NEXT:    subc r12, r30, r0
-; CHECK-BE-NEXT:    clrldi r0, r0, 52
-; CHECK-BE-NEXT:    cmpdi r0, 0
-; CHECK-BE-NEXT:    beq cr0, .LBB10_2
-; CHECK-BE-NEXT:  # %bb.1:
-; CHECK-BE-NEXT:    neg r0, r0
-; CHECK-BE-NEXT:    stdux r30, r1, r0
+; CHECK-BE-NEXT:    sub r0, r1, r12
+; CHECK-BE-NEXT:    li r12, -5120
+; CHECK-BE-NEXT:    add r0, r12, r0
+; CHECK-BE-NEXT:    sub r12, r0, r1
+; CHECK-BE-NEXT:    cmpdi r12, -4096
+; CHECK-BE-NEXT:    bge cr0, .LBB10_2
+; CHECK-BE-NEXT:  .LBB10_1:
+; CHECK-BE-NEXT:    stdu r30, -4096(r1)
+; CHECK-BE-NEXT:    addi r12, r12, 4096
+; CHECK-BE-NEXT:    cmpdi r12, -4096
+; CHECK-BE-NEXT:    blt cr0, .LBB10_1
 ; CHECK-BE-NEXT:  .LBB10_2:
-; CHECK-BE-NEXT:    li r0, -4096
-; CHECK-BE-NEXT:    cmpd r1, r12
-; CHECK-BE-NEXT:    beq cr0, .LBB10_4
-; CHECK-BE-NEXT:  .LBB10_3:
-; CHECK-BE-NEXT:    stdux r30, r1, r0
-; CHECK-BE-NEXT:    cmpd r1, r12
-; CHECK-BE-NEXT:    bne cr0, .LBB10_3
-; CHECK-BE-NEXT:  .LBB10_4:
-; CHECK-BE-NEXT:    mr r12, r30
-; CHECK-BE-NEXT:    stdu r12, -1024(r1)
-; CHECK-BE-NEXT:    stdu r12, -4096(r1)
-; CHECK-BE-NEXT:    .cfi_def_cfa_register r1
+; CHECK-BE-NEXT:    stdux r30, r1, r12
+; CHECK-BE-NEXT:    mr r0, r30
+; CHECK-BE-NEXT:    .cfi_def_cfa_register r0
 ; CHECK-BE-NEXT:    .cfi_def_cfa_register r30
 ; CHECK-BE-NEXT:    .cfi_offset r30, -16
 ; CHECK-BE-NEXT:    addi r4, r1, 1024
@@ -786,14 +768,23 @@ define i32 @f10(i64 %i) local_unnamed_addr #0 {
 ;
 ; CHECK-32-LABEL: f10:
 ; CHECK-32:       # %bb.0:
-; CHECK-32-NEXT:    mr r12, r1
-; CHECK-32-NEXT:    .cfi_def_cfa r12, 0
-; CHECK-32-NEXT:    clrlwi r0, r12, 22
-; CHECK-32-NEXT:    subc r1, r1, r0
-; CHECK-32-NEXT:    stwu r12, -1024(r1)
-; CHECK-32-NEXT:    stwu r12, -4096(r1)
-; CHECK-32-NEXT:    .cfi_def_cfa_register r1
+; CHECK-32-NEXT:    clrlwi r12, r1, 22
 ; CHECK-32-NEXT:    sub r0, r1, r12
+; CHECK-32-NEXT:    li r12, -5120
+; CHECK-32-NEXT:    add r0, r12, r0
+; CHECK-32-NEXT:    sub r12, r0, r1
+; CHECK-32-NEXT:    mr r0, r1
+; CHECK-32-NEXT:    cmpwi r12, -4096
+; CHECK-32-NEXT:    bge cr0, .LBB10_2
+; CHECK-32-NEXT:  .LBB10_1:
+; CHECK-32-NEXT:    stwu r0, -4096(r1)
+; CHECK-32-NEXT:    addi r12, r12, 4096
+; CHECK-32-NEXT:    cmpwi r12, -4096
+; CHECK-32-NEXT:    blt cr0, .LBB10_1
+; CHECK-32-NEXT:  .LBB10_2:
+; CHECK-32-NEXT:    stwux r0, r1, r12
+; CHECK-32-NEXT:    .cfi_def_cfa_register r0
+; CHECK-32-NEXT:    sub r0, r1, r0
 ; CHECK-32-NEXT:    sub r0, r1, r0
 ; CHECK-32-NEXT:    addic r0, r0, -8
 ; CHECK-32-NEXT:    stwx r30, 0, r0
@@ -821,35 +812,26 @@ define i32 @f10(i64 %i) local_unnamed_addr #0 {
 define void @f11(i32 %vla_size, i64 %i) #0 {
 ; CHECK-LE-LABEL: f11:
 ; CHECK-LE:       # %bb.0:
+; CHECK-LE-NEXT:    clrldi r12, r1, 49
 ; CHECK-LE-NEXT:    std r31, -8(r1)
 ; CHECK-LE-NEXT:    std r30, -16(r1)
 ; CHECK-LE-NEXT:    mr r30, r1
-; CHECK-LE-NEXT:    .cfi_def_cfa r30, 0
-; CHECK-LE-NEXT:    clrldi r0, r30, 49
-; CHECK-LE-NEXT:    subc r12, r30, r0
-; CHECK-LE-NEXT:    clrldi r0, r0, 52
-; CHECK-LE-NEXT:    cmpdi r0, 0
-; CHECK-LE-NEXT:    beq cr0, .LBB11_2
-; CHECK-LE-NEXT:  # %bb.1:
-; CHECK-LE-NEXT:    neg r0, r0
-; CHECK-LE-NEXT:    stdux r30, r1, r0
+; CHECK-LE-NEXT:    sub r0, r1, r12
+; CHECK-LE-NEXT:    lis r12, -2
+; CHECK-LE-NEXT:    ori r12, r12, 32768
+; CHECK-LE-NEXT:    add r0, r12, r0
+; CHECK-LE-NEXT:    sub r12, r0, r1
+; CHECK-LE-NEXT:    cmpdi r12, -4096
+; CHECK-LE-NEXT:    bge cr0, .LBB11_2
+; CHECK-LE-NEXT:  .LBB11_1:
+; CHECK-LE-NEXT:    stdu r30, -4096(r1)
+; CHECK-LE-NEXT:    addi r12, r12, 4096
+; CHECK-LE-NEXT:    cmpdi r12, -4096
+; CHECK-LE-NEXT:    blt cr0, .LBB11_1
 ; CHECK-LE-NEXT:  .LBB11_2:
-; CHECK-LE-NEXT:    li r0, -4096
-; CHECK-LE-NEXT:    cmpd r1, r12
-; CHECK-LE-NEXT:    beq cr0, .LBB11_4
-; CHECK-LE-NEXT:  .LBB11_3:
-; CHECK-LE-NEXT:    stdux r30, r1, r0
-; CHECK-LE-NEXT:    cmpd r1, r12
-; CHECK-LE-NEXT:    bne cr0, .LBB11_3
-; CHECK-LE-NEXT:  .LBB11_4:
-; CHECK-LE-NEXT:    mr r12, r30
-; CHECK-LE-NEXT:    li r0, 24
-; CHECK-LE-NEXT:    mtctr r0
-; CHECK-LE-NEXT:  .LBB11_5:
-; CHECK-LE-NEXT:    stdu r12, -4096(r1)
-; CHECK-LE-NEXT:    bdnz .LBB11_5
-; CHECK-LE-NEXT:  # %bb.6:
-; CHECK-LE-NEXT:    .cfi_def_cfa_register r1
+; CHECK-LE-NEXT:    stdux r30, r1, r12
+; CHECK-LE-NEXT:    mr r0, r30
+; CHECK-LE-NEXT:    .cfi_def_cfa_register r0
 ; CHECK-LE-NEXT:    .cfi_def_cfa_register r30
 ; CHECK-LE-NEXT:    .cfi_offset r31, -8
 ; CHECK-LE-NEXT:    .cfi_offset r30, -16
@@ -876,12 +858,12 @@ define void @f11(i32 %vla_size, i64 %i) #0 {
 ; CHECK-LE-NEXT:    add r4, r1, r7
 ; CHECK-LE-NEXT:    stdux r3, r1, r5
 ; CHECK-LE-NEXT:    cmpd r1, r4
-; CHECK-LE-NEXT:    beq cr0, .LBB11_8
-; CHECK-LE-NEXT:  .LBB11_7:
+; CHECK-LE-NEXT:    beq cr0, .LBB11_4
+; CHECK-LE-NEXT:  .LBB11_3:
 ; CHECK-LE-NEXT:    stdu r3, -4096(r1)
 ; CHECK-LE-NEXT:    cmpd r1, r4
-; CHECK-LE-NEXT:    bne cr0, .LBB11_7
-; CHECK-LE-NEXT:  .LBB11_8:
+; CHECK-LE-NEXT:    bne cr0, .LBB11_3
+; CHECK-LE-NEXT:  .LBB11_4:
 ; CHECK-LE-NEXT:    addi r3, r1, -32768
 ; CHECK-LE-NEXT:    lbz r3, 0(r3)
 ; CHECK-LE-NEXT:    mr r1, r30
@@ -891,35 +873,26 @@ define void @f11(i32 %vla_size, i64 %i) #0 {
 ;
 ; CHECK-BE-LABEL: f11:
 ; CHECK-BE:       # %bb.0:
+; CHECK-BE-NEXT:    clrldi r12, r1, 49
 ; CHECK-BE-NEXT:    std r31, -8(r1)
 ; CHECK-BE-NEXT:    std r30, -16(r1)
 ; CHECK-BE-NEXT:    mr r30, r1
-; CHECK-BE-NEXT:    .cfi_def_cfa r30, 0
-; CHECK-BE-NEXT:    clrldi r0, r30, 49
-; CHECK-BE-NEXT:    subc r12, r30, r0
-; CHECK-BE-NEXT:    clrldi r0, r0, 52
-; CHECK-BE-NEXT:    cmpdi r0, 0
-; CHECK-BE-NEXT:    beq cr0, .LBB11_2
-; CHECK-BE-NEXT:  # %bb.1:
-; CHECK-BE-NEXT:    neg r0, r0
-; CHECK-BE-NEXT:    stdux r30, r1, r0
+; CHECK-BE-NEXT:    sub r0, r1, r12
+; CHECK-BE-NEXT:    lis r12, -2
+; CHECK-BE-NEXT:    ori r12, r12, 32768
+; CHECK-BE-NEXT:    add r0, r12, r0
+; CHECK-BE-NEXT:    sub r12, r0, r1
+; CHECK-BE-NEXT:    cmpdi r12, -4096
+; CHECK-BE-NEXT:    bge cr0, .LBB11_2
+; CHECK-BE-NEXT:  .LBB11_1:
+; CHECK-BE-NEXT:    stdu r30, -4096(r1)
+; CHECK-BE-NEXT:    addi r12, r12, 4096
+; CHECK-BE-NEXT:    cmpdi r12, -4096
+; CHECK-BE-NEXT:    blt cr0, .LBB11_1
 ; CHECK-BE-NEXT:  .LBB11_2:
-; CHECK-BE-NEXT:    li r0, -4096
-; CHECK-BE-NEXT:    cmpd r1, r12
-; CHECK-BE-NEXT:    beq cr0, .LBB11_4
-; CHECK-BE-NEXT:  .LBB11_3:
-; CHECK-BE-NEXT:    stdux r30, r1, r0
-; CHECK-BE-NEXT:    cmpd r1, r12
-; CHECK-BE-NEXT:    bne cr0, .LBB11_3
-; CHECK-BE-NEXT:  .LBB11_4:
-; CHECK-BE-NEXT:    mr r12, r30
-; CHECK-BE-NEXT:    li r0, 24
-; CHECK-BE-NEXT:    mtctr r0
-; CHECK-BE-NEXT:  .LBB11_5:
-; CHECK-BE-NEXT:    stdu r12, -4096(r1)
-; CHECK-BE-NEXT:    bdnz .LBB11_5
-; CHECK-BE-NEXT:  # %bb.6:
-; CHECK-BE-NEXT:    .cfi_def_cfa_register r1
+; CHECK-BE-NEXT:    stdux r30, r1, r12
+; CHECK-BE-NEXT:    mr r0, r30
+; CHECK-BE-NEXT:    .cfi_def_cfa_register r0
 ; CHECK-BE-NEXT:    .cfi_def_cfa_register r30
 ; CHECK-BE-NEXT:    .cfi_offset r31, -8
 ; CHECK-BE-NEXT:    .cfi_offset r30, -16
@@ -946,12 +919,12 @@ define void @f11(i32 %vla_size, i64 %i) #0 {
 ; CHECK-BE-NEXT:    add r4, r1, r7
 ; CHECK-BE-NEXT:    stdux r3, r1, r5
 ; CHECK-BE-NEXT:    cmpd r1, r4
-; CHECK-BE-NEXT:    beq cr0, .LBB11_8
-; CHECK-BE-NEXT:  .LBB11_7:
+; CHECK-BE-NEXT:    beq cr0, .LBB11_4
+; CHECK-BE-NEXT:  .LBB11_3:
 ; CHECK-BE-NEXT:    stdu r3, -4096(r1)
 ; CHECK-BE-NEXT:    cmpd r1, r4
-; CHECK-BE-NEXT:    bne cr0, .LBB11_7
-; CHECK-BE-NEXT:  .LBB11_8:
+; CHECK-BE-NEXT:    bne cr0, .LBB11_3
+; CHECK-BE-NEXT:  .LBB11_4:
 ; CHECK-BE-NEXT:    addi r3, r1, -32768
 ; CHECK-BE-NEXT:    lbz r3, 0(r3)
 ; CHECK-BE-NEXT:    mr r1, r30
@@ -961,18 +934,24 @@ define void @f11(i32 %vla_size, i64 %i) #0 {
 ;
 ; CHECK-32-LABEL: f11:
 ; CHECK-32:       # %bb.0:
-; CHECK-32-NEXT:    mr r12, r1
-; CHECK-32-NEXT:    .cfi_def_cfa r12, 0
-; CHECK-32-NEXT:    clrlwi r0, r12, 17
-; CHECK-32-NEXT:    subc r1, r1, r0
-; CHECK-32-NEXT:    li r0, 24
-; CHECK-32-NEXT:    mtctr r0
-; CHECK-32-NEXT:  .LBB11_1:
-; CHECK-32-NEXT:    stwu r12, -4096(r1)
-; CHECK-32-NEXT:    bdnz .LBB11_1
-; CHECK-32-NEXT:  # %bb.2:
-; CHECK-32-NEXT:    .cfi_def_cfa_register r1
+; CHECK-32-NEXT:    clrlwi r12, r1, 17
 ; CHECK-32-NEXT:    sub r0, r1, r12
+; CHECK-32-NEXT:    lis r12, -2
+; CHECK-32-NEXT:    ori r12, r12, 32768
+; CHECK-32-NEXT:    add r0, r12, r0
+; CHECK-32-NEXT:    sub r12, r0, r1
+; CHECK-32-NEXT:    mr r0, r1
+; CHECK-32-NEXT:    cmpwi r12, -4096
+; CHECK-32-NEXT:    bge cr0, .LBB11_2
+; CHECK-32-NEXT:  .LBB11_1:
+; CHECK-32-NEXT:    stwu r0, -4096(r1)
+; CHECK-32-NEXT:    addi r12, r12, 4096
+; CHECK-32-NEXT:    cmpwi r12, -4096
+; CHECK-32-NEXT:    blt cr0, .LBB11_1
+; CHECK-32-NEXT:  .LBB11_2:
+; CHECK-32-NEXT:    stwux r0, r1, r12
+; CHECK-32-NEXT:    .cfi_def_cfa_register r0
+; CHECK-32-NEXT:    sub r0, r1, r0
 ; CHECK-32-NEXT:    sub r0, r1, r0
 ; CHECK-32-NEXT:    addic r0, r0, -4
 ; CHECK-32-NEXT:    stwx r31, 0, r0

From 0e164144c7081fc0a96a6bc20f1f25b3f9e7f517 Mon Sep 17 00:00:00 2001
From: Joseph Huber <jhuber6@vols.utk.edu>
Date: Mon, 7 Jun 2021 08:08:29 -0400
Subject: [PATCH 281/318] [OpenMP] Fix typo in libomptarge for the wrong
 environment variable

Summary:
There was a typo in libomptarget that told users to use LIBOMPTARGET_DEBUG
instead of LIBOMPTARGET_INFO.

(cherry picked from commit 0af4e74aef2eaddc17e1e92eb6d1102cdb5f8ff4)
---
 openmp/libomptarget/src/interface.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/openmp/libomptarget/src/interface.cpp b/openmp/libomptarget/src/interface.cpp
index b97676a6981b..fba97380a4b2 100644
--- a/openmp/libomptarget/src/interface.cpp
+++ b/openmp/libomptarget/src/interface.cpp
@@ -61,7 +61,7 @@ static void HandleTargetOutcome(bool success, ident_t *loc = nullptr) {
         for (auto &Device : PM->Devices)
           dumpTargetPointerMappings(loc, Device);
       else
-        FAILURE_MESSAGE("Run with LIBOMPTARGET_DEBUG=%d to dump host-target "
+        FAILURE_MESSAGE("Run with LIBOMPTARGET_INFO=%d to dump host-target "
                         "pointer mappings.\n",
                         OMP_INFOTYPE_DUMP_TABLE);
 

From c7d7ace46258b04aa4b5df08952bfebc6fc4ce94 Mon Sep 17 00:00:00 2001
From: Tim Wojtulewicz <timwoj@gmail.com>
Date: Fri, 5 Mar 2021 21:30:47 +0100
Subject: [PATCH 282/318] [clang-format] Rework Whitesmiths mode to use
 line-level values in UnwrappedLineParser

This commit removes the old way of handling Whitesmiths mode in favor of just setting the
levels during parsing and letting the formatter handle it from there. It requires a bit of
special-casing during the parsing, but ends up a bit cleaner than before. It also removes
some of switch/case unit tests that don't really make much sense when dealing with
Whitesmiths.

Differential Revision: https://reviews.llvm.org/D94500

(cherry picked from commit f7f9f94b2e2b4c714bac9036f6b73a3df42daaff)
---
 clang/docs/ReleaseNotes.rst                 |   3 +
 clang/lib/Format/UnwrappedLineFormatter.cpp |   7 --
 clang/lib/Format/UnwrappedLineParser.cpp    |  89 ++++++++++++-----
 clang/lib/Format/UnwrappedLineParser.h      |  12 ++-
 clang/unittests/Format/FormatTest.cpp       | 100 ++++++++++++++++++--
 5 files changed, 170 insertions(+), 41 deletions(-)

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 4cc1b0b9d2cf..460a62734e90 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -389,6 +389,9 @@ clang-format
   ``AlignConsecutiveDeclarations`` and ``AlignConsecutiveMacros`` have been modified to allow
   alignment across empty lines and/or comments.
 
+- Support for Whitesmiths has been improved, with fixes for ``namespace`` blocks
+  and ``case`` blocks and labels.
+
 libclang
 --------
 
diff --git a/clang/lib/Format/UnwrappedLineFormatter.cpp b/clang/lib/Format/UnwrappedLineFormatter.cpp
index 5dd0ccdfa6fd..7d197310e65b 100644
--- a/clang/lib/Format/UnwrappedLineFormatter.cpp
+++ b/clang/lib/Format/UnwrappedLineFormatter.cpp
@@ -1281,13 +1281,6 @@ void UnwrappedLineFormatter::formatFirstToken(
   if (Newlines)
     Indent = NewlineIndent;
 
-  // If in Whitemsmiths mode, indent start and end of blocks
-  if (Style.BreakBeforeBraces == FormatStyle::BS_Whitesmiths) {
-    if (RootToken.isOneOf(tok::l_brace, tok::r_brace, tok::kw_case,
-                          tok::kw_default))
-      Indent += Style.IndentWidth;
-  }
-
   // Preprocessor directives get indented before the hash only if specified
   if (Style.IndentPPDirectives != FormatStyle::PPDIS_BeforeHash &&
       (Line.Type == LT_PreprocessorDirective ||
diff --git a/clang/lib/Format/UnwrappedLineParser.cpp b/clang/lib/Format/UnwrappedLineParser.cpp
index f689a6361a3a..bec18bd5d8df 100644
--- a/clang/lib/Format/UnwrappedLineParser.cpp
+++ b/clang/lib/Format/UnwrappedLineParser.cpp
@@ -579,17 +579,23 @@ size_t UnwrappedLineParser::computePPHash() const {
   return h;
 }
 
-void UnwrappedLineParser::parseBlock(bool MustBeDeclaration, bool AddLevel,
-                                     bool MunchSemi) {
+void UnwrappedLineParser::parseBlock(bool MustBeDeclaration, unsigned AddLevels,
+                                     bool MunchSemi,
+                                     bool UnindentWhitesmithsBraces) {
   assert(FormatTok->isOneOf(tok::l_brace, TT_MacroBlockBegin) &&
          "'{' or macro block token expected");
   const bool MacroBlock = FormatTok->is(TT_MacroBlockBegin);
   FormatTok->setBlockKind(BK_Block);
 
+  // For Whitesmiths mode, jump to the next level prior to skipping over the
+  // braces.
+  if (AddLevels > 0 && Style.BreakBeforeBraces == FormatStyle::BS_Whitesmiths)
+    ++Line->Level;
+
   size_t PPStartHash = computePPHash();
 
   unsigned InitialLevel = Line->Level;
-  nextToken(/*LevelDifference=*/AddLevel ? 1 : 0);
+  nextToken(/*LevelDifference=*/AddLevels);
 
   if (MacroBlock && FormatTok->is(tok::l_paren))
     parseParens();
@@ -602,10 +608,16 @@ void UnwrappedLineParser::parseBlock(bool MustBeDeclaration, bool AddLevel,
           ? (UnwrappedLine::kInvalidIndex)
           : (CurrentLines->size() - 1 - NbPreprocessorDirectives);
 
+  // Whitesmiths is weird here. The brace needs to be indented for the namespace
+  // block, but the block itself may not be indented depending on the style
+  // settings. This allows the format to back up one level in those cases.
+  if (UnindentWhitesmithsBraces)
+    --Line->Level;
+
   ScopedDeclarationState DeclarationState(*Line, DeclarationScopeStack,
                                           MustBeDeclaration);
-  if (AddLevel)
-    ++Line->Level;
+  if (AddLevels > 0u && Style.BreakBeforeBraces != FormatStyle::BS_Whitesmiths)
+    Line->Level += AddLevels;
   parseLevel(/*HasOpeningBrace=*/true);
 
   if (eof())
@@ -621,7 +633,7 @@ void UnwrappedLineParser::parseBlock(bool MustBeDeclaration, bool AddLevel,
   size_t PPEndHash = computePPHash();
 
   // Munch the closing brace.
-  nextToken(/*LevelDifference=*/AddLevel ? -1 : 0);
+  nextToken(/*LevelDifference=*/-AddLevels);
 
   if (MacroBlock && FormatTok->is(tok::l_paren))
     parseParens();
@@ -637,6 +649,7 @@ void UnwrappedLineParser::parseBlock(bool MustBeDeclaration, bool AddLevel,
     nextToken();
 
   Line->Level = InitialLevel;
+  FormatTok->setBlockKind(BK_Block);
 
   if (PPStartHash == PPEndHash) {
     Line->MatchingOpeningBlockLineIndex = OpeningLineIndex;
@@ -2128,15 +2141,34 @@ void UnwrappedLineParser::parseNamespace() {
     if (ShouldBreakBeforeBrace(Style, InitialToken))
       addUnwrappedLine();
 
-    bool AddLevel = Style.NamespaceIndentation == FormatStyle::NI_All ||
-                    (Style.NamespaceIndentation == FormatStyle::NI_Inner &&
-                     DeclarationScopeStack.size() > 1);
-    parseBlock(/*MustBeDeclaration=*/true, AddLevel);
+    unsigned AddLevels =
+        Style.NamespaceIndentation == FormatStyle::NI_All ||
+                (Style.NamespaceIndentation == FormatStyle::NI_Inner &&
+                 DeclarationScopeStack.size() > 1)
+            ? 1u
+            : 0u;
+    bool ManageWhitesmithsBraces =
+        AddLevels == 0u &&
+        Style.BreakBeforeBraces == FormatStyle::BS_Whitesmiths;
+
+    // If we're in Whitesmiths mode, indent the brace if we're not indenting
+    // the whole block.
+    if (ManageWhitesmithsBraces)
+      ++Line->Level;
+
+    parseBlock(/*MustBeDeclaration=*/true, AddLevels,
+               /*MunchSemi=*/true,
+               /*UnindentWhitesmithsBraces=*/ManageWhitesmithsBraces);
+
     // Munch the semicolon after a namespace. This is more common than one would
     // think. Putting the semicolon into its own line is very ugly.
     if (FormatTok->Tok.is(tok::semi))
       nextToken();
-    addUnwrappedLine();
+
+    addUnwrappedLine(AddLevels > 0 ? LineLevel::Remove : LineLevel::Keep);
+
+    if (ManageWhitesmithsBraces)
+      --Line->Level;
   }
   // FIXME: Add error handling.
 }
@@ -2222,6 +2254,11 @@ void UnwrappedLineParser::parseDoWhile() {
     return;
   }
 
+  // If in Whitesmiths mode, the line with the while() needs to be indented
+  // to the same level as the block.
+  if (Style.BreakBeforeBraces == FormatStyle::BS_Whitesmiths)
+    ++Line->Level;
+
   nextToken();
   parseStructuralElement();
 }
@@ -2234,25 +2271,19 @@ void UnwrappedLineParser::parseLabel(bool LeftAlignLabel) {
   if (LeftAlignLabel)
     Line->Level = 0;
 
-  bool RemoveWhitesmithsCaseIndent =
-      (!Style.IndentCaseBlocks &&
-       Style.BreakBeforeBraces == FormatStyle::BS_Whitesmiths);
-
-  if (RemoveWhitesmithsCaseIndent)
-    --Line->Level;
-
   if (!Style.IndentCaseBlocks && CommentsBeforeNextToken.empty() &&
       FormatTok->Tok.is(tok::l_brace)) {
 
-    CompoundStatementIndenter Indenter(
-        this, Line->Level, Style.BraceWrapping.AfterCaseLabel,
-        Style.BraceWrapping.IndentBraces || RemoveWhitesmithsCaseIndent);
+    CompoundStatementIndenter Indenter(this, Line->Level,
+                                       Style.BraceWrapping.AfterCaseLabel,
+                                       Style.BraceWrapping.IndentBraces);
     parseBlock(/*MustBeDeclaration=*/false);
     if (FormatTok->Tok.is(tok::kw_break)) {
       if (Style.BraceWrapping.AfterControlStatement ==
           FormatStyle::BWACS_Always) {
         addUnwrappedLine();
-        if (RemoveWhitesmithsCaseIndent) {
+        if (!Style.IndentCaseBlocks &&
+            Style.BreakBeforeBraces == FormatStyle::BS_Whitesmiths) {
           Line->Level++;
         }
       }
@@ -2920,17 +2951,29 @@ LLVM_ATTRIBUTE_UNUSED static void printDebugInfo(const UnwrappedLine &Line,
   llvm::dbgs() << "\n";
 }
 
-void UnwrappedLineParser::addUnwrappedLine() {
+void UnwrappedLineParser::addUnwrappedLine(LineLevel AdjustLevel) {
   if (Line->Tokens.empty())
     return;
   LLVM_DEBUG({
     if (CurrentLines == &Lines)
       printDebugInfo(*Line);
   });
+
+  // If this line closes a block when in Whitesmiths mode, remember that
+  // information so that the level can be decreased after the line is added.
+  // This has to happen after the addition of the line since the line itself
+  // needs to be indented.
+  bool ClosesWhitesmithsBlock =
+      Line->MatchingOpeningBlockLineIndex != UnwrappedLine::kInvalidIndex &&
+      Style.BreakBeforeBraces == FormatStyle::BS_Whitesmiths;
+
   CurrentLines->push_back(std::move(*Line));
   Line->Tokens.clear();
   Line->MatchingOpeningBlockLineIndex = UnwrappedLine::kInvalidIndex;
   Line->FirstStartColumn = 0;
+
+  if (ClosesWhitesmithsBlock && AdjustLevel == LineLevel::Remove)
+    --Line->Level;
   if (CurrentLines == &Lines && !PreprocessorDirectives.empty()) {
     CurrentLines->append(
         std::make_move_iterator(PreprocessorDirectives.begin()),
diff --git a/clang/lib/Format/UnwrappedLineParser.h b/clang/lib/Format/UnwrappedLineParser.h
index 02b328cb72de..ce135fac5e57 100644
--- a/clang/lib/Format/UnwrappedLineParser.h
+++ b/clang/lib/Format/UnwrappedLineParser.h
@@ -85,8 +85,9 @@ class UnwrappedLineParser {
   void reset();
   void parseFile();
   void parseLevel(bool HasOpeningBrace);
-  void parseBlock(bool MustBeDeclaration, bool AddLevel = true,
-                  bool MunchSemi = true);
+  void parseBlock(bool MustBeDeclaration, unsigned AddLevels = 1u,
+                  bool MunchSemi = true,
+                  bool UnindentWhitesmithsBraces = false);
   void parseChildBlock();
   void parsePPDirective();
   void parsePPDefine();
@@ -140,7 +141,12 @@ class UnwrappedLineParser {
   bool tryToParsePropertyAccessor();
   void tryToParseJSFunction();
   bool tryToParseSimpleAttribute();
-  void addUnwrappedLine();
+
+  // Used by addUnwrappedLine to denote whether to keep or remove a level
+  // when resetting the line state.
+  enum class LineLevel { Remove, Keep };
+
+  void addUnwrappedLine(LineLevel AdjustLevel = LineLevel::Remove);
   bool eof() const;
   // LevelDifference is the difference of levels after and before the current
   // token. For example:
diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp
index ed26bb8a7150..45278fdb69e6 100644
--- a/clang/unittests/Format/FormatTest.cpp
+++ b/clang/unittests/Format/FormatTest.cpp
@@ -14741,6 +14741,7 @@ TEST_F(FormatTest, WhitesmithsBraceBreaking) {
                WhitesmithsBraceStyle);
   */
 
+  WhitesmithsBraceStyle.NamespaceIndentation = FormatStyle::NI_None;
   verifyFormat("namespace a\n"
                "  {\n"
                "class A\n"
@@ -14765,6 +14766,89 @@ TEST_F(FormatTest, WhitesmithsBraceBreaking) {
                "  } // namespace a",
                WhitesmithsBraceStyle);
 
+  verifyFormat("namespace a\n"
+               "  {\n"
+               "namespace b\n"
+               "  {\n"
+               "class A\n"
+               "  {\n"
+               "  void f()\n"
+               "    {\n"
+               "    if (true)\n"
+               "      {\n"
+               "      a();\n"
+               "      b();\n"
+               "      }\n"
+               "    }\n"
+               "  void g()\n"
+               "    {\n"
+               "    return;\n"
+               "    }\n"
+               "  };\n"
+               "struct B\n"
+               "  {\n"
+               "  int x;\n"
+               "  };\n"
+               "  } // namespace b\n"
+               "  } // namespace a",
+               WhitesmithsBraceStyle);
+
+  WhitesmithsBraceStyle.NamespaceIndentation = FormatStyle::NI_Inner;
+  verifyFormat("namespace a\n"
+               "  {\n"
+               "namespace b\n"
+               "  {\n"
+               "  class A\n"
+               "    {\n"
+               "    void f()\n"
+               "      {\n"
+               "      if (true)\n"
+               "        {\n"
+               "        a();\n"
+               "        b();\n"
+               "        }\n"
+               "      }\n"
+               "    void g()\n"
+               "      {\n"
+               "      return;\n"
+               "      }\n"
+               "    };\n"
+               "  struct B\n"
+               "    {\n"
+               "    int x;\n"
+               "    };\n"
+               "  } // namespace b\n"
+               "  } // namespace a",
+               WhitesmithsBraceStyle);
+
+  WhitesmithsBraceStyle.NamespaceIndentation = FormatStyle::NI_All;
+  verifyFormat("namespace a\n"
+               "  {\n"
+               "  namespace b\n"
+               "    {\n"
+               "    class A\n"
+               "      {\n"
+               "      void f()\n"
+               "        {\n"
+               "        if (true)\n"
+               "          {\n"
+               "          a();\n"
+               "          b();\n"
+               "          }\n"
+               "        }\n"
+               "      void g()\n"
+               "        {\n"
+               "        return;\n"
+               "        }\n"
+               "      };\n"
+               "    struct B\n"
+               "      {\n"
+               "      int x;\n"
+               "      };\n"
+               "    } // namespace b\n"
+               "  }   // namespace a",
+               WhitesmithsBraceStyle);
+
   verifyFormat("void f()\n"
                "  {\n"
                "  if (true)\n"
@@ -14799,7 +14883,7 @@ TEST_F(FormatTest, WhitesmithsBraceBreaking) {
                "  }\n",
                WhitesmithsBraceStyle);
 
-  WhitesmithsBraceStyle.IndentCaseBlocks = true;
+  WhitesmithsBraceStyle.IndentCaseLabels = true;
   verifyFormat("void switchTest1(int a)\n"
                "  {\n"
                "  switch (a)\n"
@@ -14807,7 +14891,7 @@ TEST_F(FormatTest, WhitesmithsBraceBreaking) {
                "    case 2:\n"
                "      {\n"
                "      }\n"
-               "    break;\n"
+               "      break;\n"
                "    }\n"
                "  }\n",
                WhitesmithsBraceStyle);
@@ -14817,7 +14901,7 @@ TEST_F(FormatTest, WhitesmithsBraceBreaking) {
                "  switch (a)\n"
                "    {\n"
                "    case 0:\n"
-               "    break;\n"
+               "      break;\n"
                "    case 1:\n"
                "      {\n"
                "      break;\n"
@@ -14825,9 +14909,9 @@ TEST_F(FormatTest, WhitesmithsBraceBreaking) {
                "    case 2:\n"
                "      {\n"
                "      }\n"
-               "    break;\n"
+               "      break;\n"
                "    default:\n"
-               "    break;\n"
+               "      break;\n"
                "    }\n"
                "  }\n",
                WhitesmithsBraceStyle);
@@ -14840,17 +14924,17 @@ TEST_F(FormatTest, WhitesmithsBraceBreaking) {
                "      {\n"
                "      foo(x);\n"
                "      }\n"
-               "    break;\n"
+               "      break;\n"
                "    default:\n"
                "      {\n"
                "      foo(1);\n"
                "      }\n"
-               "    break;\n"
+               "      break;\n"
                "    }\n"
                "  }\n",
                WhitesmithsBraceStyle);
 
-  WhitesmithsBraceStyle.IndentCaseBlocks = false;
+  WhitesmithsBraceStyle.IndentCaseLabels = false;
 
   verifyFormat("void switchTest4(int a)\n"
                "  {\n"

From f78f530bd38472f6bd058a0307484fc5edc57b7c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Timm=20B=C3=A4der?= <tbaeder@redhat.com>
Date: Thu, 10 Jun 2021 06:48:09 +0200
Subject: [PATCH 283/318] [llvm][PPC] Add missing case for 'I' asm memory
 operands
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

From https://llvm.org/docs/LangRef.html#asm-template-argument-modifiers:

I: Print the letter ‘i’ if the operand is an integer constant,
otherwise nothing. Used to print ‘addi’ vs ‘add’ instructions.

Differential Revision: https://reviews.llvm.org/D103968

(cherry picked from commit a9e4f91adf59bbc72541b96dd30245eaeeedf3ce)
---
 llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp   |  6 ++++++
 llvm/test/CodeGen/PowerPC/asm-template-I.ll | 19 +++++++++++++++++++
 2 files changed, 25 insertions(+)
 create mode 100644 llvm/test/CodeGen/PowerPC/asm-template-I.ll

diff --git a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
index cce21f32414a..6257709731b9 100644
--- a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
+++ b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
@@ -321,6 +321,12 @@ bool PPCAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
       O << "0, ";
       printOperand(MI, OpNo, O);
       return false;
+    case 'I':
+      // Write 'i' if an integer constant, otherwise nothing.  Used to print
+      // addi vs add, etc.
+      if (MI->getOperand(OpNo).isImm())
+        O << "i";
+      return false;
     case 'U': // Print 'u' for update form.
     case 'X': // Print 'x' for indexed form.
       // FIXME: Currently for PowerPC memory operands are always loaded
diff --git a/llvm/test/CodeGen/PowerPC/asm-template-I.ll b/llvm/test/CodeGen/PowerPC/asm-template-I.ll
new file mode 100644
index 000000000000..f77e6900efc0
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/asm-template-I.ll
@@ -0,0 +1,19 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs < %s -mtriple=powerpc64le-- | FileCheck %s
+; https://bugs.llvm.org/show_bug.cgi?id=50608
+
+define dso_local signext i32 @main(i32 signext %argc, i8** %argv) {
+; CHECK-LABEL: main:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    stw 3, -4(1)
+; CHECK-NEXT:    li 3, 0
+; CHECK-NEXT:    addi 4, 1, -4
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    .ascii "-1@0(4)"
+; CHECK-NEXT:    .byte 0
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    blr
+entry:
+  call void asm sideeffect " .asciz \22${0:n}@${1:I}$1\22 ", "n,nZr"(i32 1, i32 %argc)
+  ret i32 0
+}

From e7dac564cd0ed9dee74ef972c46622743d90915d Mon Sep 17 00:00:00 2001
From: Louis Dionne <ldionne.2@gmail.com>
Date: Tue, 15 Jun 2021 17:55:27 -0400
Subject: [PATCH 284/318] =?UTF-8?q?[=F0=9F=8D=92][libc++]=20Un-deprecate?=
 =?UTF-8?q?=20std::allocator<void>?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This is a cherry-pick of 87784cc6fb3453a17e0e78 on 'main' for
backporting to LLVM 12.

Differential Revision: https://reviews.llvm.org/D104324
---
 libcxx/include/memory                         | 32 +++++------
 .../allocator_types.cxx2a.pass.cpp            | 23 ++++----
 .../PR50299.compile.pass.cpp                  | 20 +++++++
 .../default.allocator/allocator.ctor.pass.cpp | 44 ++++++++-------
 .../default.allocator/allocator.dtor.pass.cpp |  3 +-
 .../allocator_pointers.pass.cpp               |  4 +-
 ...cator_types.deprecated_in_cxx17.verify.cpp | 35 +++++++-----
 .../allocator_types.pass.cpp                  | 54 ++++++++++---------
 ...llocator_types.removed_in_cxx20.verify.cpp | 11 ++--
 .../allocator_types.void.compile.pass.cpp}    | 28 ++++------
 ...ocator_void.deprecated_in_cxx17.verify.cpp | 24 ---------
 11 files changed, 137 insertions(+), 141 deletions(-)
 create mode 100644 libcxx/test/std/utilities/memory/default.allocator/PR50299.compile.pass.cpp
 rename libcxx/test/{libcxx/depr/depr.default.allocator/allocator_void.cxx2a.pass.cpp => std/utilities/memory/default.allocator/allocator_types.void.compile.pass.cpp} (51%)
 delete mode 100644 libcxx/test/std/utilities/memory/default.allocator/allocator_void.deprecated_in_cxx17.verify.cpp

diff --git a/libcxx/include/memory b/libcxx/include/memory
index 39d0f5bee6a5..efb10c8fd25b 100644
--- a/libcxx/include/memory
+++ b/libcxx/include/memory
@@ -99,14 +99,14 @@ struct allocator_traits
 };
 
 template <>
-class allocator<void> // deprecated in C++17, removed in C++20
+class allocator<void> // removed in C++20
 {
 public:
-    typedef void*                                 pointer;
-    typedef const void*                           const_pointer;
-    typedef void                                  value_type;
+    typedef void*                                 pointer;          // deprecated in C++17
+    typedef const void*                           const_pointer;    // deprecated in C++17
+    typedef void                                  value_type;       // deprecated in C++17
 
-    template <class _Up> struct rebind {typedef allocator<_Up> other;};
+    template <class _Up> struct rebind {typedef allocator<_Up> other;}; // deprecated in C++17
 };
 
 template <class T>
@@ -786,27 +786,27 @@ to_address(const _Pointer& __p) _NOEXCEPT
 
 template <class _Tp> class allocator;
 
-#if _LIBCPP_STD_VER <= 17 || defined(_LIBCPP_ENABLE_CXX20_REMOVED_ALLOCATOR_MEMBERS)
+#if _LIBCPP_STD_VER <= 17
 template <>
-class _LIBCPP_TEMPLATE_VIS _LIBCPP_DEPRECATED_IN_CXX17 allocator<void>
+class _LIBCPP_TEMPLATE_VIS allocator<void>
 {
 public:
-    typedef void*             pointer;
-    typedef const void*       const_pointer;
-    typedef void              value_type;
+    _LIBCPP_DEPRECATED_IN_CXX17 typedef void*             pointer;
+    _LIBCPP_DEPRECATED_IN_CXX17 typedef const void*       const_pointer;
+    _LIBCPP_DEPRECATED_IN_CXX17 typedef void              value_type;
 
-    template <class _Up> struct rebind {typedef allocator<_Up> other;};
+    template <class _Up> struct _LIBCPP_DEPRECATED_IN_CXX17 rebind {typedef allocator<_Up> other;};
 };
 
 template <>
-class _LIBCPP_TEMPLATE_VIS _LIBCPP_DEPRECATED_IN_CXX17 allocator<const void>
+class _LIBCPP_TEMPLATE_VIS allocator<const void>
 {
 public:
-    typedef const void*       pointer;
-    typedef const void*       const_pointer;
-    typedef const void        value_type;
+    _LIBCPP_DEPRECATED_IN_CXX17 typedef const void*       pointer;
+    _LIBCPP_DEPRECATED_IN_CXX17 typedef const void*       const_pointer;
+    _LIBCPP_DEPRECATED_IN_CXX17 typedef const void        value_type;
 
-    template <class _Up> struct rebind {typedef allocator<_Up> other;};
+    template <class _Up> struct _LIBCPP_DEPRECATED_IN_CXX17 rebind {typedef allocator<_Up> other;};
 };
 #endif
 
diff --git a/libcxx/test/libcxx/depr/depr.default.allocator/allocator_types.cxx2a.pass.cpp b/libcxx/test/libcxx/depr/depr.default.allocator/allocator_types.cxx2a.pass.cpp
index bfff214127ba..a6134b04a8f5 100644
--- a/libcxx/test/libcxx/depr/depr.default.allocator/allocator_types.cxx2a.pass.cpp
+++ b/libcxx/test/libcxx/depr/depr.default.allocator/allocator_types.cxx2a.pass.cpp
@@ -33,18 +33,19 @@
 #include <type_traits>
 #include <cstddef>
 
-#include "test_macros.h"
-
-int main(int, char**)
-{
-    static_assert((std::is_same<std::allocator<char>::size_type, std::size_t>::value), "");
-    static_assert((std::is_same<std::allocator<char>::difference_type, std::ptrdiff_t>::value), "");
-    static_assert((std::is_same<std::allocator<char>::pointer, char*>::value), "");
-    static_assert((std::is_same<std::allocator<char>::const_pointer, const char*>::value), "");
-    static_assert((std::is_same<std::allocator<char>::reference, char&>::value), "");
-    static_assert((std::is_same<std::allocator<char>::const_reference, const char&>::value), "");
-    static_assert((std::is_same<std::allocator<char>::rebind<int>::other,
+template <class T>
+void test() {
+    static_assert((std::is_same<typename std::allocator<T>::size_type, std::size_t>::value), "");
+    static_assert((std::is_same<typename std::allocator<T>::difference_type, std::ptrdiff_t>::value), "");
+    static_assert((std::is_same<typename std::allocator<T>::pointer, T*>::value), "");
+    static_assert((std::is_same<typename std::allocator<T>::const_pointer, const T*>::value), "");
+    static_assert((std::is_same<typename std::allocator<T>::reference, T&>::value), "");
+    static_assert((std::is_same<typename std::allocator<T>::const_reference, const T&>::value), "");
+    static_assert((std::is_same<typename std::allocator<T>::template rebind<int>::other,
                                 std::allocator<int> >::value), "");
+}
 
+int main(int, char**) {
+    test<char>();
     return 0;
 }
diff --git a/libcxx/test/std/utilities/memory/default.allocator/PR50299.compile.pass.cpp b/libcxx/test/std/utilities/memory/default.allocator/PR50299.compile.pass.cpp
new file mode 100644
index 000000000000..245d3d9d320f
--- /dev/null
+++ b/libcxx/test/std/utilities/memory/default.allocator/PR50299.compile.pass.cpp
@@ -0,0 +1,20 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// <memory>
+
+// Make sure we can use std::allocator<void> in all Standard modes. While the
+// explicit specialization for std::allocator<void> was deprecated, using that
+// specialization was neither deprecated nor removed (in C++20 it should simply
+// start using the primary template).
+//
+// See https://llvm.org/PR50299.
+
+#include <memory>
+
+std::allocator<void> a;
diff --git a/libcxx/test/std/utilities/memory/default.allocator/allocator.ctor.pass.cpp b/libcxx/test/std/utilities/memory/default.allocator/allocator.ctor.pass.cpp
index 28bfe41680b6..6e6ff1f2d134 100644
--- a/libcxx/test/std/utilities/memory/default.allocator/allocator.ctor.pass.cpp
+++ b/libcxx/test/std/utilities/memory/default.allocator/allocator.ctor.pass.cpp
@@ -7,15 +7,14 @@
 //===----------------------------------------------------------------------===//
 
 // <memory>
-// UNSUPPORTED: c++03, c++11, c++14, c++17
 //
 // template <class T>
 // class allocator
 // {
 // public: // All of these are constexpr after C++17
-//  constexpr allocator() noexcept;
-//  constexpr allocator(const allocator&) noexcept;
-//  template<class U> constexpr allocator(const allocator<U>&) noexcept;
+//  allocator() noexcept;
+//  allocator(const allocator&) noexcept;
+//  template<class U> allocator(const allocator<U>&) noexcept;
 // ...
 // };
 
@@ -24,28 +23,27 @@
 
 #include "test_macros.h"
 
+template<class T>
+TEST_CONSTEXPR_CXX20 bool test() {
+  typedef std::allocator<T> A1;
+  typedef std::allocator<long> A2;
 
-int main(int, char**)
-{
-    {
-    typedef std::allocator<char> AC;
-    typedef std::allocator<long> AL;
+  A1 a1;
+  A1 a1_copy = a1; (void)a1_copy;
+  A2 a2 = a1; (void)a2;
 
-    constexpr AC a1;
-    constexpr AC a2{a1};
-    constexpr AL a3{a2};
-    (void) a3;
-    }
-    {
-    typedef std::allocator<const char> AC;
-    typedef std::allocator<const long> AL;
-
-    constexpr AC a1;
-    constexpr AC a2{a1};
-    constexpr AL a3{a2};
-    (void) a3;
-    }
+  return true;
+}
 
+int main(int, char**) {
+  test<char>();
+  test<char const>();
+  test<void>();
 
+#if TEST_STD_VER > 17
+  static_assert(test<char>());
+  static_assert(test<char const>());
+  static_assert(test<void>());
+#endif
   return 0;
 }
diff --git a/libcxx/test/std/utilities/memory/default.allocator/allocator.dtor.pass.cpp b/libcxx/test/std/utilities/memory/default.allocator/allocator.dtor.pass.cpp
index 7ae87dd45353..a095ca102491 100644
--- a/libcxx/test/std/utilities/memory/default.allocator/allocator.dtor.pass.cpp
+++ b/libcxx/test/std/utilities/memory/default.allocator/allocator.dtor.pass.cpp
@@ -13,7 +13,6 @@
 
 #include <memory>
 
-
 template <typename T>
 constexpr bool test() {
     std::allocator<T> alloc;
@@ -26,11 +25,13 @@ constexpr bool test() {
 int main(int, char**)
 {
     test<int>();
+    test<void>();
 #ifdef _LIBCPP_VERSION // extension
     test<int const>();
 #endif // _LIBCPP_VERSION
 
     static_assert(test<int>());
+    static_assert(test<void>());
 #ifdef _LIBCPP_VERSION // extension
     static_assert(test<int const>());
 #endif // _LIBCPP_VERSION
diff --git a/libcxx/test/std/utilities/memory/default.allocator/allocator_pointers.pass.cpp b/libcxx/test/std/utilities/memory/default.allocator/allocator_pointers.pass.cpp
index 27e91a650e69..777e5dfc44f5 100644
--- a/libcxx/test/std/utilities/memory/default.allocator/allocator_pointers.pass.cpp
+++ b/libcxx/test/std/utilities/memory/default.allocator/allocator_pointers.pass.cpp
@@ -11,9 +11,9 @@
 #include <memory>
 #include <cassert>
 
-// #include <memory>
-
 #include "test_macros.h"
+
+// <memory>
 //
 // template <class Alloc>
 // struct allocator_traits
diff --git a/libcxx/test/std/utilities/memory/default.allocator/allocator_types.deprecated_in_cxx17.verify.cpp b/libcxx/test/std/utilities/memory/default.allocator/allocator_types.deprecated_in_cxx17.verify.cpp
index f09651a81eee..88af53a4eeaa 100644
--- a/libcxx/test/std/utilities/memory/default.allocator/allocator_types.deprecated_in_cxx17.verify.cpp
+++ b/libcxx/test/std/utilities/memory/default.allocator/allocator_types.deprecated_in_cxx17.verify.cpp
@@ -30,20 +30,27 @@
 // UNSUPPORTED: clang-6
 
 #include <memory>
-#include "test_macros.h"
 
-int main(int, char**)
-{
-    typedef std::allocator<char>::pointer AP;             // expected-warning {{'pointer' is deprecated}}
-    typedef std::allocator<char>::const_pointer ACP;      // expected-warning {{'const_pointer' is deprecated}}
-    typedef std::allocator<char>::reference AR;           // expected-warning {{'reference' is deprecated}}
-    typedef std::allocator<char>::const_reference ACR;    // expected-warning {{'const_reference' is deprecated}}
-    typedef std::allocator<char>::rebind<int>::other ARO; // expected-warning {{'rebind<int>' is deprecated}}
-
-    typedef std::allocator<char const>::pointer AP2;             // expected-warning {{'pointer' is deprecated}}
-    typedef std::allocator<char const>::const_pointer ACP2;      // expected-warning {{'const_pointer' is deprecated}}
-    typedef std::allocator<char const>::reference AR2;           // expected-warning {{'reference' is deprecated}}
-    typedef std::allocator<char const>::const_reference ACR2;    // expected-warning {{'const_reference' is deprecated}}
-    typedef std::allocator<char const>::rebind<int>::other ARO2; // expected-warning {{'rebind<int>' is deprecated}}
+int main(int, char**) {
+    {
+        typedef std::allocator<char>::pointer Pointer;                  // expected-warning {{'pointer' is deprecated}}
+        typedef std::allocator<char>::const_pointer ConstPointer;       // expected-warning {{'const_pointer' is deprecated}}
+        typedef std::allocator<char>::reference Reference;              // expected-warning {{'reference' is deprecated}}
+        typedef std::allocator<char>::const_reference ConstReference;   // expected-warning {{'const_reference' is deprecated}}
+        typedef std::allocator<char>::rebind<int>::other Rebind;        // expected-warning {{'rebind<int>' is deprecated}}
+    }
+    {
+        typedef std::allocator<char const>::pointer Pointer;                  // expected-warning {{'pointer' is deprecated}}
+        typedef std::allocator<char const>::const_pointer ConstPointer;       // expected-warning {{'const_pointer' is deprecated}}
+        typedef std::allocator<char const>::reference Reference;              // expected-warning {{'reference' is deprecated}}
+        typedef std::allocator<char const>::const_reference ConstReference;   // expected-warning {{'const_reference' is deprecated}}
+        typedef std::allocator<char const>::rebind<int>::other Rebind;        // expected-warning {{'rebind<int>' is deprecated}}
+    }
+    {
+        typedef std::allocator<void>::pointer Pointer;                  // expected-warning {{'pointer' is deprecated}}
+        typedef std::allocator<void>::const_pointer ConstPointer;       // expected-warning {{'const_pointer' is deprecated}}
+        // reference and const_reference are not provided by std::allocator<void>
+        typedef std::allocator<void>::rebind<int>::other Rebind;        // expected-warning {{'rebind<int>' is deprecated}}
+    }
     return 0;
 }
diff --git a/libcxx/test/std/utilities/memory/default.allocator/allocator_types.pass.cpp b/libcxx/test/std/utilities/memory/default.allocator/allocator_types.pass.cpp
index 6a034935a30d..74adc6943594 100644
--- a/libcxx/test/std/utilities/memory/default.allocator/allocator_types.pass.cpp
+++ b/libcxx/test/std/utilities/memory/default.allocator/allocator_types.pass.cpp
@@ -18,47 +18,49 @@
 //     typedef ptrdiff_t difference_type;
 //     typedef T         value_type;
 //
+//     typedef T*        pointer;           // deprecated in C++17, removed in C++20
+//     typedef T const*  const_pointer;     // deprecated in C++17, removed in C++20
+//     typedef T&        reference;         // deprecated in C++17, removed in C++20
+//     typedef T const&  const_reference;   // deprecated in C++17, removed in C++20
+//     template< class U > struct rebind { typedef allocator<U> other; }; // deprecated in C++17, removed in C++20
+//
 //     typedef true_type propagate_on_container_move_assignment;
 //     typedef true_type is_always_equal;
 // ...
 // };
 
+// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS
+
 #include <memory>
 #include <type_traits>
 #include <cstddef>
 
 #include "test_macros.h"
 
-template <typename T, typename U>
-TEST_CONSTEXPR_CXX20 bool test()
-{
-    static_assert((std::is_same<typename std::allocator<T>::size_type, std::size_t>::value), "");
-    static_assert((std::is_same<typename std::allocator<T>::difference_type, std::ptrdiff_t>::value), "");
-    static_assert((std::is_same<typename std::allocator<T>::value_type, T>::value), "");
-    static_assert((std::is_same<typename std::allocator<T>::propagate_on_container_move_assignment, std::true_type>::value), "");
-    static_assert((std::is_same<typename std::allocator<T>::is_always_equal, std::true_type>::value), "");
+struct U;
 
-    std::allocator<T> a;
-    std::allocator<T> a2 = a;
-    a2 = a;
-    std::allocator<U> a3 = a2;
-    (void)a3;
+template <typename T>
+void test() {
+    typedef std::allocator<T> Alloc;
+    static_assert((std::is_same<typename Alloc::size_type, std::size_t>::value), "");
+    static_assert((std::is_same<typename Alloc::difference_type, std::ptrdiff_t>::value), "");
+    static_assert((std::is_same<typename Alloc::value_type, T>::value), "");
+    static_assert((std::is_same<typename Alloc::propagate_on_container_move_assignment, std::true_type>::value), "");
+    static_assert((std::is_same<typename Alloc::is_always_equal, std::true_type>::value), "");
 
-    return true;
+#if TEST_STD_VER <= 17
+    static_assert((std::is_same<typename Alloc::pointer, T*>::value), "");
+    static_assert((std::is_same<typename Alloc::const_pointer, T const*>::value), "");
+    static_assert((std::is_same<typename Alloc::reference, T&>::value), "");
+    static_assert((std::is_same<typename Alloc::const_reference, T const&>::value), "");
+    static_assert((std::is_same<typename Alloc::template rebind<U>::other, std::allocator<U> >::value), "");
+#endif
 }
 
-int main(int, char**)
-{
-    test<char, int>();
-#ifdef _LIBCPP_VERSION // extension
-    test<char const, int const>();
-#endif // _LIBCPP_VERSION
-
-#if TEST_STD_VER > 17
-    static_assert(test<char, int>());
-#ifdef _LIBCPP_VERSION // extension
-    static_assert(test<char const, int const>());
-#endif // _LIBCPP_VERSION
+int main(int, char**) {
+    test<char>();
+#ifdef _LIBCPP_VERSION
+    test<char const>(); // extension
 #endif
     return 0;
 }
diff --git a/libcxx/test/std/utilities/memory/default.allocator/allocator_types.removed_in_cxx20.verify.cpp b/libcxx/test/std/utilities/memory/default.allocator/allocator_types.removed_in_cxx20.verify.cpp
index 9805accda20a..1d91a022867d 100644
--- a/libcxx/test/std/utilities/memory/default.allocator/allocator_types.removed_in_cxx20.verify.cpp
+++ b/libcxx/test/std/utilities/memory/default.allocator/allocator_types.removed_in_cxx20.verify.cpp
@@ -31,16 +31,17 @@
 template <typename T>
 void check()
 {
-    typedef typename std::allocator<T>::pointer AP;                      // expected-error 2 {{no type named 'pointer'}}
-    typedef typename std::allocator<T>::const_pointer ACP;               // expected-error 2 {{no type named 'const_pointer'}}
-    typedef typename std::allocator<T>::reference AR;                    // expected-error 2 {{no type named 'reference'}}
-    typedef typename std::allocator<T>::const_reference ACR;             // expected-error 2 {{no type named 'const_reference'}}
-    typedef typename std::allocator<T>::template rebind<int>::other ARO; // expected-error 2 {{no member named 'rebind'}}
+    typedef typename std::allocator<T>::pointer AP;                      // expected-error 3 {{no type named 'pointer'}}
+    typedef typename std::allocator<T>::const_pointer ACP;               // expected-error 3 {{no type named 'const_pointer'}}
+    typedef typename std::allocator<T>::reference AR;                    // expected-error 3 {{no type named 'reference'}}
+    typedef typename std::allocator<T>::const_reference ACR;             // expected-error 3 {{no type named 'const_reference'}}
+    typedef typename std::allocator<T>::template rebind<int>::other ARO; // expected-error 3 {{no member named 'rebind'}}
 }
 
 int main(int, char**)
 {
     check<char>();
     check<char const>();
+    check<void>();
     return 0;
 }
diff --git a/libcxx/test/libcxx/depr/depr.default.allocator/allocator_void.cxx2a.pass.cpp b/libcxx/test/std/utilities/memory/default.allocator/allocator_types.void.compile.pass.cpp
similarity index 51%
rename from libcxx/test/libcxx/depr/depr.default.allocator/allocator_void.cxx2a.pass.cpp
rename to libcxx/test/std/utilities/memory/default.allocator/allocator_types.void.compile.pass.cpp
index d03f90b506d5..d7b4a1b1bdf4 100644
--- a/libcxx/test/libcxx/depr/depr.default.allocator/allocator_void.cxx2a.pass.cpp
+++ b/libcxx/test/std/utilities/memory/default.allocator/allocator_types.void.compile.pass.cpp
@@ -6,10 +6,11 @@
 //
 //===----------------------------------------------------------------------===//
 
-// <memory>
+// Check that the nested types of std::allocator<void> are provided.
+// After C++17, those are not provided in the primary template and the
+// explicit specialization doesn't exist anymore, so this test is moot.
 
-// Check that the following member types of allocator<void> are provided
-// regardless of the Standard when we request them from libc++.
+// REQUIRES: c++03 || c++11 || c++14 || c++17
 
 // template <>
 // class allocator<void>
@@ -22,24 +23,13 @@
 //     template <class _Up> struct rebind {typedef allocator<_Up> other;};
 // };
 
-// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_ENABLE_CXX20_REMOVED_ALLOCATOR_MEMBERS
 // ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS
 
 #include <memory>
 #include <type_traits>
 
-#include "test_macros.h"
-
-int main(int, char**)
-{
-    static_assert((std::is_same<std::allocator<void>::pointer, void*>::value), "");
-    static_assert((std::is_same<std::allocator<void>::const_pointer, const void*>::value), "");
-    static_assert((std::is_same<std::allocator<void>::value_type, void>::value), "");
-    static_assert((std::is_same<std::allocator<void>::rebind<int>::other,
-                                std::allocator<int> >::value), "");
-    std::allocator<void> a;
-    std::allocator<void> a2 = a;
-    a2 = a;
-
-  return 0;
-}
+static_assert((std::is_same<std::allocator<void>::pointer, void*>::value), "");
+static_assert((std::is_same<std::allocator<void>::const_pointer, const void*>::value), "");
+static_assert((std::is_same<std::allocator<void>::value_type, void>::value), "");
+static_assert((std::is_same<std::allocator<void>::rebind<int>::other,
+                            std::allocator<int> >::value), "");
diff --git a/libcxx/test/std/utilities/memory/default.allocator/allocator_void.deprecated_in_cxx17.verify.cpp b/libcxx/test/std/utilities/memory/default.allocator/allocator_void.deprecated_in_cxx17.verify.cpp
deleted file mode 100644
index cd98e6364b7e..000000000000
--- a/libcxx/test/std/utilities/memory/default.allocator/allocator_void.deprecated_in_cxx17.verify.cpp
+++ /dev/null
@@ -1,24 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// <memory>
-
-// Check that allocator<void> is deprecated in C++17.
-
-// REQUIRES: c++17
-
-#include <memory>
-#include "test_macros.h"
-
-int main(int, char**)
-{
-    typedef std::allocator<void>::pointer AP;             // expected-warning {{'allocator<void>' is deprecated}}
-    typedef std::allocator<void>::const_pointer ACP;      // expected-warning {{'allocator<void>' is deprecated}}
-    typedef std::allocator<void>::rebind<int>::other ARO; // expected-warning {{'allocator<void>' is deprecated}}
-    return 0;
-}

From 6412392511340a7f1793a00b5b501692300089e6 Mon Sep 17 00:00:00 2001
From: Arthur O'Dwyer <arthur.j.odwyer@gmail.com>
Date: Sat, 24 Apr 2021 08:20:13 -0400
Subject: [PATCH 285/318] =?UTF-8?q?[=F0=9F=8D=92][libc++]=20=5F=5Fbit=5Fit?=
 =?UTF-8?q?erator=20mustn't=20rely=20on=20deprecated=20SMF=20generation.?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This allows us to turn -Wdeprecated-copy back on. We turned it off
in 3b71de41cc7c7 because Clang's implementation became more stringent
and started diagnosing the old code here.

Differential Revision: https://reviews.llvm.org/D101183

(cherry picked from commit 70d94c3f2cae71ade2ceacdceb3d2e9899d2289a)
---
 libcxx/include/__bit_reference | 34 ++++++++++++++++------------------
 1 file changed, 16 insertions(+), 18 deletions(-)

diff --git a/libcxx/include/__bit_reference b/libcxx/include/__bit_reference
index 9cfb4b84e653..d44ad03d3134 100644
--- a/libcxx/include/__bit_reference
+++ b/libcxx/include/__bit_reference
@@ -1114,28 +1114,26 @@ public:
 #endif
     {}
 
-    // avoid re-declaring a copy constructor for the non-const version.
-    using __type_for_copy_to_const =
-      _If<_IsConst, __bit_iterator<_Cp, false>, struct __private_nat>;
-
+    // When _IsConst=false, this is the copy constructor.
+    // It is non-trivial. Making it trivial would break ABI.
+    // When _IsConst=true, this is a converting constructor;
+    // the copy and move constructors are implicitly generated
+    // and trivial.
     _LIBCPP_INLINE_VISIBILITY
-    __bit_iterator(const __type_for_copy_to_const& __it) _NOEXCEPT
+    __bit_iterator(const __bit_iterator<_Cp, false>& __it) _NOEXCEPT
         : __seg_(__it.__seg_), __ctz_(__it.__ctz_) {}
 
-    // The non-const __bit_iterator has historically had a non-trivial
-    // copy constructor (as a quirk of its construction). We need to maintain
-    // this for ABI purposes.
-    using __type_for_abi_non_trivial_copy_ctor =
-      _If<!_IsConst, __bit_iterator, struct __private_nat>;
-
-    _LIBCPP_INLINE_VISIBILITY
-    __bit_iterator(__type_for_abi_non_trivial_copy_ctor const& __it) _NOEXCEPT
-      : __seg_(__it.__seg_), __ctz_(__it.__ctz_) {}
-
-    // Always declare the copy assignment operator since the implicit declaration
-    // is deprecated.
+    // When _IsConst=false, we have a user-provided copy constructor,
+    // so we must also provide a copy assignment operator because
+    // the implicit generation of a defaulted one is deprecated.
+    // When _IsConst=true, the assignment operators are
+    // implicitly generated and trivial.
     _LIBCPP_INLINE_VISIBILITY
-    __bit_iterator& operator=(__bit_iterator const&) = default;
+    __bit_iterator& operator=(const _If<_IsConst, struct __private_nat, __bit_iterator>& __it) {
+        __seg_ = __it.__seg_;
+        __ctz_ = __it.__ctz_;
+        return *this;
+    }
 
     _LIBCPP_INLINE_VISIBILITY reference operator*() const _NOEXCEPT
         {return reference(__seg_, __storage_type(1) << __ctz_);}

From 82796b3c3dab0a109958f3bfbef7f798c6c8d3e2 Mon Sep 17 00:00:00 2001
From: Louis Dionne <ldionne.2@gmail.com>
Date: Wed, 16 Jun 2021 13:30:36 -0400
Subject: [PATCH 286/318] [libc++] Adjust XFAIL for std::tuple deduction tests
 with GCC

GCC has been failing those tests, and we marked them as such in
a3ab5120fd572215afeac190757834a041dda73a on 'main'. Cherry-pick
only that part of the change into the LLVM 12 release so that we
can get the CI green again.
---
 .../std/utilities/tuple/tuple.tuple/tuple.cnstr/deduct.pass.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libcxx/test/std/utilities/tuple/tuple.tuple/tuple.cnstr/deduct.pass.cpp b/libcxx/test/std/utilities/tuple/tuple.tuple/tuple.cnstr/deduct.pass.cpp
index db0958ca6de1..4951ae22d406 100644
--- a/libcxx/test/std/utilities/tuple/tuple.tuple/tuple.cnstr/deduct.pass.cpp
+++ b/libcxx/test/std/utilities/tuple/tuple.tuple/tuple.cnstr/deduct.pass.cpp
@@ -13,7 +13,7 @@
 // GCC's implementation of class template deduction is still immature and runs
 // into issues with libc++. However GCC accepts this code when compiling
 // against libstdc++.
-// XFAIL: gcc-5, gcc-6, gcc-7
+// XFAIL: gcc-5, gcc-6, gcc-7, gcc-8, gcc-9, gcc-10, gcc-11
 
 // <tuple>
 

From 0d6fc8550b58d5993858de52e330e01e109ddff9 Mon Sep 17 00:00:00 2001
From: zoecarver <z.zoelec2@gmail.com>
Date: Wed, 26 May 2021 12:00:03 -0700
Subject: [PATCH 287/318] =?UTF-8?q?[=F0=9F=8D=92][libcxx][nfc]=20Fix=20the?=
 =?UTF-8?q?=20ASAN=20bots:=20update=20expected.pass.cpp.?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Ensures that `get_return_object`'s return type is the same as the return type for the function calling `co_return`. Otherwise, we try to construct an object, then free it, then return it.

Differential Revision: https://reviews.llvm.org/D103196

(cherry picked from commit 52123c96c016143ebfff6de76fe83cebd6c1d726)
---
 .../support.coroutines/end.to.end/expected.pass.cpp             | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libcxx/test/std/experimental/language.support/support.coroutines/end.to.end/expected.pass.cpp b/libcxx/test/std/experimental/language.support/support.coroutines/end.to.end/expected.pass.cpp
index 2bd297660c4e..5b307508abeb 100644
--- a/libcxx/test/std/experimental/language.support/support.coroutines/end.to.end/expected.pass.cpp
+++ b/libcxx/test/std/experimental/language.support/support.coroutines/end.to.end/expected.pass.cpp
@@ -35,7 +35,7 @@ struct expected {
 
   struct promise_type {
     std::shared_ptr<Data> data;
-    std::shared_ptr<Data> get_return_object() { data = std::make_shared<Data>(); return data; }
+    expected get_return_object() { data = std::make_shared<Data>(); return {data}; }
     suspend_never initial_suspend() { return {}; }
     suspend_never final_suspend() noexcept { return {}; }
     void return_value(T v) { data->val = v; data->error = {}; }

From 0193a7da8bdaa9ffcc5bdefd5516c162bb26ab6b Mon Sep 17 00:00:00 2001
From: Jonas Paulsson <paulsson@linux.vnet.ibm.com>
Date: Thu, 25 Feb 2021 16:05:43 -0600
Subject: [PATCH 288/318] [SystemZ]  Assign the full space for promoted and
 split outgoing args.

When a large "irregular" (e.g. i96) integer call argument is converted to
indirect, 64-bit parts are stored to the stack. The full stack space
(e.g. i128) was not allocated prior to this patch, but rather just the exact
space of the original type. This caused neighboring values on the stack to be
overwritten.

Thanks to Josh Stone for reporting this.

Review: Ulrich Weigand
Fixes https://bugs.llvm.org/show_bug.cgi?id=49322
Differential Revision: https://reviews.llvm.org/D97514

(cherry picked from commit 52bbbf4d4459239e0f461bc302ada89e2c5d07fc)
---
 .../Target/SystemZ/SystemZISelLowering.cpp    | 22 ++++++--
 llvm/test/CodeGen/SystemZ/args-11.ll          | 54 +++++++++++++++++++
 2 files changed, 72 insertions(+), 4 deletions(-)
 create mode 100644 llvm/test/CodeGen/SystemZ/args-11.ll

diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
index 9ace36f344a5..270134d84c61 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -1550,6 +1550,7 @@ SystemZTargetLowering::LowerCall(CallLoweringInfo &CLI,
   bool IsVarArg = CLI.IsVarArg;
   MachineFunction &MF = DAG.getMachineFunction();
   EVT PtrVT = getPointerTy(MF.getDataLayout());
+  LLVMContext &Ctx = *DAG.getContext();
 
   // Detect unsupported vector argument and return types.
   if (Subtarget.hasVector()) {
@@ -1559,7 +1560,7 @@ SystemZTargetLowering::LowerCall(CallLoweringInfo &CLI,
 
   // Analyze the operands of the call, assigning locations to each operand.
   SmallVector<CCValAssign, 16> ArgLocs;
-  SystemZCCState ArgCCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
+  SystemZCCState ArgCCInfo(CallConv, IsVarArg, MF, ArgLocs, Ctx);
   ArgCCInfo.AnalyzeCallOperands(Outs, CC_SystemZ);
 
   // We don't support GuaranteedTailCallOpt, only automatically-detected
@@ -1584,14 +1585,25 @@ SystemZTargetLowering::LowerCall(CallLoweringInfo &CLI,
 
     if (VA.getLocInfo() == CCValAssign::Indirect) {
       // Store the argument in a stack slot and pass its address.
-      SDValue SpillSlot = DAG.CreateStackTemporary(Outs[I].ArgVT);
+      unsigned ArgIndex = Outs[I].OrigArgIndex;
+      EVT SlotVT;
+      if (I + 1 != E && Outs[I + 1].OrigArgIndex == ArgIndex) {
+        // Allocate the full stack space for a promoted (and split) argument.
+        Type *OrigArgType = CLI.Args[Outs[I].OrigArgIndex].Ty;
+        EVT OrigArgVT = getValueType(MF.getDataLayout(), OrigArgType);
+        MVT PartVT = getRegisterTypeForCallingConv(Ctx, CLI.CallConv, OrigArgVT);
+        unsigned N = getNumRegistersForCallingConv(Ctx, CLI.CallConv, OrigArgVT);
+        SlotVT = EVT::getIntegerVT(Ctx, PartVT.getSizeInBits() * N);
+      } else {
+        SlotVT = Outs[I].ArgVT;
+      }
+      SDValue SpillSlot = DAG.CreateStackTemporary(SlotVT);
       int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
       MemOpChains.push_back(
           DAG.getStore(Chain, DL, ArgValue, SpillSlot,
                        MachinePointerInfo::getFixedStack(MF, FI)));
       // If the original argument was split (e.g. i128), we need
       // to store all parts of it here (and pass just one address).
-      unsigned ArgIndex = Outs[I].OrigArgIndex;
       assert (Outs[I].PartOffset == 0);
       while (I + 1 != E && Outs[I + 1].OrigArgIndex == ArgIndex) {
         SDValue PartValue = OutVals[I + 1];
@@ -1601,6 +1613,8 @@ SystemZTargetLowering::LowerCall(CallLoweringInfo &CLI,
         MemOpChains.push_back(
             DAG.getStore(Chain, DL, PartValue, Address,
                          MachinePointerInfo::getFixedStack(MF, FI)));
+        assert((PartOffset + PartValue.getValueType().getStoreSize() <=
+                SlotVT.getStoreSize()) && "Not enough space for argument part!");
         ++I;
       }
       ArgValue = SpillSlot;
@@ -1694,7 +1708,7 @@ SystemZTargetLowering::LowerCall(CallLoweringInfo &CLI,
 
   // Assign locations to each value returned by this call.
   SmallVector<CCValAssign, 16> RetLocs;
-  CCState RetCCInfo(CallConv, IsVarArg, MF, RetLocs, *DAG.getContext());
+  CCState RetCCInfo(CallConv, IsVarArg, MF, RetLocs, Ctx);
   RetCCInfo.AnalyzeCallResult(Ins, RetCC_SystemZ);
 
   // Copy all of the result registers out of their specified physreg.
diff --git a/llvm/test/CodeGen/SystemZ/args-11.ll b/llvm/test/CodeGen/SystemZ/args-11.ll
new file mode 100644
index 000000000000..b355f9d6da15
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/args-11.ll
@@ -0,0 +1,54 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; Test outgoing promoted arguments that are split (and passed by reference).
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+
+; The i96 arg is promoted to i128 and should get the full stack space.
+declare void @fn1(i96)
+define i32 @fn2() {
+; CHECK-LABEL: fn2:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    stmg %r14, %r15, 112(%r15)
+; CHECK-NEXT:    .cfi_offset %r14, -48
+; CHECK-NEXT:    .cfi_offset %r15, -40
+; CHECK-NEXT:    aghi %r15, -184
+; CHECK-NEXT:    .cfi_def_cfa_offset 344
+; CHECK-NEXT:    mvhi 180(%r15), -1
+; CHECK-NEXT:    mvghi 168(%r15), 0
+; CHECK-NEXT:    la %r2, 160(%r15)
+; CHECK-NEXT:    mvghi 160(%r15), 0
+; CHECK-NEXT:    brasl %r14, fn1@PLT
+; CHECK-NEXT:    l %r2, 180(%r15)
+; CHECK-NEXT:    lmg %r14, %r15, 296(%r15)
+; CHECK-NEXT:    br %r14
+  %1 = alloca i32
+  store i32 -1, i32* %1
+  call void @fn1(i96 0)
+  %2 = load i32, i32* %1
+  ret i32 %2
+}
+
+declare void @fn3(i136)
+define i32 @fn4() {
+; CHECK-LABEL: fn4:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    stmg %r14, %r15, 112(%r15)
+; CHECK-NEXT:    .cfi_offset %r14, -48
+; CHECK-NEXT:    .cfi_offset %r15, -40
+; CHECK-NEXT:    aghi %r15, -192
+; CHECK-NEXT:    .cfi_def_cfa_offset 352
+; CHECK-NEXT:    mvhi 188(%r15), -1
+; CHECK-NEXT:    mvghi 176(%r15), 0
+; CHECK-NEXT:    mvghi 168(%r15), 0
+; CHECK-NEXT:    la %r2, 160(%r15)
+; CHECK-NEXT:    mvghi 160(%r15), 0
+; CHECK-NEXT:    brasl %r14, fn3@PLT
+; CHECK-NEXT:    l %r2, 188(%r15)
+; CHECK-NEXT:    lmg %r14, %r15, 304(%r15)
+; CHECK-NEXT:    br %r14
+  %1 = alloca i32
+  store i32 -1, i32* %1
+  call void @fn3(i136 0)
+  %2 = load i32, i32* %1
+  ret i32 %2
+}

From 9be9215b27196c52177c4fc6edebd31a8f4b7e49 Mon Sep 17 00:00:00 2001
From: serge-sans-paille <sguelton@redhat.com>
Date: Fri, 11 Jun 2021 15:42:26 +0200
Subject: [PATCH 289/318] AMD k8 family does not support SSE4.x which are
 required by x86-64-v2+

So don't define __tune__k8__ for these micro architecture.

SSE, SSE2 and SSE3 appear in https://www.amd.com/system/files/TechDocs/25112.PDF
but not SSE4.x.

Differential Revision: https://reviews.llvm.org/D104116

(cherry picked from commit 092c303955cd18be6c0b923b1c0a1b96e2c91893)
---
 clang/lib/Basic/Targets/X86.cpp | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/clang/lib/Basic/Targets/X86.cpp b/clang/lib/Basic/Targets/X86.cpp
index 694a8095e336..80e160bd9190 100644
--- a/clang/lib/Basic/Targets/X86.cpp
+++ b/clang/lib/Basic/Targets/X86.cpp
@@ -513,9 +513,6 @@ void X86TargetInfo::getTargetDefines(const LangOptions &Opts,
   case CK_K8:
   case CK_K8SSE3:
   case CK_x86_64:
-  case CK_x86_64_v2:
-  case CK_x86_64_v3:
-  case CK_x86_64_v4:
     defineCPUMacros(Builder, "k8");
     break;
   case CK_AMDFAM10:

From 051126fe6ffe0f7c27b260ad29ecc84eb34fac0f Mon Sep 17 00:00:00 2001
From: Haojian Wu <hokein.wu@gmail.com>
Date: Mon, 14 Jun 2021 11:51:00 +0200
Subject: [PATCH 290/318] Fix -Wswitch warning after
 092c303955cd18be6c0b923b1c0a1b96e2c91893.

(cherry picked from commit a83ef21ff82e4283044fd31470fc6c1bc4b99c51)
---
 clang/lib/Basic/Targets/X86.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/clang/lib/Basic/Targets/X86.cpp b/clang/lib/Basic/Targets/X86.cpp
index 80e160bd9190..c5ad1c7d2c2e 100644
--- a/clang/lib/Basic/Targets/X86.cpp
+++ b/clang/lib/Basic/Targets/X86.cpp
@@ -515,6 +515,10 @@ void X86TargetInfo::getTargetDefines(const LangOptions &Opts,
   case CK_x86_64:
     defineCPUMacros(Builder, "k8");
     break;
+  case CK_x86_64_v2:
+  case CK_x86_64_v3:
+  case CK_x86_64_v4:
+    break;
   case CK_AMDFAM10:
     defineCPUMacros(Builder, "amdfam10");
     break;

From cc08a27d2ecc1458a8871c989add0b49afc24f12 Mon Sep 17 00:00:00 2001
From: Vitaly Buka <vitalybuka@google.com>
Date: Fri, 16 Apr 2021 09:50:24 -0700
Subject: [PATCH 291/318] Sanitizer built against glibc 2.34 doesn't work

As mentioned in https://gcc.gnu.org/PR100114 , glibc starting with the
https://sourceware.org/git/?p=glibc.git;a=commit;h=6c57d320484988e87e446e2e60ce42816bf51d53
change doesn't define SIGSTKSZ and MINSIGSTKSZ macros to constants, but to sysconf function call.
sanitizer_posix_libcdep.cpp has
static const uptr kAltStackSize = SIGSTKSZ * 4;  // SIGSTKSZ is not enough.
which is generally fine, just means that when SIGSTKSZ is not a compile time constant will be initialized later.
The problem is that kAltStackSize is used in SetAlternateSignalStack which is called very early, from .preinit_array
initialization, i.e. far before file scope variables are constructed, which means it is not initialized and
mmapping 0 will fail:
==145==ERROR: AddressSanitizer failed to allocate 0x0 (0) bytes of SetAlternateSignalStack (error code: 22)

Here is one possible fix, another one could be to make kAltStackSize a preprocessor macro if _SG_SIGSTKSZ is defined
(but perhaps with having an automatic const variable initialized to it so that sysconf isn't at least called twice
during SetAlternateSignalStack.

Reviewed By: vitalybuka

Differential Revision: https://reviews.llvm.org/D100645

(cherry picked from commit 82150606fb11d28813ae6da1101f5bda638165fe)
---
 .../lib/sanitizer_common/sanitizer_posix_libcdep.cpp | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_posix_libcdep.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_posix_libcdep.cpp
index d29438cf9dbd..2b10bdd37293 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_posix_libcdep.cpp
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_posix_libcdep.cpp
@@ -165,7 +165,11 @@ bool SupportsColoredOutput(fd_t fd) {
 
 #if !SANITIZER_GO
 // TODO(glider): different tools may require different altstack size.
-static const uptr kAltStackSize = SIGSTKSZ * 4;  // SIGSTKSZ is not enough.
+static uptr GetAltStackSize() {
+  // SIGSTKSZ is not enough.
+  static const uptr kAltStackSize = SIGSTKSZ * 4;
+  return kAltStackSize;
+}
 
 void SetAlternateSignalStack() {
   stack_t altstack, oldstack;
@@ -176,10 +180,10 @@ void SetAlternateSignalStack() {
   // TODO(glider): the mapped stack should have the MAP_STACK flag in the
   // future. It is not required by man 2 sigaltstack now (they're using
   // malloc()).
-  void* base = MmapOrDie(kAltStackSize, __func__);
+  void *base = MmapOrDie(GetAltStackSize(), __func__);
   altstack.ss_sp = (char*) base;
   altstack.ss_flags = 0;
-  altstack.ss_size = kAltStackSize;
+  altstack.ss_size = GetAltStackSize();
   CHECK_EQ(0, sigaltstack(&altstack, nullptr));
 }
 
@@ -187,7 +191,7 @@ void UnsetAlternateSignalStack() {
   stack_t altstack, oldstack;
   altstack.ss_sp = nullptr;
   altstack.ss_flags = SS_DISABLE;
-  altstack.ss_size = kAltStackSize;  // Some sane value required on Darwin.
+  altstack.ss_size = GetAltStackSize();  // Some sane value required on Darwin.
   CHECK_EQ(0, sigaltstack(&altstack, &oldstack));
   UnmapOrDie(oldstack.ss_sp, oldstack.ss_size);
 }

From 385a6f37fefbeb1397f1c3733f328bb2b0403e2b Mon Sep 17 00:00:00 2001
From: serge-sans-paille <sguelton@redhat.com>
Date: Fri, 14 May 2021 11:45:10 +0200
Subject: [PATCH 292/318] Prevent generation of dependency on _cxa_guard for
 static initialization

This fixes an issue introduced by https://reviews.llvm.org/D70662

Function-scope static initialization are guarded in C++, so we should probably
not use it because it introduces a dependency on __cxa_guard* symbols.
In the context of clang, libasan is linked statically, and it currently needs to
the odd situation where compiling C code with clang and asan requires -lstdc++

Differential Revision: https://reviews.llvm.org/D102475

(cherry picked from commit 414482751452e54710f16bae58458c66298aaf69)
---
 .../lib/sanitizer_common/sanitizer_posix_libcdep.cpp       | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_posix_libcdep.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_posix_libcdep.cpp
index 2b10bdd37293..12603da1750d 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_posix_libcdep.cpp
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_posix_libcdep.cpp
@@ -166,9 +166,10 @@ bool SupportsColoredOutput(fd_t fd) {
 #if !SANITIZER_GO
 // TODO(glider): different tools may require different altstack size.
 static uptr GetAltStackSize() {
-  // SIGSTKSZ is not enough.
-  static const uptr kAltStackSize = SIGSTKSZ * 4;
-  return kAltStackSize;
+  // Note: since GLIBC_2.31, SIGSTKSZ may be a function call, so this may be
+  // more costly that you think. However GetAltStackSize is only call 2-3 times
+  // per thread so don't cache the evaluation.
+  return SIGSTKSZ * 4;
 }
 
 void SetAlternateSignalStack() {

From b690ec54817d9f4ad0e6e16f88a12044660b794f Mon Sep 17 00:00:00 2001
From: Joachim Meyer <joachim@joameyer.de>
Date: Tue, 8 Jun 2021 18:16:08 +0200
Subject: [PATCH 293/318] [LV] Parallel annotated loop does not imply all loads
 can be hoisted.

As noted in https://bugs.llvm.org/show_bug.cgi?id=46666, the current behavior of assuming if-conversion safety if a loop is annotated parallel (`!llvm.loop.parallel_accesses`), is not expectable, the documentation for this behavior was since removed from the LangRef again, and can lead to invalid reads.
This was observed in POCL (https://github.com/pocl/pocl/issues/757) and would require similar workarounds in current work at hipSYCL.

The question remains why this was initially added and what the implications of removing this optimization would be.
Do we need an alternative mechanism to propagate the information about legality of if-conversion?
Or is the idea that conditional loads in `#pragma clang loop vectorize(assume_safety)` can be executed unmasked without additional checks flawed in general?
I think this implication is not part of what a user of that pragma (and corresponding metadata) would expect and thus dangerous.

Only two additional tests failed, which are adapted in this patch. Depending on the further direction force-ifcvt.ll should be removed or further adapted.

Reviewed By: jdoerfert

Differential Revision: https://reviews.llvm.org/D103907

(cherry picked from commit 4f01122c3f6c70beee8f736f196a09976602685f)
---
 .../Vectorize/LoopVectorizationLegality.h     | 15 +++----
 .../Vectorize/LoopVectorizationLegality.cpp   | 14 ++-----
 .../LoopVectorize/X86/force-ifcvt.ll          | 42 -------------------
 .../X86/tail_folding_and_assume_safety.ll     |  4 +-
 4 files changed, 10 insertions(+), 65 deletions(-)
 delete mode 100644 llvm/test/Transforms/LoopVectorize/X86/force-ifcvt.ll

diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
index 2f80b4373b46..246db0fd2dd9 100644
--- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
+++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
@@ -396,22 +396,17 @@ class LoopVectorizationLegality {
   bool canVectorizeOuterLoop();
 
   /// Return true if all of the instructions in the block can be speculatively
-  /// executed, and record the loads/stores that require masking. If's that
-  /// guard loads can be ignored under "assume safety" unless \p PreserveGuards
-  /// is true. This can happen when we introduces guards for which the original
-  /// "unguarded-loads are safe" assumption does not hold. For example, the
-  /// vectorizer's fold-tail transformation changes the loop to execute beyond
-  /// its original trip-count, under a proper guard, which should be preserved.
+  /// executed, and record the loads/stores that require masking.
   /// \p SafePtrs is a list of addresses that are known to be legal and we know
   /// that we can read from them without segfault.
   /// \p MaskedOp is a list of instructions that have to be transformed into
   /// calls to the appropriate masked intrinsic when the loop is vectorized.
   /// \p ConditionalAssumes is a list of assume instructions in predicated
   /// blocks that must be dropped if the CFG gets flattened.
-  bool blockCanBePredicated(BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs,
-                            SmallPtrSetImpl<const Instruction *> &MaskedOp,
-                            SmallPtrSetImpl<Instruction *> &ConditionalAssumes,
-                            bool PreserveGuards = false) const;
+  bool blockCanBePredicated(
+      BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs,
+      SmallPtrSetImpl<const Instruction *> &MaskedOp,
+      SmallPtrSetImpl<Instruction *> &ConditionalAssumes) const;
 
   /// Updates the vectorization state by adding \p Phi to the inductions list.
   /// This can set \p Phi as the main induction of the loop if \p Phi is a
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index 2ab0848193f6..b8c21a0e1cd3 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -925,10 +925,7 @@ bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) {
 bool LoopVectorizationLegality::blockCanBePredicated(
     BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs,
     SmallPtrSetImpl<const Instruction *> &MaskedOp,
-    SmallPtrSetImpl<Instruction *> &ConditionalAssumes,
-    bool PreserveGuards) const {
-  const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel();
-
+    SmallPtrSetImpl<Instruction *> &ConditionalAssumes) const {
   for (Instruction &I : *BB) {
     // Check that we don't have a constant expression that can trap as operand.
     for (Value *Operand : I.operands()) {
@@ -956,11 +953,7 @@ bool LoopVectorizationLegality::blockCanBePredicated(
       if (!LI)
         return false;
       if (!SafePtrs.count(LI->getPointerOperand())) {
-        // !llvm.mem.parallel_loop_access implies if-conversion safety.
-        // Otherwise, record that the load needs (real or emulated) masking
-        // and let the cost model decide.
-        if (!IsAnnotatedParallel || PreserveGuards)
-          MaskedOp.insert(LI);
+        MaskedOp.insert(LI);
         continue;
       }
     }
@@ -1276,8 +1269,7 @@ bool LoopVectorizationLegality::prepareToFoldTailByMasking() {
   // do not need predication such as the header block.
   for (BasicBlock *BB : TheLoop->blocks()) {
     if (!blockCanBePredicated(BB, SafePointers, TmpMaskedOp,
-                              TmpConditionalAssumes,
-                              /* MaskAllLoads= */ true)) {
+                              TmpConditionalAssumes)) {
       LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking as requested.\n");
       return false;
     }
diff --git a/llvm/test/Transforms/LoopVectorize/X86/force-ifcvt.ll b/llvm/test/Transforms/LoopVectorize/X86/force-ifcvt.ll
deleted file mode 100644
index 07b98b4fb001..000000000000
--- a/llvm/test/Transforms/LoopVectorize/X86/force-ifcvt.ll
+++ /dev/null
@@ -1,42 +0,0 @@
-; RUN: opt -loop-vectorize -S < %s | FileCheck %s
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-
-; Function Attrs: norecurse nounwind uwtable
-define void @Test(i32* nocapture %res, i32* nocapture readnone %c, i32* nocapture readonly %d, i32* nocapture readonly %p) #0 {
-entry:
-  br label %for.body
-
-; CHECK-LABEL: @Test
-; CHECK: <4 x i32>
-
-for.body:                                         ; preds = %cond.end, %entry
-  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %cond.end ]
-  %arrayidx = getelementptr inbounds i32, i32* %p, i64 %indvars.iv
-  %0 = load i32, i32* %arrayidx, align 4, !llvm.access.group !1
-  %cmp1 = icmp eq i32 %0, 0
-  %arrayidx3 = getelementptr inbounds i32, i32* %res, i64 %indvars.iv
-  %1 = load i32, i32* %arrayidx3, align 4, !llvm.access.group !1
-  br i1 %cmp1, label %cond.end, label %cond.false
-
-cond.false:                                       ; preds = %for.body
-  %arrayidx7 = getelementptr inbounds i32, i32* %d, i64 %indvars.iv
-  %2 = load i32, i32* %arrayidx7, align 4, !llvm.access.group !1
-  %add = add nsw i32 %2, %1
-  br label %cond.end
-
-cond.end:                                         ; preds = %for.body, %cond.false
-  %cond = phi i32 [ %add, %cond.false ], [ %1, %for.body ]
-  store i32 %cond, i32* %arrayidx3, align 4, !llvm.access.group !1
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %exitcond = icmp eq i64 %indvars.iv.next, 16
-  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !0
-
-for.end:                                          ; preds = %cond.end
-  ret void
-}
-
-attributes #0 = { norecurse nounwind uwtable "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" }
-
-!0 = distinct !{!0, !{!"llvm.loop.parallel_accesses", !1}}
-!1 = distinct !{}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/tail_folding_and_assume_safety.ll b/llvm/test/Transforms/LoopVectorize/X86/tail_folding_and_assume_safety.ll
index 10477ddce9c9..5f9c477746db 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/tail_folding_and_assume_safety.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/tail_folding_and_assume_safety.ll
@@ -51,7 +51,7 @@ for.inc:
   br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !8
 }
 
-; Case2: With pragma assume_safety only the store is masked.
+; Case2: With pragma assume_safety both, load and store are masked.
 ; void assume_safety(int * p, int * q1, int * q2, int guard) {
 ;   #pragma clang loop vectorize(assume_safety)
 ;   for(int ix=0; ix < 1021; ++ix) {
@@ -63,7 +63,7 @@ for.inc:
 
 ;CHECK-LABEL: @assume_safety
 ;CHECK: vector.body:
-;CHECK-NOT: @llvm.masked.load
+;CHECK:  call <8 x i32> @llvm.masked.load
 ;CHECK:  call void @llvm.masked.store
 
 ; Function Attrs: norecurse nounwind uwtable

From 275ffa580880f6e18bf9742cad8e5dcab67b1f1d Mon Sep 17 00:00:00 2001
From: Louis Dionne <ldionne.2@gmail.com>
Date: Wed, 16 Jun 2021 12:35:00 -0400
Subject: [PATCH 294/318] [libc++] Make sure std::allocator<void> is always
 trivial

When we removed the allocator<void> specialization, the triviality of
std::allocator<void> changed because the primary template had a
non-trivial default constructor and the specialization didn't
(so std::allocator<void> went from trivial to non-trivial).

This commit fixes that oversight by giving a trivial constructor to
the primary template when instantiated on cv-void.

This was reported in https://llvm.org/PR50299.

(cherry picked from commit 71e4d434dc83b02a853712a5cb026ee2fa9ba67f)

Differential Revision: https://reviews.llvm.org/D104486
---
 libcxx/include/memory                         | 30 ++++++++++++++--
 .../allocator_void.trivial.compile.pass.cpp   | 34 +++++++++++++++++++
 2 files changed, 62 insertions(+), 2 deletions(-)
 create mode 100644 libcxx/test/libcxx/memory/allocator_void.trivial.compile.pass.cpp

diff --git a/libcxx/include/memory b/libcxx/include/memory
index efb10c8fd25b..e02846b4035c 100644
--- a/libcxx/include/memory
+++ b/libcxx/include/memory
@@ -810,10 +810,35 @@ public:
 };
 #endif
 
+// This class provides a non-trivial default constructor to the class that derives from it
+// if the condition is satisfied.
+//
+// The second template parameter exists to allow giving a unique type to __non_trivial_if,
+// which makes it possible to avoid breaking the ABI when making this a base class of an
+// existing class. Without that, imagine we have classes D1 and D2, both of which used to
+// have no base classes, but which now derive from __non_trivial_if. The layout of a class
+// that inherits from both D1 and D2 will change because the two __non_trivial_if base
+// classes are not allowed to share the same address.
+//
+// By making those __non_trivial_if base classes unique, we work around this problem and
+// it is safe to start deriving from __non_trivial_if in existing classes.
+template <bool _Cond, class _Unique>
+struct __non_trivial_if { };
+
+template <class _Unique>
+struct __non_trivial_if<true, _Unique> {
+    _LIBCPP_INLINE_VISIBILITY
+    _LIBCPP_CONSTEXPR __non_trivial_if() _NOEXCEPT { }
+};
+
 // allocator
+//
+// Note: For ABI compatibility between C++20 and previous standards, we make
+//       allocator<void> trivial in C++20.
 
 template <class _Tp>
 class _LIBCPP_TEMPLATE_VIS allocator
+    : private __non_trivial_if<!is_void<_Tp>::value, allocator<_Tp> >
 {
 public:
     typedef size_t      size_type;
@@ -823,7 +848,7 @@ public:
     typedef true_type   is_always_equal;
 
     _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR_AFTER_CXX17
-    allocator() _NOEXCEPT { }
+    allocator() _NOEXCEPT _LIBCPP_DEFAULT
 
     template <class _Up>
     _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR_AFTER_CXX17
@@ -895,6 +920,7 @@ public:
 
 template <class _Tp>
 class _LIBCPP_TEMPLATE_VIS allocator<const _Tp>
+    : private __non_trivial_if<!is_void<_Tp>::value, allocator<const _Tp> >
 {
 public:
     typedef size_t      size_type;
@@ -904,7 +930,7 @@ public:
     typedef true_type   is_always_equal;
 
     _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR_AFTER_CXX17
-    allocator() _NOEXCEPT { }
+    allocator() _NOEXCEPT _LIBCPP_DEFAULT
 
     template <class _Up>
     _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR_AFTER_CXX17
diff --git a/libcxx/test/libcxx/memory/allocator_void.trivial.compile.pass.cpp b/libcxx/test/libcxx/memory/allocator_void.trivial.compile.pass.cpp
new file mode 100644
index 000000000000..f9d67c065de8
--- /dev/null
+++ b/libcxx/test/libcxx/memory/allocator_void.trivial.compile.pass.cpp
@@ -0,0 +1,34 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// Make sure that std::allocator<void> is trivial. This was the case before C++20
+// with the std::allocator<void> explicit specialization, and this test makes sure
+// that we maintain that property across all standards.
+//
+// This is important since triviality has implications on how the type is passed
+// as a function argument in the ABI.
+
+#include <memory>
+#include <type_traits>
+
+typedef std::allocator<void> A1;
+typedef std::allocator<void const> A2;
+struct A3 : std::allocator<void> { };
+struct A4 : std::allocator<void const> { };
+
+static_assert(std::is_trivially_default_constructible<A1>::value, "");
+static_assert(std::is_trivial<A1>::value, "");
+
+static_assert(std::is_trivially_default_constructible<A2>::value, "");
+static_assert(std::is_trivial<A2>::value, "");
+
+static_assert(std::is_trivially_default_constructible<A3>::value, "");
+static_assert(std::is_trivial<A3>::value, "");
+
+static_assert(std::is_trivially_default_constructible<A4>::value, "");
+static_assert(std::is_trivial<A4>::value, "");

From 894c0c889707dd7797c54af8ebe2e91f80e97236 Mon Sep 17 00:00:00 2001
From: Louis Dionne <ldionne.2@gmail.com>
Date: Tue, 22 Jun 2021 12:43:54 -0400
Subject: [PATCH 295/318] [libc++] Fix CI on release/12.x branch

The new Docker images don't have gcc or g++, only the versioned equivalents.
---
 libcxx/utils/ci/run-buildbot | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/libcxx/utils/ci/run-buildbot b/libcxx/utils/ci/run-buildbot
index 1fb09f3a91aa..d3880816779f 100755
--- a/libcxx/utils/ci/run-buildbot
+++ b/libcxx/utils/ci/run-buildbot
@@ -168,8 +168,8 @@ generic-32bit)
     check-cxx-cxxabi
 ;;
 generic-gcc)
-    export CC=gcc
-    export CXX=g++
+    export CC=gcc-10
+    export CXX=g++-10
     clean
     # FIXME: Re-enable experimental testing on GCC. GCC cares about the order
     #        in which we link -lc++experimental, which causes issues.

From 884040db086936107ec81656aa5b4c607235fb9a Mon Sep 17 00:00:00 2001
From: Tamar Christina <tamar.christina@arm.com>
Date: Thu, 20 May 2021 18:55:11 +0100
Subject: [PATCH 296/318] libsanitizer: Remove cyclades inclusion in sanitizer

The Linux kernel has removed the interface to cyclades from
the latest kernel headers[1] due to them being orphaned for the
past 13 years.

libsanitizer uses this header when compiling against glibc, but
glibcs itself doesn't seem to have any references to cyclades.

Further more it seems that the driver is broken in the kernel and
the firmware doesn't seem to be available anymore.

As such since this is breaking the build of libsanitizer (and so the
GCC bootstrap[2]) I propose to remove this.

[1] https://lkml.org/lkml/2021/3/2/153
[2] https://gcc.gnu.org/bugzilla/show_bug.cgi?id=100379

Reviewed By: eugenis

Differential Revision: https://reviews.llvm.org/D102059

(cherry picked from commit 68d5235cb58f988c71b403334cd9482d663841ab)
---
 .../sanitizer_common_interceptors_ioctl.inc           |  9 ---------
 .../sanitizer_platform_limits_posix.cpp               | 11 -----------
 .../sanitizer_platform_limits_posix.h                 | 10 ----------
 3 files changed, 30 deletions(-)

diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors_ioctl.inc b/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors_ioctl.inc
index 7f181258eab5..b7da65987557 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors_ioctl.inc
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors_ioctl.inc
@@ -370,15 +370,6 @@ static void ioctl_table_fill() {
 
 #if SANITIZER_GLIBC
   // _(SIOCDEVPLIP, WRITE, struct_ifreq_sz); // the same as EQL_ENSLAVE
-  _(CYGETDEFTHRESH, WRITE, sizeof(int));
-  _(CYGETDEFTIMEOUT, WRITE, sizeof(int));
-  _(CYGETMON, WRITE, struct_cyclades_monitor_sz);
-  _(CYGETTHRESH, WRITE, sizeof(int));
-  _(CYGETTIMEOUT, WRITE, sizeof(int));
-  _(CYSETDEFTHRESH, NONE, 0);
-  _(CYSETDEFTIMEOUT, NONE, 0);
-  _(CYSETTHRESH, NONE, 0);
-  _(CYSETTIMEOUT, NONE, 0);
   _(EQL_EMANCIPATE, WRITE, struct_ifreq_sz);
   _(EQL_ENSLAVE, WRITE, struct_ifreq_sz);
   _(EQL_GETMASTRCFG, WRITE, struct_ifreq_sz);
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_posix.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_posix.cpp
index 12dd39e674ac..7abaeb880bf3 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_posix.cpp
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_posix.cpp
@@ -143,7 +143,6 @@ typedef struct user_fpregs elf_fpregset_t;
 # include <sys/procfs.h>
 #endif
 #include <sys/user.h>
-#include <linux/cyclades.h>
 #include <linux/if_eql.h>
 #include <linux/if_plip.h>
 #include <linux/lp.h>
@@ -459,7 +458,6 @@ unsigned struct_ElfW_Phdr_sz = sizeof(Elf_Phdr);
 
 #if SANITIZER_GLIBC
   unsigned struct_ax25_parms_struct_sz = sizeof(struct ax25_parms_struct);
-  unsigned struct_cyclades_monitor_sz = sizeof(struct cyclades_monitor);
 #if EV_VERSION > (0x010000)
   unsigned struct_input_keymap_entry_sz = sizeof(struct input_keymap_entry);
 #else
@@ -823,15 +821,6 @@ unsigned struct_ElfW_Phdr_sz = sizeof(Elf_Phdr);
 #endif // SANITIZER_LINUX
 
 #if SANITIZER_LINUX && !SANITIZER_ANDROID
-  unsigned IOCTL_CYGETDEFTHRESH = CYGETDEFTHRESH;
-  unsigned IOCTL_CYGETDEFTIMEOUT = CYGETDEFTIMEOUT;
-  unsigned IOCTL_CYGETMON = CYGETMON;
-  unsigned IOCTL_CYGETTHRESH = CYGETTHRESH;
-  unsigned IOCTL_CYGETTIMEOUT = CYGETTIMEOUT;
-  unsigned IOCTL_CYSETDEFTHRESH = CYSETDEFTHRESH;
-  unsigned IOCTL_CYSETDEFTIMEOUT = CYSETDEFTIMEOUT;
-  unsigned IOCTL_CYSETTHRESH = CYSETTHRESH;
-  unsigned IOCTL_CYSETTIMEOUT = CYSETTIMEOUT;
   unsigned IOCTL_EQL_EMANCIPATE = EQL_EMANCIPATE;
   unsigned IOCTL_EQL_ENSLAVE = EQL_ENSLAVE;
   unsigned IOCTL_EQL_GETMASTRCFG = EQL_GETMASTRCFG;
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_posix.h b/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_posix.h
index 836b178c131b..8a156b7fcb80 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_posix.h
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_posix.h
@@ -983,7 +983,6 @@ extern unsigned struct_vt_mode_sz;
 
 #if SANITIZER_LINUX && !SANITIZER_ANDROID
 extern unsigned struct_ax25_parms_struct_sz;
-extern unsigned struct_cyclades_monitor_sz;
 extern unsigned struct_input_keymap_entry_sz;
 extern unsigned struct_ipx_config_data_sz;
 extern unsigned struct_kbdiacrs_sz;
@@ -1328,15 +1327,6 @@ extern unsigned IOCTL_VT_WAITACTIVE;
 #endif  // SANITIZER_LINUX
 
 #if SANITIZER_LINUX && !SANITIZER_ANDROID
-extern unsigned IOCTL_CYGETDEFTHRESH;
-extern unsigned IOCTL_CYGETDEFTIMEOUT;
-extern unsigned IOCTL_CYGETMON;
-extern unsigned IOCTL_CYGETTHRESH;
-extern unsigned IOCTL_CYGETTIMEOUT;
-extern unsigned IOCTL_CYSETDEFTHRESH;
-extern unsigned IOCTL_CYSETDEFTIMEOUT;
-extern unsigned IOCTL_CYSETTHRESH;
-extern unsigned IOCTL_CYSETTIMEOUT;
 extern unsigned IOCTL_EQL_EMANCIPATE;
 extern unsigned IOCTL_EQL_ENSLAVE;
 extern unsigned IOCTL_EQL_GETMASTRCFG;

From 0680e2b5a1182abc1d2fdb6730ef7d742a4d6345 Mon Sep 17 00:00:00 2001
From: jasonliu <jasonliu.development@gmail.com>
Date: Mon, 12 Apr 2021 19:22:12 +0000
Subject: [PATCH 297/318] [libc++] add `inline` for __open's definition in
 ifstream and ofstream

Summary:

When building with gcc on AIX, it seems that gcc does not like the
`always_inline` without the `inline` keyword.
So adding the inline keywords in for __open in ifstream and ofstream.
That will also make it consistent with __open in basic_filebuf
(it seems we added `inline` there before for gcc build as well).

Differential Revision: https://reviews.llvm.org/D99422

(cherry picked from commit 52e9d80d5db20f23979e409f958736d130387f9e)
---
 libcxx/include/fstream | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/libcxx/include/fstream b/libcxx/include/fstream
index d7d6b46c32d9..7b1bbfe16c01 100644
--- a/libcxx/include/fstream
+++ b/libcxx/include/fstream
@@ -244,7 +244,7 @@ public:
       return open(__p.c_str(), __mode);
     }
 #endif
-    inline _LIBCPP_INLINE_VISIBILITY
+    _LIBCPP_INLINE_VISIBILITY
     basic_filebuf* __open(int __fd, ios_base::openmode __mode);
 #endif
     basic_filebuf* close();
@@ -574,7 +574,7 @@ basic_filebuf<_CharT, _Traits>::open(const char* __s, ios_base::openmode __mode)
 }
 
 template <class _CharT, class _Traits>
-inline _LIBCPP_INLINE_VISIBILITY
+inline
 basic_filebuf<_CharT, _Traits>*
 basic_filebuf<_CharT, _Traits>::__open(int __fd, ios_base::openmode __mode) {
   basic_filebuf<_CharT, _Traits>* __rt = nullptr;
@@ -1326,6 +1326,7 @@ basic_ifstream<_CharT, _Traits>::open(const string& __s, ios_base::openmode __mo
 }
 
 template <class _CharT, class _Traits>
+inline
 void basic_ifstream<_CharT, _Traits>::__open(int __fd,
                                              ios_base::openmode __mode) {
   if (__sb_.__open(__fd, __mode | ios_base::in))
@@ -1539,6 +1540,7 @@ basic_ofstream<_CharT, _Traits>::open(const string& __s, ios_base::openmode __mo
 }
 
 template <class _CharT, class _Traits>
+inline
 void basic_ofstream<_CharT, _Traits>::__open(int __fd,
                                              ios_base::openmode __mode) {
   if (__sb_.__open(__fd, __mode | ios_base::out))

From 5cb4200739750a7866817c22de32b50819cd76b5 Mon Sep 17 00:00:00 2001
From: zoecarver <z.zoelec2@gmail.com>
Date: Thu, 8 Apr 2021 22:01:35 -0700
Subject: [PATCH 298/318] [libcxx] Allow shared_ptr's unique_ptr converting
 constructor to support array types.

Refs: https://bugs.llvm.org/show_bug.cgi?id=32147

Differential Revision: https://reviews.llvm.org/D80882

(cherry picked from commit 097d77d611d1e1b3972be661fdc3caaa4d1824b4)
---
 libcxx/include/memory                         | 10 +--
 .../unique_ptr_Y.pass.cpp                     | 90 +++++++++++++++++++
 .../unique_ptr.pass.cpp                       | 88 +++++++++++++++++-
 3 files changed, 178 insertions(+), 10 deletions(-)

diff --git a/libcxx/include/memory b/libcxx/include/memory
index e02846b4035c..62235cf72b35 100644
--- a/libcxx/include/memory
+++ b/libcxx/include/memory
@@ -2771,7 +2771,6 @@ public:
                    typename enable_if
                    <
                        !is_lvalue_reference<_Dp>::value &&
-                       !is_array<_Yp>::value &&
                        is_convertible<typename unique_ptr<_Yp, _Dp>::pointer, element_type*>::value,
                        __nat
                    >::type = __nat());
@@ -2780,7 +2779,6 @@ public:
                    typename enable_if
                    <
                        is_lvalue_reference<_Dp>::value &&
-                       !is_array<_Yp>::value &&
                        is_convertible<typename unique_ptr<_Yp, _Dp>::pointer, element_type*>::value,
                        __nat
                    >::type = __nat());
@@ -2821,7 +2819,6 @@ public:
     template <class _Yp, class _Dp>
         typename enable_if
         <
-            !is_array<_Yp>::value &&
             is_convertible<typename unique_ptr<_Yp, _Dp>::pointer, element_type*>::value,
             shared_ptr&
         >::type
@@ -3183,7 +3180,6 @@ shared_ptr<_Tp>::shared_ptr(unique_ptr<_Yp, _Dp>&& __r,
                             typename enable_if
                             <
                                 !is_lvalue_reference<_Dp>::value &&
-                                !is_array<_Yp>::value &&
                                 is_convertible<typename unique_ptr<_Yp, _Dp>::pointer, element_type*>::value,
                                 __nat
                             >::type)
@@ -3196,7 +3192,7 @@ shared_ptr<_Tp>::shared_ptr(unique_ptr<_Yp, _Dp>&& __r,
 #endif
     {
         typedef typename __shared_ptr_default_allocator<_Yp>::type _AllocT;
-        typedef __shared_ptr_pointer<_Yp*, _Dp, _AllocT > _CntrlBlk;
+        typedef __shared_ptr_pointer<typename unique_ptr<_Yp, _Dp>::pointer, _Dp, _AllocT > _CntrlBlk;
         __cntrl_ = new _CntrlBlk(__r.get(), __r.get_deleter(), _AllocT());
         __enable_weak_this(__r.get(), __r.get());
     }
@@ -3209,7 +3205,6 @@ shared_ptr<_Tp>::shared_ptr(unique_ptr<_Yp, _Dp>&& __r,
                             typename enable_if
                             <
                                 is_lvalue_reference<_Dp>::value &&
-                                !is_array<_Yp>::value &&
                                 is_convertible<typename unique_ptr<_Yp, _Dp>::pointer, element_type*>::value,
                                 __nat
                             >::type)
@@ -3222,7 +3217,7 @@ shared_ptr<_Tp>::shared_ptr(unique_ptr<_Yp, _Dp>&& __r,
 #endif
     {
         typedef typename __shared_ptr_default_allocator<_Yp>::type _AllocT;
-        typedef __shared_ptr_pointer<_Yp*,
+        typedef __shared_ptr_pointer<typename unique_ptr<_Yp, _Dp>::pointer,
                                      reference_wrapper<typename remove_reference<_Dp>::type>,
                                      _AllocT > _CntrlBlk;
         __cntrl_ = new _CntrlBlk(__r.get(), _VSTD::ref(__r.get_deleter()), _AllocT());
@@ -3306,7 +3301,6 @@ template <class _Yp, class _Dp>
 inline
 typename enable_if
 <
-    !is_array<_Yp>::value &&
     is_convertible<typename unique_ptr<_Yp, _Dp>::pointer,
                    typename shared_ptr<_Tp>::element_type*>::value,
     shared_ptr<_Tp>&
diff --git a/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.assign/unique_ptr_Y.pass.cpp b/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.assign/unique_ptr_Y.pass.cpp
index 0096897ee07e..f9e1798d8a7a 100644
--- a/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.assign/unique_ptr_Y.pass.cpp
+++ b/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.assign/unique_ptr_Y.pass.cpp
@@ -41,6 +41,19 @@ struct A
 
 int A::count = 0;
 
+template <class T>
+struct StatefulArrayDeleter {
+  int state = 0;
+
+  StatefulArrayDeleter(int val = 0) : state(val) {}
+  StatefulArrayDeleter(StatefulArrayDeleter const&) { assert(false); }
+
+  void operator()(T* ptr) {
+    assert(state == 42);
+    delete []ptr;
+  }
+};
+
 int main(int, char**)
 {
     {
@@ -112,5 +125,82 @@ int main(int, char**)
     assert(B::count == 0);
     assert(A::count == 0);
 
+    {
+      std::unique_ptr<A[]> ptr(new A[8]);
+      A* raw_ptr = ptr.get();
+      std::shared_ptr<B> p;
+      p = std::move(ptr);
+      assert(A::count == 8);
+      assert(B::count == 8);
+      assert(p.use_count() == 1);
+      assert(p.get() == raw_ptr);
+      assert(ptr.get() == 0);
+    }
+    assert(A::count == 0);
+    assert(B::count == 0);
+
+    {
+      std::unique_ptr<A[]> ptr(new A[8]);
+      A* raw_ptr = ptr.get();
+      std::shared_ptr<A> p;
+      p = std::move(ptr);
+      assert(A::count == 8);
+      assert(p.use_count() == 1);
+      assert(p.get() == raw_ptr);
+      assert(ptr.get() == 0);
+    }
+    assert(A::count == 0);
+
+    {
+      std::unique_ptr<int[]> ptr(new int[8]);
+      std::shared_ptr<int> p;
+      p = std::move(ptr);
+    }
+
+#if TEST_STD_VER > 14
+    {
+      StatefulArrayDeleter<A> d;
+      std::unique_ptr<A[], StatefulArrayDeleter<A>&> u(new A[4], d);
+      std::shared_ptr<A[]> p;
+      p = std::move(u);
+      d.state = 42;
+      assert(A::count == 4);
+    }
+    assert(A::count == 0);
+    assert(B::count == 0);
+
+    {
+      std::unique_ptr<A[]> ptr(new A[8]);
+      A* raw_ptr = ptr.get();
+      std::shared_ptr<B[]> p;
+      p = std::move(ptr);
+      assert(A::count == 8);
+      assert(B::count == 8);
+      assert(p.use_count() == 1);
+      assert(p.get() == raw_ptr);
+      assert(ptr.get() == 0);
+    }
+    assert(A::count == 0);
+    assert(B::count == 0);
+
+    {
+      std::unique_ptr<A[]> ptr(new A[8]);
+      A* raw_ptr = ptr.get();
+      std::shared_ptr<A[]> p;
+      p = std::move(ptr);
+      assert(A::count == 8);
+      assert(p.use_count() == 1);
+      assert(p.get() == raw_ptr);
+      assert(ptr.get() == 0);
+    }
+    assert(A::count == 0);
+
+    {
+      std::unique_ptr<int[]> ptr(new int[8]);
+      std::shared_ptr<int[]> p;
+      p = std::move(ptr);
+    }
+#endif // TEST_STD_VER >= 14
+
   return 0;
 }
diff --git a/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.const/unique_ptr.pass.cpp b/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.const/unique_ptr.pass.cpp
index 398c64ee6d74..ad88a3e8a7df 100644
--- a/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.const/unique_ptr.pass.cpp
+++ b/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.const/unique_ptr.pass.cpp
@@ -10,7 +10,7 @@
 
 // <memory>
 
-// template <class Y, class D> explicit shared_ptr(unique_ptr<Y, D>&&r);
+// template <class Y, class D> shared_ptr(unique_ptr<Y, D>&&r);
 
 #include <memory>
 #include <new>
@@ -69,6 +69,19 @@ struct StatefulDeleter {
   }
 };
 
+template <class T>
+struct StatefulArrayDeleter {
+  int state = 0;
+
+  StatefulArrayDeleter(int val = 0) : state(val) {}
+  StatefulArrayDeleter(StatefulArrayDeleter const&) { assert(false); }
+
+  void operator()(T* ptr) {
+    assert(state == 42);
+    delete []ptr;
+  }
+};
+
 int main(int, char**)
 {
     {
@@ -135,5 +148,76 @@ int main(int, char**)
     std::shared_ptr<int> s = std::move(u);
     }
 
-  return 0;
+    assert(A::count == 0);
+    {
+      std::unique_ptr<A[]> ptr(new A[8]);
+      A* raw_ptr = ptr.get();
+      std::shared_ptr<B> p(std::move(ptr));
+      assert(A::count == 8);
+      assert(B::count == 8);
+      assert(p.use_count() == 1);
+      assert(p.get() == raw_ptr);
+      assert(ptr.get() == 0);
+    }
+    assert(A::count == 0);
+    assert(B::count == 0);
+
+    {
+      std::unique_ptr<A[]> ptr(new A[8]);
+      A* raw_ptr = ptr.get();
+      std::shared_ptr<A> p(std::move(ptr));
+      assert(A::count == 8);
+      assert(p.use_count() == 1);
+      assert(p.get() == raw_ptr);
+      assert(ptr.get() == 0);
+    }
+    assert(A::count == 0);
+
+    {
+      std::unique_ptr<int[]> ptr(new int[8]);
+      std::shared_ptr<int> p(std::move(ptr));
+    }
+
+#if TEST_STD_VER > 14
+    {
+      StatefulArrayDeleter<A> d;
+      std::unique_ptr<A[], StatefulArrayDeleter<A>&> u(new A[4], d);
+      std::shared_ptr<A[]> p(std::move(u));
+      d.state = 42;
+      assert(A::count == 4);
+    }
+    assert(A::count == 0);
+    assert(B::count == 0);
+
+    {
+      std::unique_ptr<A[]> ptr(new A[8]);
+      A* raw_ptr = ptr.get();
+      std::shared_ptr<B[]> p(std::move(ptr));
+      assert(A::count == 8);
+      assert(B::count == 8);
+      assert(p.use_count() == 1);
+      assert(p.get() == raw_ptr);
+      assert(ptr.get() == 0);
+    }
+    assert(A::count == 0);
+    assert(B::count == 0);
+
+    {
+      std::unique_ptr<A[]> ptr(new A[8]);
+      A* raw_ptr = ptr.get();
+      std::shared_ptr<A[]> p(std::move(ptr));
+      assert(A::count == 8);
+      assert(p.use_count() == 1);
+      assert(p.get() == raw_ptr);
+      assert(ptr.get() == 0);
+    }
+    assert(A::count == 0);
+
+    {
+      std::unique_ptr<int[]> ptr(new int[8]);
+      std::shared_ptr<int[]> p(std::move(ptr));
+    }
+#endif // TEST_STD_VER >= 14
+
+    return 0;
 }

From fa21c5d4cf8cf00347e89bc3c50fbaf6a5c185dd Mon Sep 17 00:00:00 2001
From: Louis Dionne <ldionne.2@gmail.com>
Date: Tue, 19 Jan 2021 13:04:01 -0500
Subject: [PATCH 299/318] [libc++] Make feature-test macros consistent with
 availability macros

Before this patch, feature-test macros didn't take special availability
markup into account, which means that feature-test macros can sometimes
appear to "lie". For example, if you compile in C++20 mode and target
macOS 10.13, the __cpp_lib_filesystem feature-test macro will be provided
even though the <filesystem> declarations are marked as unavailable.
This patch fixes that.

rdar://68142369

Differential Revision: https://reviews.llvm.org/D94983

(cherry picked from commit 76fc35752d19ac605c1c1fd757af9c7c3bb4a906)
---
 libcxx/include/__availability                 |  43 +++++++
 libcxx/include/version                        |  16 +--
 .../fs.req.macros/feature_macro.pass.cpp      |  30 -----
 .../atomic.version.pass.cpp                   |   8 +-
 .../barrier.version.pass.cpp                  |   8 +-
 .../filesystem.version.pass.cpp               |  48 +++++---
 .../latch.version.pass.cpp                    |   8 +-
 .../semaphore.version.pass.cpp                |   8 +-
 .../shared_mutex.version.pass.cpp             |  28 ++---
 .../version.version.pass.cpp                  | 108 ++++++++++--------
 .../generate_feature_test_macro_components.py |  26 +++--
 11 files changed, 192 insertions(+), 139 deletions(-)
 delete mode 100644 libcxx/test/std/input.output/filesystems/fs.req.macros/feature_macro.pass.cpp

diff --git a/libcxx/include/__availability b/libcxx/include/__availability
index db2267c8eb16..cc3b6fabdab1 100644
--- a/libcxx/include/__availability
+++ b/libcxx/include/__availability
@@ -43,6 +43,14 @@
 // as unavailable. When vendors decide to ship the feature as part of their
 // shared library, they can update the markup appropriately.
 //
+// Furthermore, many features in the standard library have corresponding
+// feature-test macros. When a feature is made unavailable on some deployment
+// target, a macro should be defined to signal that it is unavailable. That
+// macro can then be picked up when feature-test macros are generated (see
+// generate_feature_test_macro_components.py) to make sure that feature-test
+// macros don't announce a feature as being implemented if it has been marked
+// as unavailable.
+//
 // Note that this mechanism is disabled by default in the "upstream" libc++.
 // Availability annotations are only meaningful when shipping libc++ inside
 // a platform (i.e. as a system library), and so vendors that want them should
@@ -76,6 +84,8 @@
     // This controls the availability of std::shared_mutex and std::shared_timed_mutex,
     // which were added to the dylib later.
 #   define _LIBCPP_AVAILABILITY_SHARED_MUTEX
+// #   define _LIBCPP_AVAILABILITY_DISABLE_FTM___cpp_lib_shared_mutex
+// #   define _LIBCPP_AVAILABILITY_DISABLE_FTM___cpp_lib_shared_timed_mutex
 
     // These macros control the availability of std::bad_optional_access and
     // other exception types. These were put in the shared library to prevent
@@ -114,6 +124,7 @@
 #   define _LIBCPP_AVAILABILITY_FILESYSTEM
 #   define _LIBCPP_AVAILABILITY_FILESYSTEM_PUSH
 #   define _LIBCPP_AVAILABILITY_FILESYSTEM_POP
+// #   define _LIBCPP_AVAILABILITY_DISABLE_FTM___cpp_lib_filesystem
 
     // This controls the availability of std::to_chars.
 #   define _LIBCPP_AVAILABILITY_TO_CHARS
@@ -122,6 +133,10 @@
     // which requires shared library support for various operations
     // (see libcxx/src/atomic.cpp).
 #   define _LIBCPP_AVAILABILITY_SYNC
+// #   define _LIBCPP_AVAILABILITY_DISABLE_FTM___cpp_lib_atomic_wait
+// #   define _LIBCPP_AVAILABILITY_DISABLE_FTM___cpp_lib_barrier
+// #   define _LIBCPP_AVAILABILITY_DISABLE_FTM___cpp_lib_latch
+// #   define _LIBCPP_AVAILABILITY_DISABLE_FTM___cpp_lib_semaphore
 
 #elif defined(__APPLE__)
 
@@ -130,6 +145,14 @@
         __attribute__((availability(ios,strict,introduced=10.0)))               \
         __attribute__((availability(tvos,strict,introduced=10.0)))              \
         __attribute__((availability(watchos,strict,introduced=3.0)))
+#   if (defined(__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ < 101200) ||    \
+        (defined(__ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__ < 100000) || \
+        (defined(__ENVIRONMENT_TV_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_TV_OS_VERSION_MIN_REQUIRED__ < 100000) ||         \
+        (defined(__ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__ < 30000)
+#       define _LIBCPP_AVAILABILITY_DISABLE_FTM___cpp_lib_shared_mutex
+#       define _LIBCPP_AVAILABILITY_DISABLE_FTM___cpp_lib_shared_timed_mutex
+#   endif
+
 #   define _LIBCPP_AVAILABILITY_BAD_OPTIONAL_ACCESS                             \
         __attribute__((availability(macosx,strict,introduced=10.13)))           \
         __attribute__((availability(ios,strict,introduced=11.0)))               \
@@ -139,27 +162,34 @@
         _LIBCPP_AVAILABILITY_BAD_OPTIONAL_ACCESS
 #   define _LIBCPP_AVAILABILITY_BAD_ANY_CAST                                    \
         _LIBCPP_AVAILABILITY_BAD_OPTIONAL_ACCESS
+
 #   define _LIBCPP_AVAILABILITY_UNCAUGHT_EXCEPTIONS                             \
         __attribute__((availability(macosx,strict,introduced=10.12)))           \
         __attribute__((availability(ios,strict,introduced=10.0)))               \
         __attribute__((availability(tvos,strict,introduced=10.0)))              \
         __attribute__((availability(watchos,strict,introduced=3.0)))
+
 #   define _LIBCPP_AVAILABILITY_SIZED_NEW_DELETE                                \
         __attribute__((availability(macosx,strict,introduced=10.12)))           \
         __attribute__((availability(ios,strict,introduced=10.0)))               \
         __attribute__((availability(tvos,strict,introduced=10.0)))              \
         __attribute__((availability(watchos,strict,introduced=3.0)))
+
 #   define _LIBCPP_AVAILABILITY_FUTURE_ERROR                                    \
         __attribute__((availability(ios,strict,introduced=6.0)))
+
 #   define _LIBCPP_AVAILABILITY_TYPEINFO_VTABLE                                 \
         __attribute__((availability(macosx,strict,introduced=10.9)))            \
         __attribute__((availability(ios,strict,introduced=7.0)))
+
 #   define _LIBCPP_AVAILABILITY_LOCALE_CATEGORY                                 \
         __attribute__((availability(macosx,strict,introduced=10.9)))            \
         __attribute__((availability(ios,strict,introduced=7.0)))
+
 #   define _LIBCPP_AVAILABILITY_ATOMIC_SHARED_PTR                               \
         __attribute__((availability(macosx,strict,introduced=10.9)))            \
         __attribute__((availability(ios,strict,introduced=7.0)))
+
 #   define _LIBCPP_AVAILABILITY_FILESYSTEM                                      \
         __attribute__((availability(macosx,strict,introduced=10.15)))           \
         __attribute__((availability(ios,strict,introduced=13.0)))               \
@@ -175,10 +205,23 @@
         _Pragma("clang attribute pop")                                          \
         _Pragma("clang attribute pop")                                          \
         _Pragma("clang attribute pop")
+#   if (defined(__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ < 101500) ||    \
+        (defined(__ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__ < 130000) || \
+        (defined(__ENVIRONMENT_TV_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_TV_OS_VERSION_MIN_REQUIRED__ < 130000) ||         \
+        (defined(__ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__ < 60000)
+#       define _LIBCPP_AVAILABILITY_DISABLE_FTM___cpp_lib_filesystem
+#   endif
+
 #   define _LIBCPP_AVAILABILITY_TO_CHARS                                        \
         _LIBCPP_AVAILABILITY_FILESYSTEM
+
+    // Note: Those are not ABI-stable yet, so we can't ship them.
 #   define _LIBCPP_AVAILABILITY_SYNC                                            \
         __attribute__((unavailable))
+#   define _LIBCPP_AVAILABILITY_DISABLE_FTM___cpp_lib_atomic_wait
+#   define _LIBCPP_AVAILABILITY_DISABLE_FTM___cpp_lib_barrier
+#   define _LIBCPP_AVAILABILITY_DISABLE_FTM___cpp_lib_latch
+#   define _LIBCPP_AVAILABILITY_DISABLE_FTM___cpp_lib_semaphore
 
 #else
 
diff --git a/libcxx/include/version b/libcxx/include/version
index 813bc1ab9e6a..c021db8bddd7 100644
--- a/libcxx/include/version
+++ b/libcxx/include/version
@@ -184,7 +184,7 @@ __cpp_lib_void_t                                        201411L <type_traits>
 # define __cpp_lib_quoted_string_io                     201304L
 # define __cpp_lib_result_of_sfinae                     201210L
 # define __cpp_lib_robust_nonmodifying_seq_ops          201304L
-# if !defined(_LIBCPP_HAS_NO_THREADS)
+# if !defined(_LIBCPP_HAS_NO_THREADS) && !defined(_LIBCPP_AVAILABILITY_DISABLE_FTM___cpp_lib_shared_timed_mutex)
 #   define __cpp_lib_shared_timed_mutex                 201402L
 # endif
 # define __cpp_lib_string_udls                          201304L
@@ -213,7 +213,9 @@ __cpp_lib_void_t                                        201411L <type_traits>
 # define __cpp_lib_clamp                                201603L
 # define __cpp_lib_enable_shared_from_this              201603L
 // # define __cpp_lib_execution                            201603L
-# define __cpp_lib_filesystem                           201703L
+# if !defined(_LIBCPP_AVAILABILITY_DISABLE_FTM___cpp_lib_filesystem)
+#   define __cpp_lib_filesystem                         201703L
+# endif
 # define __cpp_lib_gcd_lcm                              201606L
 // # define __cpp_lib_hardware_interference_size           201703L
 # if defined(_LIBCPP_HAS_UNIQUE_OBJECT_REPRESENTATIONS)
@@ -241,7 +243,7 @@ __cpp_lib_void_t                                        201411L <type_traits>
 # define __cpp_lib_raw_memory_algorithms                201606L
 # define __cpp_lib_sample                               201603L
 # define __cpp_lib_scoped_lock                          201703L
-# if !defined(_LIBCPP_HAS_NO_THREADS)
+# if !defined(_LIBCPP_HAS_NO_THREADS) && !defined(_LIBCPP_AVAILABILITY_DISABLE_FTM___cpp_lib_shared_mutex)
 #   define __cpp_lib_shared_mutex                       201505L
 # endif
 # define __cpp_lib_shared_ptr_arrays                    201611L
@@ -279,10 +281,10 @@ __cpp_lib_void_t                                        201411L <type_traits>
 # if !defined(_LIBCPP_HAS_NO_THREADS)
 // #   define __cpp_lib_atomic_value_initialization        201911L
 # endif
-# if !defined(_LIBCPP_HAS_NO_THREADS)
+# if !defined(_LIBCPP_HAS_NO_THREADS) && !defined(_LIBCPP_AVAILABILITY_DISABLE_FTM___cpp_lib_atomic_wait)
 #   define __cpp_lib_atomic_wait                        201907L
 # endif
-# if !defined(_LIBCPP_HAS_NO_THREADS)
+# if !defined(_LIBCPP_HAS_NO_THREADS) && !defined(_LIBCPP_AVAILABILITY_DISABLE_FTM___cpp_lib_barrier)
 #   define __cpp_lib_barrier                            201907L
 # endif
 // # define __cpp_lib_bind_front                           201907L
@@ -326,7 +328,7 @@ __cpp_lib_void_t                                        201411L <type_traits>
 # if !defined(_LIBCPP_HAS_NO_THREADS)
 // #   define __cpp_lib_jthread                            201911L
 # endif
-# if !defined(_LIBCPP_HAS_NO_THREADS)
+# if !defined(_LIBCPP_HAS_NO_THREADS) && !defined(_LIBCPP_AVAILABILITY_DISABLE_FTM___cpp_lib_latch)
 #   define __cpp_lib_latch                              201907L
 # endif
 # define __cpp_lib_list_remove_return_type              201806L
@@ -336,7 +338,7 @@ __cpp_lib_void_t                                        201411L <type_traits>
 // # define __cpp_lib_polymorphic_allocator                201902L
 // # define __cpp_lib_ranges                               201811L
 # define __cpp_lib_remove_cvref                         201711L
-# if !defined(_LIBCPP_HAS_NO_THREADS)
+# if !defined(_LIBCPP_HAS_NO_THREADS) && !defined(_LIBCPP_AVAILABILITY_DISABLE_FTM___cpp_lib_semaphore)
 #   define __cpp_lib_semaphore                          201907L
 # endif
 # define __cpp_lib_shift                                201806L
diff --git a/libcxx/test/std/input.output/filesystems/fs.req.macros/feature_macro.pass.cpp b/libcxx/test/std/input.output/filesystems/fs.req.macros/feature_macro.pass.cpp
deleted file mode 100644
index 6b7052b2403d..000000000000
--- a/libcxx/test/std/input.output/filesystems/fs.req.macros/feature_macro.pass.cpp
+++ /dev/null
@@ -1,30 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// <filesystem>
-
-// #define __cpp_lib_filesystem 201703L
-
-#include <filesystem>
-#include "test_macros.h"
-
-#if TEST_STD_VER >= 17
-#ifndef __cpp_lib_filesystem
-#error Filesystem feature test macro is not defined  (__cpp_lib_filesystem)
-#elif __cpp_lib_filesystem != 201703L
-#error Filesystem feature test macro has an incorrect value (__cpp_lib_filesystem)
-#endif
-#else // TEST_STD_VER < 17
-#ifdef __cpp_lib_filesystem
-#error Filesystem feature test macro should not be defined before C++17
-#endif
-#endif
-
-int main(int, char**) {
-  return 0;
-}
diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/atomic.version.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/atomic.version.pass.cpp
index b964e0c6921e..c8a837115ade 100644
--- a/libcxx/test/std/language.support/support.limits/support.limits.general/atomic.version.pass.cpp
+++ b/libcxx/test/std/language.support/support.limits/support.limits.general/atomic.version.pass.cpp
@@ -248,7 +248,7 @@
 #   endif
 # endif
 
-# if !defined(_LIBCPP_HAS_NO_THREADS)
+# if !defined(_LIBCPP_HAS_NO_THREADS) && !defined(_LIBCPP_AVAILABILITY_DISABLE_FTM___cpp_lib_atomic_wait)
 #   ifndef __cpp_lib_atomic_wait
 #     error "__cpp_lib_atomic_wait should be defined in c++20"
 #   endif
@@ -257,7 +257,7 @@
 #   endif
 # else
 #   ifdef __cpp_lib_atomic_wait
-#     error "__cpp_lib_atomic_wait should not be defined when !defined(_LIBCPP_HAS_NO_THREADS) is not defined!"
+#     error "__cpp_lib_atomic_wait should not be defined when !defined(_LIBCPP_HAS_NO_THREADS) && !defined(_LIBCPP_AVAILABILITY_DISABLE_FTM___cpp_lib_atomic_wait) is not defined!"
 #   endif
 # endif
 
@@ -367,7 +367,7 @@
 #   endif
 # endif
 
-# if !defined(_LIBCPP_HAS_NO_THREADS)
+# if !defined(_LIBCPP_HAS_NO_THREADS) && !defined(_LIBCPP_AVAILABILITY_DISABLE_FTM___cpp_lib_atomic_wait)
 #   ifndef __cpp_lib_atomic_wait
 #     error "__cpp_lib_atomic_wait should be defined in c++2b"
 #   endif
@@ -376,7 +376,7 @@
 #   endif
 # else
 #   ifdef __cpp_lib_atomic_wait
-#     error "__cpp_lib_atomic_wait should not be defined when !defined(_LIBCPP_HAS_NO_THREADS) is not defined!"
+#     error "__cpp_lib_atomic_wait should not be defined when !defined(_LIBCPP_HAS_NO_THREADS) && !defined(_LIBCPP_AVAILABILITY_DISABLE_FTM___cpp_lib_atomic_wait) is not defined!"
 #   endif
 # endif
 
diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/barrier.version.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/barrier.version.pass.cpp
index b193095403e9..80f7d9ca24b4 100644
--- a/libcxx/test/std/language.support/support.limits/support.limits.general/barrier.version.pass.cpp
+++ b/libcxx/test/std/language.support/support.limits/support.limits.general/barrier.version.pass.cpp
@@ -44,7 +44,7 @@
 
 #elif TEST_STD_VER == 20
 
-# if !defined(_LIBCPP_HAS_NO_THREADS)
+# if !defined(_LIBCPP_HAS_NO_THREADS) && !defined(_LIBCPP_AVAILABILITY_DISABLE_FTM___cpp_lib_barrier)
 #   ifndef __cpp_lib_barrier
 #     error "__cpp_lib_barrier should be defined in c++20"
 #   endif
@@ -53,13 +53,13 @@
 #   endif
 # else
 #   ifdef __cpp_lib_barrier
-#     error "__cpp_lib_barrier should not be defined when !defined(_LIBCPP_HAS_NO_THREADS) is not defined!"
+#     error "__cpp_lib_barrier should not be defined when !defined(_LIBCPP_HAS_NO_THREADS) && !defined(_LIBCPP_AVAILABILITY_DISABLE_FTM___cpp_lib_barrier) is not defined!"
 #   endif
 # endif
 
 #elif TEST_STD_VER > 20
 
-# if !defined(_LIBCPP_HAS_NO_THREADS)
+# if !defined(_LIBCPP_HAS_NO_THREADS) && !defined(_LIBCPP_AVAILABILITY_DISABLE_FTM___cpp_lib_barrier)
 #   ifndef __cpp_lib_barrier
 #     error "__cpp_lib_barrier should be defined in c++2b"
 #   endif
@@ -68,7 +68,7 @@
 #   endif
 # else
 #   ifdef __cpp_lib_barrier
-#     error "__cpp_lib_barrier should not be defined when !defined(_LIBCPP_HAS_NO_THREADS) is not defined!"
+#     error "__cpp_lib_barrier should not be defined when !defined(_LIBCPP_HAS_NO_THREADS) && !defined(_LIBCPP_AVAILABILITY_DISABLE_FTM___cpp_lib_barrier) is not defined!"
 #   endif
 # endif
 
diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/filesystem.version.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/filesystem.version.pass.cpp
index 6e47bdf3b5fe..c361569cb1d6 100644
--- a/libcxx/test/std/language.support/support.limits/support.limits.general/filesystem.version.pass.cpp
+++ b/libcxx/test/std/language.support/support.limits/support.limits.general/filesystem.version.pass.cpp
@@ -51,11 +51,17 @@
 #   error "__cpp_lib_char8_t should not be defined before c++20"
 # endif
 
-# ifndef __cpp_lib_filesystem
-#   error "__cpp_lib_filesystem should be defined in c++17"
-# endif
-# if __cpp_lib_filesystem != 201703L
-#   error "__cpp_lib_filesystem should have the value 201703L in c++17"
+# if !defined(_LIBCPP_AVAILABILITY_DISABLE_FTM___cpp_lib_filesystem)
+#   ifndef __cpp_lib_filesystem
+#     error "__cpp_lib_filesystem should be defined in c++17"
+#   endif
+#   if __cpp_lib_filesystem != 201703L
+#     error "__cpp_lib_filesystem should have the value 201703L in c++17"
+#   endif
+# else
+#   ifdef __cpp_lib_filesystem
+#     error "__cpp_lib_filesystem should not be defined when !defined(_LIBCPP_AVAILABILITY_DISABLE_FTM___cpp_lib_filesystem) is not defined!"
+#   endif
 # endif
 
 #elif TEST_STD_VER == 20
@@ -73,11 +79,17 @@
 #   endif
 # endif
 
-# ifndef __cpp_lib_filesystem
-#   error "__cpp_lib_filesystem should be defined in c++20"
-# endif
-# if __cpp_lib_filesystem != 201703L
-#   error "__cpp_lib_filesystem should have the value 201703L in c++20"
+# if !defined(_LIBCPP_AVAILABILITY_DISABLE_FTM___cpp_lib_filesystem)
+#   ifndef __cpp_lib_filesystem
+#     error "__cpp_lib_filesystem should be defined in c++20"
+#   endif
+#   if __cpp_lib_filesystem != 201703L
+#     error "__cpp_lib_filesystem should have the value 201703L in c++20"
+#   endif
+# else
+#   ifdef __cpp_lib_filesystem
+#     error "__cpp_lib_filesystem should not be defined when !defined(_LIBCPP_AVAILABILITY_DISABLE_FTM___cpp_lib_filesystem) is not defined!"
+#   endif
 # endif
 
 #elif TEST_STD_VER > 20
@@ -95,11 +107,17 @@
 #   endif
 # endif
 
-# ifndef __cpp_lib_filesystem
-#   error "__cpp_lib_filesystem should be defined in c++2b"
-# endif
-# if __cpp_lib_filesystem != 201703L
-#   error "__cpp_lib_filesystem should have the value 201703L in c++2b"
+# if !defined(_LIBCPP_AVAILABILITY_DISABLE_FTM___cpp_lib_filesystem)
+#   ifndef __cpp_lib_filesystem
+#     error "__cpp_lib_filesystem should be defined in c++2b"
+#   endif
+#   if __cpp_lib_filesystem != 201703L
+#     error "__cpp_lib_filesystem should have the value 201703L in c++2b"
+#   endif
+# else
+#   ifdef __cpp_lib_filesystem
+#     error "__cpp_lib_filesystem should not be defined when !defined(_LIBCPP_AVAILABILITY_DISABLE_FTM___cpp_lib_filesystem) is not defined!"
+#   endif
 # endif
 
 #endif // TEST_STD_VER > 20
diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/latch.version.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/latch.version.pass.cpp
index 29e8fd617bbf..56db3ba66b45 100644
--- a/libcxx/test/std/language.support/support.limits/support.limits.general/latch.version.pass.cpp
+++ b/libcxx/test/std/language.support/support.limits/support.limits.general/latch.version.pass.cpp
@@ -44,7 +44,7 @@
 
 #elif TEST_STD_VER == 20
 
-# if !defined(_LIBCPP_HAS_NO_THREADS)
+# if !defined(_LIBCPP_HAS_NO_THREADS) && !defined(_LIBCPP_AVAILABILITY_DISABLE_FTM___cpp_lib_latch)
 #   ifndef __cpp_lib_latch
 #     error "__cpp_lib_latch should be defined in c++20"
 #   endif
@@ -53,13 +53,13 @@
 #   endif
 # else
 #   ifdef __cpp_lib_latch
-#     error "__cpp_lib_latch should not be defined when !defined(_LIBCPP_HAS_NO_THREADS) is not defined!"
+#     error "__cpp_lib_latch should not be defined when !defined(_LIBCPP_HAS_NO_THREADS) && !defined(_LIBCPP_AVAILABILITY_DISABLE_FTM___cpp_lib_latch) is not defined!"
 #   endif
 # endif
 
 #elif TEST_STD_VER > 20
 
-# if !defined(_LIBCPP_HAS_NO_THREADS)
+# if !defined(_LIBCPP_HAS_NO_THREADS) && !defined(_LIBCPP_AVAILABILITY_DISABLE_FTM___cpp_lib_latch)
 #   ifndef __cpp_lib_latch
 #     error "__cpp_lib_latch should be defined in c++2b"
 #   endif
@@ -68,7 +68,7 @@
 #   endif
 # else
 #   ifdef __cpp_lib_latch
-#     error "__cpp_lib_latch should not be defined when !defined(_LIBCPP_HAS_NO_THREADS) is not defined!"
+#     error "__cpp_lib_latch should not be defined when !defined(_LIBCPP_HAS_NO_THREADS) && !defined(_LIBCPP_AVAILABILITY_DISABLE_FTM___cpp_lib_latch) is not defined!"
 #   endif
 # endif
 
diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/semaphore.version.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/semaphore.version.pass.cpp
index febeb6f6c615..79e31aa06a3f 100644
--- a/libcxx/test/std/language.support/support.limits/support.limits.general/semaphore.version.pass.cpp
+++ b/libcxx/test/std/language.support/support.limits/support.limits.general/semaphore.version.pass.cpp
@@ -44,7 +44,7 @@
 
 #elif TEST_STD_VER == 20
 
-# if !defined(_LIBCPP_HAS_NO_THREADS)
+# if !defined(_LIBCPP_HAS_NO_THREADS) && !defined(_LIBCPP_AVAILABILITY_DISABLE_FTM___cpp_lib_semaphore)
 #   ifndef __cpp_lib_semaphore
 #     error "__cpp_lib_semaphore should be defined in c++20"
 #   endif
@@ -53,13 +53,13 @@
 #   endif
 # else
 #   ifdef __cpp_lib_semaphore
-#     error "__cpp_lib_semaphore should not be defined when !defined(_LIBCPP_HAS_NO_THREADS) is not defined!"
+#     error "__cpp_lib_semaphore should not be defined when !defined(_LIBCPP_HAS_NO_THREADS) && !defined(_LIBCPP_AVAILABILITY_DISABLE_FTM___cpp_lib_semaphore) is not defined!"
 #   endif
 # endif
 
 #elif TEST_STD_VER > 20
 
-# if !defined(_LIBCPP_HAS_NO_THREADS)
+# if !defined(_LIBCPP_HAS_NO_THREADS) && !defined(_LIBCPP_AVAILABILITY_DISABLE_FTM___cpp_lib_semaphore)
 #   ifndef __cpp_lib_semaphore
 #     error "__cpp_lib_semaphore should be defined in c++2b"
 #   endif
@@ -68,7 +68,7 @@
 #   endif
 # else
 #   ifdef __cpp_lib_semaphore
-#     error "__cpp_lib_semaphore should not be defined when !defined(_LIBCPP_HAS_NO_THREADS) is not defined!"
+#     error "__cpp_lib_semaphore should not be defined when !defined(_LIBCPP_HAS_NO_THREADS) && !defined(_LIBCPP_AVAILABILITY_DISABLE_FTM___cpp_lib_semaphore) is not defined!"
 #   endif
 # endif
 
diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/shared_mutex.version.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/shared_mutex.version.pass.cpp
index 953fd0a37790..d26a453f83c3 100644
--- a/libcxx/test/std/language.support/support.limits/support.limits.general/shared_mutex.version.pass.cpp
+++ b/libcxx/test/std/language.support/support.limits/support.limits.general/shared_mutex.version.pass.cpp
@@ -41,7 +41,7 @@
 #   error "__cpp_lib_shared_mutex should not be defined before c++17"
 # endif
 
-# if !defined(_LIBCPP_HAS_NO_THREADS)
+# if !defined(_LIBCPP_HAS_NO_THREADS) && !defined(_LIBCPP_AVAILABILITY_DISABLE_FTM___cpp_lib_shared_timed_mutex)
 #   ifndef __cpp_lib_shared_timed_mutex
 #     error "__cpp_lib_shared_timed_mutex should be defined in c++14"
 #   endif
@@ -50,13 +50,13 @@
 #   endif
 # else
 #   ifdef __cpp_lib_shared_timed_mutex
-#     error "__cpp_lib_shared_timed_mutex should not be defined when !defined(_LIBCPP_HAS_NO_THREADS) is not defined!"
+#     error "__cpp_lib_shared_timed_mutex should not be defined when !defined(_LIBCPP_HAS_NO_THREADS) && !defined(_LIBCPP_AVAILABILITY_DISABLE_FTM___cpp_lib_shared_timed_mutex) is not defined!"
 #   endif
 # endif
 
 #elif TEST_STD_VER == 17
 
-# if !defined(_LIBCPP_HAS_NO_THREADS)
+# if !defined(_LIBCPP_HAS_NO_THREADS) && !defined(_LIBCPP_AVAILABILITY_DISABLE_FTM___cpp_lib_shared_mutex)
 #   ifndef __cpp_lib_shared_mutex
 #     error "__cpp_lib_shared_mutex should be defined in c++17"
 #   endif
@@ -65,11 +65,11 @@
 #   endif
 # else
 #   ifdef __cpp_lib_shared_mutex
-#     error "__cpp_lib_shared_mutex should not be defined when !defined(_LIBCPP_HAS_NO_THREADS) is not defined!"
+#     error "__cpp_lib_shared_mutex should not be defined when !defined(_LIBCPP_HAS_NO_THREADS) && !defined(_LIBCPP_AVAILABILITY_DISABLE_FTM___cpp_lib_shared_mutex) is not defined!"
 #   endif
 # endif
 
-# if !defined(_LIBCPP_HAS_NO_THREADS)
+# if !defined(_LIBCPP_HAS_NO_THREADS) && !defined(_LIBCPP_AVAILABILITY_DISABLE_FTM___cpp_lib_shared_timed_mutex)
 #   ifndef __cpp_lib_shared_timed_mutex
 #     error "__cpp_lib_shared_timed_mutex should be defined in c++17"
 #   endif
@@ -78,13 +78,13 @@
 #   endif
 # else
 #   ifdef __cpp_lib_shared_timed_mutex
-#     error "__cpp_lib_shared_timed_mutex should not be defined when !defined(_LIBCPP_HAS_NO_THREADS) is not defined!"
+#     error "__cpp_lib_shared_timed_mutex should not be defined when !defined(_LIBCPP_HAS_NO_THREADS) && !defined(_LIBCPP_AVAILABILITY_DISABLE_FTM___cpp_lib_shared_timed_mutex) is not defined!"
 #   endif
 # endif
 
 #elif TEST_STD_VER == 20
 
-# if !defined(_LIBCPP_HAS_NO_THREADS)
+# if !defined(_LIBCPP_HAS_NO_THREADS) && !defined(_LIBCPP_AVAILABILITY_DISABLE_FTM___cpp_lib_shared_mutex)
 #   ifndef __cpp_lib_shared_mutex
 #     error "__cpp_lib_shared_mutex should be defined in c++20"
 #   endif
@@ -93,11 +93,11 @@
 #   endif
 # else
 #   ifdef __cpp_lib_shared_mutex
-#     error "__cpp_lib_shared_mutex should not be defined when !defined(_LIBCPP_HAS_NO_THREADS) is not defined!"
+#     error "__cpp_lib_shared_mutex should not be defined when !defined(_LIBCPP_HAS_NO_THREADS) && !defined(_LIBCPP_AVAILABILITY_DISABLE_FTM___cpp_lib_shared_mutex) is not defined!"
 #   endif
 # endif
 
-# if !defined(_LIBCPP_HAS_NO_THREADS)
+# if !defined(_LIBCPP_HAS_NO_THREADS) && !defined(_LIBCPP_AVAILABILITY_DISABLE_FTM___cpp_lib_shared_timed_mutex)
 #   ifndef __cpp_lib_shared_timed_mutex
 #     error "__cpp_lib_shared_timed_mutex should be defined in c++20"
 #   endif
@@ -106,13 +106,13 @@
 #   endif
 # else
 #   ifdef __cpp_lib_shared_timed_mutex
-#     error "__cpp_lib_shared_timed_mutex should not be defined when !defined(_LIBCPP_HAS_NO_THREADS) is not defined!"
+#     error "__cpp_lib_shared_timed_mutex should not be defined when !defined(_LIBCPP_HAS_NO_THREADS) && !defined(_LIBCPP_AVAILABILITY_DISABLE_FTM___cpp_lib_shared_timed_mutex) is not defined!"
 #   endif
 # endif
 
 #elif TEST_STD_VER > 20
 
-# if !defined(_LIBCPP_HAS_NO_THREADS)
+# if !defined(_LIBCPP_HAS_NO_THREADS) && !defined(_LIBCPP_AVAILABILITY_DISABLE_FTM___cpp_lib_shared_mutex)
 #   ifndef __cpp_lib_shared_mutex
 #     error "__cpp_lib_shared_mutex should be defined in c++2b"
 #   endif
@@ -121,11 +121,11 @@
 #   endif
 # else
 #   ifdef __cpp_lib_shared_mutex
-#     error "__cpp_lib_shared_mutex should not be defined when !defined(_LIBCPP_HAS_NO_THREADS) is not defined!"
+#     error "__cpp_lib_shared_mutex should not be defined when !defined(_LIBCPP_HAS_NO_THREADS) && !defined(_LIBCPP_AVAILABILITY_DISABLE_FTM___cpp_lib_shared_mutex) is not defined!"
 #   endif
 # endif
 
-# if !defined(_LIBCPP_HAS_NO_THREADS)
+# if !defined(_LIBCPP_HAS_NO_THREADS) && !defined(_LIBCPP_AVAILABILITY_DISABLE_FTM___cpp_lib_shared_timed_mutex)
 #   ifndef __cpp_lib_shared_timed_mutex
 #     error "__cpp_lib_shared_timed_mutex should be defined in c++2b"
 #   endif
@@ -134,7 +134,7 @@
 #   endif
 # else
 #   ifdef __cpp_lib_shared_timed_mutex
-#     error "__cpp_lib_shared_timed_mutex should not be defined when !defined(_LIBCPP_HAS_NO_THREADS) is not defined!"
+#     error "__cpp_lib_shared_timed_mutex should not be defined when !defined(_LIBCPP_HAS_NO_THREADS) && !defined(_LIBCPP_AVAILABILITY_DISABLE_FTM___cpp_lib_shared_timed_mutex) is not defined!"
 #   endif
 # endif
 
diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.pass.cpp
index 9e96e2e116e0..1de99be54e30 100644
--- a/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.pass.cpp
+++ b/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.pass.cpp
@@ -1133,7 +1133,7 @@
 #   error "__cpp_lib_shared_ptr_weak_type should not be defined before c++17"
 # endif
 
-# if !defined(_LIBCPP_HAS_NO_THREADS)
+# if !defined(_LIBCPP_HAS_NO_THREADS) && !defined(_LIBCPP_AVAILABILITY_DISABLE_FTM___cpp_lib_shared_timed_mutex)
 #   ifndef __cpp_lib_shared_timed_mutex
 #     error "__cpp_lib_shared_timed_mutex should be defined in c++14"
 #   endif
@@ -1142,7 +1142,7 @@
 #   endif
 # else
 #   ifdef __cpp_lib_shared_timed_mutex
-#     error "__cpp_lib_shared_timed_mutex should not be defined when !defined(_LIBCPP_HAS_NO_THREADS) is not defined!"
+#     error "__cpp_lib_shared_timed_mutex should not be defined when !defined(_LIBCPP_HAS_NO_THREADS) && !defined(_LIBCPP_AVAILABILITY_DISABLE_FTM___cpp_lib_shared_timed_mutex) is not defined!"
 #   endif
 # endif
 
@@ -1534,11 +1534,17 @@
 #   endif
 # endif
 
-# ifndef __cpp_lib_filesystem
-#   error "__cpp_lib_filesystem should be defined in c++17"
-# endif
-# if __cpp_lib_filesystem != 201703L
-#   error "__cpp_lib_filesystem should have the value 201703L in c++17"
+# if !defined(_LIBCPP_AVAILABILITY_DISABLE_FTM___cpp_lib_filesystem)
+#   ifndef __cpp_lib_filesystem
+#     error "__cpp_lib_filesystem should be defined in c++17"
+#   endif
+#   if __cpp_lib_filesystem != 201703L
+#     error "__cpp_lib_filesystem should have the value 201703L in c++17"
+#   endif
+# else
+#   ifdef __cpp_lib_filesystem
+#     error "__cpp_lib_filesystem should not be defined when !defined(_LIBCPP_AVAILABILITY_DISABLE_FTM___cpp_lib_filesystem) is not defined!"
+#   endif
 # endif
 
 # ifndef __cpp_lib_gcd_lcm
@@ -1883,7 +1889,7 @@
 #   error "__cpp_lib_semaphore should not be defined before c++20"
 # endif
 
-# if !defined(_LIBCPP_HAS_NO_THREADS)
+# if !defined(_LIBCPP_HAS_NO_THREADS) && !defined(_LIBCPP_AVAILABILITY_DISABLE_FTM___cpp_lib_shared_mutex)
 #   ifndef __cpp_lib_shared_mutex
 #     error "__cpp_lib_shared_mutex should be defined in c++17"
 #   endif
@@ -1892,7 +1898,7 @@
 #   endif
 # else
 #   ifdef __cpp_lib_shared_mutex
-#     error "__cpp_lib_shared_mutex should not be defined when !defined(_LIBCPP_HAS_NO_THREADS) is not defined!"
+#     error "__cpp_lib_shared_mutex should not be defined when !defined(_LIBCPP_HAS_NO_THREADS) && !defined(_LIBCPP_AVAILABILITY_DISABLE_FTM___cpp_lib_shared_mutex) is not defined!"
 #   endif
 # endif
 
@@ -1910,7 +1916,7 @@
 #   error "__cpp_lib_shared_ptr_weak_type should have the value 201606L in c++17"
 # endif
 
-# if !defined(_LIBCPP_HAS_NO_THREADS)
+# if !defined(_LIBCPP_HAS_NO_THREADS) && !defined(_LIBCPP_AVAILABILITY_DISABLE_FTM___cpp_lib_shared_timed_mutex)
 #   ifndef __cpp_lib_shared_timed_mutex
 #     error "__cpp_lib_shared_timed_mutex should be defined in c++17"
 #   endif
@@ -1919,7 +1925,7 @@
 #   endif
 # else
 #   ifdef __cpp_lib_shared_timed_mutex
-#     error "__cpp_lib_shared_timed_mutex should not be defined when !defined(_LIBCPP_HAS_NO_THREADS) is not defined!"
+#     error "__cpp_lib_shared_timed_mutex should not be defined when !defined(_LIBCPP_HAS_NO_THREADS) && !defined(_LIBCPP_AVAILABILITY_DISABLE_FTM___cpp_lib_shared_timed_mutex) is not defined!"
 #   endif
 # endif
 
@@ -2223,7 +2229,7 @@
 #   endif
 # endif
 
-# if !defined(_LIBCPP_HAS_NO_THREADS)
+# if !defined(_LIBCPP_HAS_NO_THREADS) && !defined(_LIBCPP_AVAILABILITY_DISABLE_FTM___cpp_lib_atomic_wait)
 #   ifndef __cpp_lib_atomic_wait
 #     error "__cpp_lib_atomic_wait should be defined in c++20"
 #   endif
@@ -2232,11 +2238,11 @@
 #   endif
 # else
 #   ifdef __cpp_lib_atomic_wait
-#     error "__cpp_lib_atomic_wait should not be defined when !defined(_LIBCPP_HAS_NO_THREADS) is not defined!"
+#     error "__cpp_lib_atomic_wait should not be defined when !defined(_LIBCPP_HAS_NO_THREADS) && !defined(_LIBCPP_AVAILABILITY_DISABLE_FTM___cpp_lib_atomic_wait) is not defined!"
 #   endif
 # endif
 
-# if !defined(_LIBCPP_HAS_NO_THREADS)
+# if !defined(_LIBCPP_HAS_NO_THREADS) && !defined(_LIBCPP_AVAILABILITY_DISABLE_FTM___cpp_lib_barrier)
 #   ifndef __cpp_lib_barrier
 #     error "__cpp_lib_barrier should be defined in c++20"
 #   endif
@@ -2245,7 +2251,7 @@
 #   endif
 # else
 #   ifdef __cpp_lib_barrier
-#     error "__cpp_lib_barrier should not be defined when !defined(_LIBCPP_HAS_NO_THREADS) is not defined!"
+#     error "__cpp_lib_barrier should not be defined when !defined(_LIBCPP_HAS_NO_THREADS) && !defined(_LIBCPP_AVAILABILITY_DISABLE_FTM___cpp_lib_barrier) is not defined!"
 #   endif
 # endif
 
@@ -2575,11 +2581,17 @@
 #   endif
 # endif
 
-# ifndef __cpp_lib_filesystem
-#   error "__cpp_lib_filesystem should be defined in c++20"
-# endif
-# if __cpp_lib_filesystem != 201703L
-#   error "__cpp_lib_filesystem should have the value 201703L in c++20"
+# if !defined(_LIBCPP_AVAILABILITY_DISABLE_FTM___cpp_lib_filesystem)
+#   ifndef __cpp_lib_filesystem
+#     error "__cpp_lib_filesystem should be defined in c++20"
+#   endif
+#   if __cpp_lib_filesystem != 201703L
+#     error "__cpp_lib_filesystem should have the value 201703L in c++20"
+#   endif
+# else
+#   ifdef __cpp_lib_filesystem
+#     error "__cpp_lib_filesystem should not be defined when !defined(_LIBCPP_AVAILABILITY_DISABLE_FTM___cpp_lib_filesystem) is not defined!"
+#   endif
 # endif
 
 # ifndef __cpp_lib_gcd_lcm
@@ -2795,7 +2807,7 @@
 #   endif
 # endif
 
-# if !defined(_LIBCPP_HAS_NO_THREADS)
+# if !defined(_LIBCPP_HAS_NO_THREADS) && !defined(_LIBCPP_AVAILABILITY_DISABLE_FTM___cpp_lib_latch)
 #   ifndef __cpp_lib_latch
 #     error "__cpp_lib_latch should be defined in c++20"
 #   endif
@@ -2804,7 +2816,7 @@
 #   endif
 # else
 #   ifdef __cpp_lib_latch
-#     error "__cpp_lib_latch should not be defined when !defined(_LIBCPP_HAS_NO_THREADS) is not defined!"
+#     error "__cpp_lib_latch should not be defined when !defined(_LIBCPP_HAS_NO_THREADS) && !defined(_LIBCPP_AVAILABILITY_DISABLE_FTM___cpp_lib_latch) is not defined!"
 #   endif
 # endif
 
@@ -3019,7 +3031,7 @@
 #   error "__cpp_lib_scoped_lock should have the value 201703L in c++20"
 # endif
 
-# if !defined(_LIBCPP_HAS_NO_THREADS)
+# if !defined(_LIBCPP_HAS_NO_THREADS) && !defined(_LIBCPP_AVAILABILITY_DISABLE_FTM___cpp_lib_semaphore)
 #   ifndef __cpp_lib_semaphore
 #     error "__cpp_lib_semaphore should be defined in c++20"
 #   endif
@@ -3028,11 +3040,11 @@
 #   endif
 # else
 #   ifdef __cpp_lib_semaphore
-#     error "__cpp_lib_semaphore should not be defined when !defined(_LIBCPP_HAS_NO_THREADS) is not defined!"
+#     error "__cpp_lib_semaphore should not be defined when !defined(_LIBCPP_HAS_NO_THREADS) && !defined(_LIBCPP_AVAILABILITY_DISABLE_FTM___cpp_lib_semaphore) is not defined!"
 #   endif
 # endif
 
-# if !defined(_LIBCPP_HAS_NO_THREADS)
+# if !defined(_LIBCPP_HAS_NO_THREADS) && !defined(_LIBCPP_AVAILABILITY_DISABLE_FTM___cpp_lib_shared_mutex)
 #   ifndef __cpp_lib_shared_mutex
 #     error "__cpp_lib_shared_mutex should be defined in c++20"
 #   endif
@@ -3041,7 +3053,7 @@
 #   endif
 # else
 #   ifdef __cpp_lib_shared_mutex
-#     error "__cpp_lib_shared_mutex should not be defined when !defined(_LIBCPP_HAS_NO_THREADS) is not defined!"
+#     error "__cpp_lib_shared_mutex should not be defined when !defined(_LIBCPP_HAS_NO_THREADS) && !defined(_LIBCPP_AVAILABILITY_DISABLE_FTM___cpp_lib_shared_mutex) is not defined!"
 #   endif
 # endif
 
@@ -3059,7 +3071,7 @@
 #   error "__cpp_lib_shared_ptr_weak_type should have the value 201606L in c++20"
 # endif
 
-# if !defined(_LIBCPP_HAS_NO_THREADS)
+# if !defined(_LIBCPP_HAS_NO_THREADS) && !defined(_LIBCPP_AVAILABILITY_DISABLE_FTM___cpp_lib_shared_timed_mutex)
 #   ifndef __cpp_lib_shared_timed_mutex
 #     error "__cpp_lib_shared_timed_mutex should be defined in c++20"
 #   endif
@@ -3068,7 +3080,7 @@
 #   endif
 # else
 #   ifdef __cpp_lib_shared_timed_mutex
-#     error "__cpp_lib_shared_timed_mutex should not be defined when !defined(_LIBCPP_HAS_NO_THREADS) is not defined!"
+#     error "__cpp_lib_shared_timed_mutex should not be defined when !defined(_LIBCPP_HAS_NO_THREADS) && !defined(_LIBCPP_AVAILABILITY_DISABLE_FTM___cpp_lib_shared_timed_mutex) is not defined!"
 #   endif
 # endif
 
@@ -3429,7 +3441,7 @@
 #   endif
 # endif
 
-# if !defined(_LIBCPP_HAS_NO_THREADS)
+# if !defined(_LIBCPP_HAS_NO_THREADS) && !defined(_LIBCPP_AVAILABILITY_DISABLE_FTM___cpp_lib_atomic_wait)
 #   ifndef __cpp_lib_atomic_wait
 #     error "__cpp_lib_atomic_wait should be defined in c++2b"
 #   endif
@@ -3438,11 +3450,11 @@
 #   endif
 # else
 #   ifdef __cpp_lib_atomic_wait
-#     error "__cpp_lib_atomic_wait should not be defined when !defined(_LIBCPP_HAS_NO_THREADS) is not defined!"
+#     error "__cpp_lib_atomic_wait should not be defined when !defined(_LIBCPP_HAS_NO_THREADS) && !defined(_LIBCPP_AVAILABILITY_DISABLE_FTM___cpp_lib_atomic_wait) is not defined!"
 #   endif
 # endif
 
-# if !defined(_LIBCPP_HAS_NO_THREADS)
+# if !defined(_LIBCPP_HAS_NO_THREADS) && !defined(_LIBCPP_AVAILABILITY_DISABLE_FTM___cpp_lib_barrier)
 #   ifndef __cpp_lib_barrier
 #     error "__cpp_lib_barrier should be defined in c++2b"
 #   endif
@@ -3451,7 +3463,7 @@
 #   endif
 # else
 #   ifdef __cpp_lib_barrier
-#     error "__cpp_lib_barrier should not be defined when !defined(_LIBCPP_HAS_NO_THREADS) is not defined!"
+#     error "__cpp_lib_barrier should not be defined when !defined(_LIBCPP_HAS_NO_THREADS) && !defined(_LIBCPP_AVAILABILITY_DISABLE_FTM___cpp_lib_barrier) is not defined!"
 #   endif
 # endif
 
@@ -3781,11 +3793,17 @@
 #   endif
 # endif
 
-# ifndef __cpp_lib_filesystem
-#   error "__cpp_lib_filesystem should be defined in c++2b"
-# endif
-# if __cpp_lib_filesystem != 201703L
-#   error "__cpp_lib_filesystem should have the value 201703L in c++2b"
+# if !defined(_LIBCPP_AVAILABILITY_DISABLE_FTM___cpp_lib_filesystem)
+#   ifndef __cpp_lib_filesystem
+#     error "__cpp_lib_filesystem should be defined in c++2b"
+#   endif
+#   if __cpp_lib_filesystem != 201703L
+#     error "__cpp_lib_filesystem should have the value 201703L in c++2b"
+#   endif
+# else
+#   ifdef __cpp_lib_filesystem
+#     error "__cpp_lib_filesystem should not be defined when !defined(_LIBCPP_AVAILABILITY_DISABLE_FTM___cpp_lib_filesystem) is not defined!"
+#   endif
 # endif
 
 # ifndef __cpp_lib_gcd_lcm
@@ -4004,7 +4022,7 @@
 #   endif
 # endif
 
-# if !defined(_LIBCPP_HAS_NO_THREADS)
+# if !defined(_LIBCPP_HAS_NO_THREADS) && !defined(_LIBCPP_AVAILABILITY_DISABLE_FTM___cpp_lib_latch)
 #   ifndef __cpp_lib_latch
 #     error "__cpp_lib_latch should be defined in c++2b"
 #   endif
@@ -4013,7 +4031,7 @@
 #   endif
 # else
 #   ifdef __cpp_lib_latch
-#     error "__cpp_lib_latch should not be defined when !defined(_LIBCPP_HAS_NO_THREADS) is not defined!"
+#     error "__cpp_lib_latch should not be defined when !defined(_LIBCPP_HAS_NO_THREADS) && !defined(_LIBCPP_AVAILABILITY_DISABLE_FTM___cpp_lib_latch) is not defined!"
 #   endif
 # endif
 
@@ -4228,7 +4246,7 @@
 #   error "__cpp_lib_scoped_lock should have the value 201703L in c++2b"
 # endif
 
-# if !defined(_LIBCPP_HAS_NO_THREADS)
+# if !defined(_LIBCPP_HAS_NO_THREADS) && !defined(_LIBCPP_AVAILABILITY_DISABLE_FTM___cpp_lib_semaphore)
 #   ifndef __cpp_lib_semaphore
 #     error "__cpp_lib_semaphore should be defined in c++2b"
 #   endif
@@ -4237,11 +4255,11 @@
 #   endif
 # else
 #   ifdef __cpp_lib_semaphore
-#     error "__cpp_lib_semaphore should not be defined when !defined(_LIBCPP_HAS_NO_THREADS) is not defined!"
+#     error "__cpp_lib_semaphore should not be defined when !defined(_LIBCPP_HAS_NO_THREADS) && !defined(_LIBCPP_AVAILABILITY_DISABLE_FTM___cpp_lib_semaphore) is not defined!"
 #   endif
 # endif
 
-# if !defined(_LIBCPP_HAS_NO_THREADS)
+# if !defined(_LIBCPP_HAS_NO_THREADS) && !defined(_LIBCPP_AVAILABILITY_DISABLE_FTM___cpp_lib_shared_mutex)
 #   ifndef __cpp_lib_shared_mutex
 #     error "__cpp_lib_shared_mutex should be defined in c++2b"
 #   endif
@@ -4250,7 +4268,7 @@
 #   endif
 # else
 #   ifdef __cpp_lib_shared_mutex
-#     error "__cpp_lib_shared_mutex should not be defined when !defined(_LIBCPP_HAS_NO_THREADS) is not defined!"
+#     error "__cpp_lib_shared_mutex should not be defined when !defined(_LIBCPP_HAS_NO_THREADS) && !defined(_LIBCPP_AVAILABILITY_DISABLE_FTM___cpp_lib_shared_mutex) is not defined!"
 #   endif
 # endif
 
@@ -4268,7 +4286,7 @@
 #   error "__cpp_lib_shared_ptr_weak_type should have the value 201606L in c++2b"
 # endif
 
-# if !defined(_LIBCPP_HAS_NO_THREADS)
+# if !defined(_LIBCPP_HAS_NO_THREADS) && !defined(_LIBCPP_AVAILABILITY_DISABLE_FTM___cpp_lib_shared_timed_mutex)
 #   ifndef __cpp_lib_shared_timed_mutex
 #     error "__cpp_lib_shared_timed_mutex should be defined in c++2b"
 #   endif
@@ -4277,7 +4295,7 @@
 #   endif
 # else
 #   ifdef __cpp_lib_shared_timed_mutex
-#     error "__cpp_lib_shared_timed_mutex should not be defined when !defined(_LIBCPP_HAS_NO_THREADS) is not defined!"
+#     error "__cpp_lib_shared_timed_mutex should not be defined when !defined(_LIBCPP_HAS_NO_THREADS) && !defined(_LIBCPP_AVAILABILITY_DISABLE_FTM___cpp_lib_shared_timed_mutex) is not defined!"
 #   endif
 # endif
 
diff --git a/libcxx/utils/generate_feature_test_macro_components.py b/libcxx/utils/generate_feature_test_macro_components.py
index 00de15dae24a..342e15691eb9 100755
--- a/libcxx/utils/generate_feature_test_macro_components.py
+++ b/libcxx/utils/generate_feature_test_macro_components.py
@@ -111,14 +111,14 @@ def add_version_header(tc):
     "name": "__cpp_lib_atomic_wait",
     "values": { "c++20": 201907 },
     "headers": ["atomic"],
-    "depends": "!defined(_LIBCPP_HAS_NO_THREADS)",
-    "internal_depends": "!defined(_LIBCPP_HAS_NO_THREADS)",
+    "depends": "!defined(_LIBCPP_HAS_NO_THREADS) && !defined(_LIBCPP_AVAILABILITY_DISABLE_FTM___cpp_lib_atomic_wait)",
+    "internal_depends": "!defined(_LIBCPP_HAS_NO_THREADS) && !defined(_LIBCPP_AVAILABILITY_DISABLE_FTM___cpp_lib_atomic_wait)",
   }, {
     "name": "__cpp_lib_barrier",
     "values": { "c++20": 201907 },
     "headers": ["barrier"],
-    "depends": "!defined(_LIBCPP_HAS_NO_THREADS)",
-    "internal_depends": "!defined(_LIBCPP_HAS_NO_THREADS)",
+    "depends": "!defined(_LIBCPP_HAS_NO_THREADS) && !defined(_LIBCPP_AVAILABILITY_DISABLE_FTM___cpp_lib_barrier)",
+    "internal_depends": "!defined(_LIBCPP_HAS_NO_THREADS) && !defined(_LIBCPP_AVAILABILITY_DISABLE_FTM___cpp_lib_barrier)",
   }, {
     "name": "__cpp_lib_bind_front",
     "values": { "c++20": 201907 },
@@ -270,6 +270,8 @@ def add_version_header(tc):
     "name": "__cpp_lib_filesystem",
     "values": { "c++17": 201703 },
     "headers": ["filesystem"],
+    "depends": "!defined(_LIBCPP_AVAILABILITY_DISABLE_FTM___cpp_lib_filesystem)",
+    "internal_depends": "!defined(_LIBCPP_AVAILABILITY_DISABLE_FTM___cpp_lib_filesystem)"
   }, {
     "name": "__cpp_lib_gcd_lcm",
     "values": { "c++17": 201606 },
@@ -383,8 +385,8 @@ def add_version_header(tc):
     "name": "__cpp_lib_latch",
     "values": { "c++20": 201907 },
     "headers": ["latch"],
-    "depends": "!defined(_LIBCPP_HAS_NO_THREADS)",
-    "internal_depends": "!defined(_LIBCPP_HAS_NO_THREADS)",
+    "depends": "!defined(_LIBCPP_HAS_NO_THREADS) && !defined(_LIBCPP_AVAILABILITY_DISABLE_FTM___cpp_lib_latch)",
+    "internal_depends": "!defined(_LIBCPP_HAS_NO_THREADS) && !defined(_LIBCPP_AVAILABILITY_DISABLE_FTM___cpp_lib_latch)",
   }, {
     "name": "__cpp_lib_launder",
     "values": { "c++17": 201606 },
@@ -496,14 +498,14 @@ def add_version_header(tc):
     "name": "__cpp_lib_semaphore",
     "values": { "c++20": 201907 },
     "headers": ["semaphore"],
-    "depends": "!defined(_LIBCPP_HAS_NO_THREADS)",
-    "internal_depends": "!defined(_LIBCPP_HAS_NO_THREADS)",
+    "depends": "!defined(_LIBCPP_HAS_NO_THREADS) && !defined(_LIBCPP_AVAILABILITY_DISABLE_FTM___cpp_lib_semaphore)",
+    "internal_depends": "!defined(_LIBCPP_HAS_NO_THREADS) && !defined(_LIBCPP_AVAILABILITY_DISABLE_FTM___cpp_lib_semaphore)",
   }, {
     "name": "__cpp_lib_shared_mutex",
     "values": { "c++17": 201505 },
     "headers": ["shared_mutex"],
-    "depends": "!defined(_LIBCPP_HAS_NO_THREADS)",
-    "internal_depends": "!defined(_LIBCPP_HAS_NO_THREADS)",
+    "depends": "!defined(_LIBCPP_HAS_NO_THREADS) && !defined(_LIBCPP_AVAILABILITY_DISABLE_FTM___cpp_lib_shared_mutex)",
+    "internal_depends": "!defined(_LIBCPP_HAS_NO_THREADS) && !defined(_LIBCPP_AVAILABILITY_DISABLE_FTM___cpp_lib_shared_mutex)",
   }, {
     "name": "__cpp_lib_shared_ptr_arrays",
     "values": { "c++17": 201611 },
@@ -516,8 +518,8 @@ def add_version_header(tc):
     "name": "__cpp_lib_shared_timed_mutex",
     "values": { "c++14": 201402 },
     "headers": ["shared_mutex"],
-    "depends": "!defined(_LIBCPP_HAS_NO_THREADS)",
-    "internal_depends": "!defined(_LIBCPP_HAS_NO_THREADS)",
+    "depends": "!defined(_LIBCPP_HAS_NO_THREADS) && !defined(_LIBCPP_AVAILABILITY_DISABLE_FTM___cpp_lib_shared_timed_mutex)",
+    "internal_depends": "!defined(_LIBCPP_HAS_NO_THREADS) && !defined(_LIBCPP_AVAILABILITY_DISABLE_FTM___cpp_lib_shared_timed_mutex)",
   }, {
     "name": "__cpp_lib_shift",
     "values": { "c++20": 201806 },

From 6c57bab74f6ffc8f40c5a805dedb5deb76c80458 Mon Sep 17 00:00:00 2001
From: Arthur Eubanks <aeubanks@google.com>
Date: Wed, 3 Mar 2021 09:55:02 -0800
Subject: [PATCH 300/318] [clang] Don't assert in EmitAggregateCopy on
 trivial_abi types

Fixes PR42961.

Reviewed By: rnk

Differential Revision: https://reviews.llvm.org/D97872

(cherry picked from commit c8227f06b3356cdc9cc757d8888dfb59a6d8ad89)
---
 clang/lib/CodeGen/CGExprAgg.cpp       |  2 +-
 clang/test/CodeGenCXX/trivial_abi.cpp | 18 ++++++++++++++++++
 2 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/clang/lib/CodeGen/CGExprAgg.cpp b/clang/lib/CodeGen/CGExprAgg.cpp
index 60ea1b2af037..f3ab91559d30 100644
--- a/clang/lib/CodeGen/CGExprAgg.cpp
+++ b/clang/lib/CodeGen/CGExprAgg.cpp
@@ -2056,7 +2056,7 @@ void CodeGenFunction::EmitAggregateCopy(LValue Dest, LValue Src, QualType Ty,
               Record->hasTrivialCopyAssignment() ||
               Record->hasTrivialMoveConstructor() ||
               Record->hasTrivialMoveAssignment() ||
-              Record->isUnion()) &&
+              Record->hasAttr<TrivialABIAttr>() || Record->isUnion()) &&
              "Trying to aggregate-copy a type without a trivial copy/move "
              "constructor or assignment operator");
       // Ignore empty classes in C++.
diff --git a/clang/test/CodeGenCXX/trivial_abi.cpp b/clang/test/CodeGenCXX/trivial_abi.cpp
index ac41f5cac086..a4222c100311 100644
--- a/clang/test/CodeGenCXX/trivial_abi.cpp
+++ b/clang/test/CodeGenCXX/trivial_abi.cpp
@@ -262,3 +262,21 @@ void calleeExceptionLarge(Large, Large);
 void testExceptionLarge() {
   calleeExceptionLarge(Large(), Large());
 }
+
+// PR42961
+
+// CHECK: define{{.*}} @"_ZN3$_08__invokeEv"()
+// CHECK: %[[RETVAL:.*]] = alloca %[[STRUCT_SMALL]], align 8
+// CHECK: %[[COERCE:.*]] = alloca %[[STRUCT_SMALL]], align 8
+// CHECK: %[[CALL:.*]] = call{{.*}} @"_ZNK3$_0clEv"
+// CHECK: %[[COERCEDIVE:.*]] = getelementptr{{.*}} %[[COERCE]]
+// CHECK: %[[COERCEVALIP:.*]] = inttoptr{{.*}} %[[CALL]]
+// CHECK: %[[RETVALP:.*]] = bitcast %[[STRUCT_SMALL]]* %[[RETVAL]]
+// CHECK: %[[COERCEP:.*]] = bitcast %[[STRUCT_SMALL]]* %[[COERCE]]
+// CHECK: call {{.*}}memcpy{{.*}} %[[RETVALP]]{{.*}} %[[COERCEP]]
+// CHECK: %[[COERCEDIVE1:.*]] = getelementptr{{.*}} %[[RETVAL]]
+// CHECK: %[[TMP:.*]] = load{{.*}} %[[COERCEDIVE1]]
+// CHECK: %[[COERCEVALPI:.*]] = ptrtoint{{.*}} %[[TMP]]
+// CHECK: ret{{.*}} %[[COERCEVALPI]]
+
+Small (*fp)() = []() -> Small { return Small(); };

From 0eae129baeb589b905223a1e9c8115937bc541b7 Mon Sep 17 00:00:00 2001
From: Amanieu d'Antras <amanieu@gmail.com>
Date: Mon, 12 Apr 2021 18:05:18 +0100
Subject: [PATCH 301/318] [ConstantMerge] Don't merge thread_local constants
 with non-thread_local constants

Fixes PR49932

Reviewed By: mehdi_amini

Differential Revision: https://reviews.llvm.org/D100322

(cherry picked from commit ad9ce8142dd5b90f725ad362feb054d52a35aa1f)
---
 llvm/lib/Transforms/IPO/ConstantMerge.cpp        |  2 ++
 llvm/test/Transforms/ConstantMerge/dont-merge.ll | 12 ++++++++++++
 2 files changed, 14 insertions(+)

diff --git a/llvm/lib/Transforms/IPO/ConstantMerge.cpp b/llvm/lib/Transforms/IPO/ConstantMerge.cpp
index 67f1438b9b6a..8e81f4bad4af 100644
--- a/llvm/lib/Transforms/IPO/ConstantMerge.cpp
+++ b/llvm/lib/Transforms/IPO/ConstantMerge.cpp
@@ -95,6 +95,8 @@ isUnmergeableGlobal(GlobalVariable *GV,
   // Only process constants with initializers in the default address space.
   return !GV->isConstant() || !GV->hasDefinitiveInitializer() ||
          GV->getType()->getAddressSpace() != 0 || GV->hasSection() ||
+         // Don't touch thread-local variables.
+         GV->isThreadLocal() ||
          // Don't touch values marked with attribute(used).
          UsedGlobals.count(GV);
 }
diff --git a/llvm/test/Transforms/ConstantMerge/dont-merge.ll b/llvm/test/Transforms/ConstantMerge/dont-merge.ll
index 21e390785df5..b0dab923d2cb 100644
--- a/llvm/test/Transforms/ConstantMerge/dont-merge.ll
+++ b/llvm/test/Transforms/ConstantMerge/dont-merge.ll
@@ -80,3 +80,15 @@ define void @test4(i32** %P1, i32** %P2, i32** %P3, i32** %P4, i32** %P5, i32**
         store i32* @T4D2, i32** %P8
         ret void
 }
+
+; CHECK: @T5tls
+; CHECK: @T5ua
+
+@T5tls = private thread_local constant i32 555
+@T5ua = private unnamed_addr constant i32 555
+
+define void @test5(i32** %P1, i32** %P2) {
+        store i32* @T5tls, i32** %P1
+        store i32* @T5ua, i32** %P2
+        ret void
+}

From 88c6773026d8e2b013ec2bb1d72593550d18d42f Mon Sep 17 00:00:00 2001
From: Vy Nguyen <vyng@google.com>
Date: Thu, 4 Mar 2021 12:13:07 -0500
Subject: [PATCH 302/318] Reland 293e8fa13d3f05e993771577a4c022deee5cbf6e    
 [llvm-exegesis] Disable the LBR check on AMD

    https://bugs.llvm.org/show_bug.cgi?id=48918

    The bug reported a hang (or very very slow runtime) on a Zen2. Unfortunately, we don't have the hardware right now to debug it and I was not able to reproduce the bug on a HSW.
    Theory we've got is that the lbr-checking code could be confused on AMD.

    Differential Revision: https://reviews.llvm.org/D97504

New change:
 - Surround usages of x86 helper in llvm-exegesis/X86/Target.cpp with ifdef
 - Fix bug which caused the caller of getVendorSignature to not have a copy of EAX that it expected.

(cherry picked from commit f8b01d54c31547f3ecb25fb8a1912183026bffa7)
---
 llvm/include/llvm/Support/Host.h            | 14 +++++
 llvm/lib/Support/Host.cpp                   | 68 +++++++++++++++++----
 llvm/tools/llvm-exegesis/lib/X86/Target.cpp | 19 +++++-
 3 files changed, 85 insertions(+), 16 deletions(-)

diff --git a/llvm/include/llvm/Support/Host.h b/llvm/include/llvm/Support/Host.h
index d4ef389450cc..b3c15f0683b9 100644
--- a/llvm/include/llvm/Support/Host.h
+++ b/llvm/include/llvm/Support/Host.h
@@ -65,6 +65,20 @@ namespace sys {
   StringRef getHostCPUNameForARM(StringRef ProcCpuinfoContent);
   StringRef getHostCPUNameForS390x(StringRef ProcCpuinfoContent);
   StringRef getHostCPUNameForBPF();
+
+  /// Helper functions to extract CPU details from CPUID on x86.
+  namespace x86 {
+  enum class VendorSignatures {
+    UNKNOWN,
+    GENUINE_INTEL,
+    AUTHENTIC_AMD,
+  };
+
+  /// Returns the host CPU's vendor.
+  /// MaxLeaf: if a non-nullptr pointer is specified, the EAX value will be
+  /// assigned to its pointee.
+  VendorSignatures getVendorSignature(unsigned *MaxLeaf = nullptr);
+  } // namespace x86
   }
 }
 }
diff --git a/llvm/lib/Support/Host.cpp b/llvm/lib/Support/Host.cpp
index a1bd3cc12f1d..79482dfacd29 100644
--- a/llvm/lib/Support/Host.cpp
+++ b/llvm/lib/Support/Host.cpp
@@ -417,11 +417,6 @@ StringRef sys::detail::getHostCPUNameForBPF() {
 #if defined(__i386__) || defined(_M_IX86) || \
     defined(__x86_64__) || defined(_M_X64)
 
-enum VendorSignatures {
-  SIG_INTEL = 0x756e6547 /* Genu */,
-  SIG_AMD = 0x68747541 /* Auth */
-};
-
 // The check below for i386 was copied from clang's cpuid.h (__get_cpuid_max).
 // Check motivated by bug reports for OpenSSL crashing on CPUs without CPUID
 // support. Consequently, for i386, the presence of CPUID is checked first
@@ -495,6 +490,42 @@ static bool getX86CpuIDAndInfo(unsigned value, unsigned *rEAX, unsigned *rEBX,
 #endif
 }
 
+namespace llvm {
+namespace sys {
+namespace detail {
+namespace x86 {
+
+VendorSignatures getVendorSignature(unsigned *MaxLeaf) {
+  unsigned EAX = 0, EBX = 0, ECX = 0, EDX = 0;
+  if (MaxLeaf == nullptr)
+    MaxLeaf = &EAX;
+  else
+    *MaxLeaf = 0;
+
+  if (!isCpuIdSupported())
+    return VendorSignatures::UNKNOWN;
+
+  if (getX86CpuIDAndInfo(0, MaxLeaf, &EBX, &ECX, &EDX) || *MaxLeaf < 1)
+    return VendorSignatures::UNKNOWN;
+
+  // "Genu ineI ntel"
+  if (EBX == 0x756e6547 && ECX == 0x6c65746e && EDX == 0x49656e69)
+    return VendorSignatures::GENUINE_INTEL;
+
+  // "Auth enti cAMD"
+  if (EBX == 0x68747541 && ECX == 0x69746e65 && EDX == 0x444d4163)
+    return VendorSignatures::AUTHENTIC_AMD;
+
+  return VendorSignatures::UNKNOWN;
+}
+
+} // namespace x86
+} // namespace detail
+} // namespace sys
+} // namespace llvm
+
+using namespace llvm::sys::detail::x86;
+
 /// getX86CpuIDAndInfoEx - Execute the specified cpuid with subleaf and return
 /// the 4 values in the specified arguments.  If we can't run cpuid on the host,
 /// return true.
@@ -1092,14 +1123,12 @@ static void getAvailableFeatures(unsigned ECX, unsigned EDX, unsigned MaxLeaf,
 }
 
 StringRef sys::getHostCPUName() {
-  unsigned EAX = 0, EBX = 0, ECX = 0, EDX = 0;
-  unsigned MaxLeaf, Vendor;
-
-  if (!isCpuIdSupported())
+  unsigned MaxLeaf = 0;
+  const VendorSignatures Vendor = getVendorSignature(&MaxLeaf);
+  if (Vendor == VendorSignatures::UNKNOWN)
     return "generic";
 
-  if (getX86CpuIDAndInfo(0, &MaxLeaf, &Vendor, &ECX, &EDX) || MaxLeaf < 1)
-    return "generic";
+  unsigned EAX = 0, EBX = 0, ECX = 0, EDX = 0;
   getX86CpuIDAndInfo(0x1, &EAX, &EBX, &ECX, &EDX);
 
   unsigned Family = 0, Model = 0;
@@ -1114,10 +1143,10 @@ StringRef sys::getHostCPUName() {
 
   StringRef CPU;
 
-  if (Vendor == SIG_INTEL) {
+  if (Vendor == VendorSignatures::GENUINE_INTEL) {
     CPU = getIntelProcessorTypeAndSubtype(Family, Model, Features, &Type,
                                           &Subtype);
-  } else if (Vendor == SIG_AMD) {
+  } else if (Vendor == VendorSignatures::AUTHENTIC_AMD) {
     CPU = getAMDProcessorTypeAndSubtype(Family, Model, Features, &Type,
                                         &Subtype);
   }
@@ -1219,6 +1248,19 @@ StringRef sys::getHostCPUName() {
 }
 #else
 StringRef sys::getHostCPUName() { return "generic"; }
+namespace llvm {
+namespace sys {
+namespace detail {
+namespace x86 {
+
+VendorSignatures getVendorSignature(unsigned *MaxLeaf) {
+  return VendorSignatures::UNKNOWN;
+}
+
+} // namespace x86
+} // namespace detail
+} // namespace sys
+} // namespace llvm
 #endif
 
 #if defined(__linux__) && (defined(__i386__) || defined(__x86_64__))
diff --git a/llvm/tools/llvm-exegesis/lib/X86/Target.cpp b/llvm/tools/llvm-exegesis/lib/X86/Target.cpp
index 15fa54e2f6a2..8839f00a69de 100644
--- a/llvm/tools/llvm-exegesis/lib/X86/Target.cpp
+++ b/llvm/tools/llvm-exegesis/lib/X86/Target.cpp
@@ -22,6 +22,7 @@
 #include "llvm/Support/Errc.h"
 #include "llvm/Support/Error.h"
 #include "llvm/Support/FormatVariadic.h"
+#include "llvm/Support/Host.h"
 
 #include <memory>
 #include <string>
@@ -727,13 +728,25 @@ class ExegesisX86Target : public ExegesisTarget {
 
 #if defined(__linux__) && defined(HAVE_LIBPFM) &&                              \
     defined(LIBPFM_HAS_FIELD_CYCLES)
-    // If the kernel supports it, the hardware still may not have it.
-    return X86LbrCounter::checkLbrSupport();
+      // FIXME: Fix this.
+      // https://bugs.llvm.org/show_bug.cgi?id=48918
+      // For now, only do the check if we see an Intel machine because
+      // the counter uses some intel-specific magic and it could
+      // be confuse and think an AMD machine actually has LBR support.
+#if defined(__i386__) || defined(_M_IX86) || defined(__x86_64__) ||            \
+    defined(_M_X64)
+    using namespace sys::detail::x86;
+
+    if (getVendorSignature() == VendorSignatures::GENUINE_INTEL)
+      // If the kernel supports it, the hardware still may not have it.
+      return X86LbrCounter::checkLbrSupport();
 #else
+    llvm_unreachable("Running X86 exegesis on non-X86 target");
+#endif
+#endif
     return llvm::make_error<llvm::StringError>(
         "LBR not supported on this kernel and/or platform",
         llvm::errc::not_supported);
-#endif
   }
 
   std::unique_ptr<SavedState> withSavedState() const override {

From 1a9f4b3a3890786dbee252e542338f906da98b3c Mon Sep 17 00:00:00 2001
From: Vy Nguyen <vyng@google.com>
Date: Wed, 10 Mar 2021 02:41:58 -0500
Subject: [PATCH 303/318] [llvm] Fix thinko in getVendorSignature(), where
 expected values of  ECX and EDX were flipped for the AMD case.

Follow up to D97504

Differential Revision: https://reviews.llvm.org/D98322

(cherry picked from commit 64d2c326b7f01942f0179fb797070e5cefbba303)
---
 llvm/lib/Support/Host.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Support/Host.cpp b/llvm/lib/Support/Host.cpp
index 79482dfacd29..09146c47ff2c 100644
--- a/llvm/lib/Support/Host.cpp
+++ b/llvm/lib/Support/Host.cpp
@@ -509,11 +509,11 @@ VendorSignatures getVendorSignature(unsigned *MaxLeaf) {
     return VendorSignatures::UNKNOWN;
 
   // "Genu ineI ntel"
-  if (EBX == 0x756e6547 && ECX == 0x6c65746e && EDX == 0x49656e69)
+  if (EBX == 0x756e6547 && EDX == 0x49656e69 && ECX == 0x6c65746e)
     return VendorSignatures::GENUINE_INTEL;
 
   // "Auth enti cAMD"
-  if (EBX == 0x68747541 && ECX == 0x69746e65 && EDX == 0x444d4163)
+  if (EBX == 0x68747541 && EDX == 0x69746e65 && ECX == 0x444d4163)
     return VendorSignatures::AUTHENTIC_AMD;
 
   return VendorSignatures::UNKNOWN;

From edd770b4bc99eb83dcf6ec1d2af92da42de2c575 Mon Sep 17 00:00:00 2001
From: Xun Li <lxfind@gmail.com>
Date: Thu, 17 Jun 2021 19:06:10 -0700
Subject: [PATCH 304/318] [Coroutine] Properly deal with byval and noalias
 parameters

This patch is to address https://bugs.llvm.org/show_bug.cgi?id=48857.
Previous attempts can be found in D104007 and D101980.
A lot of discussions can be found in those two patches.
To summarize the bug:
When Clang emits IR for coroutines, the first thing it does is to make a copy of every argument to the local stack, so that uses of the arguments in the function will all refer to the local copies instead of the arguments directly.
However, in some cases we find that arguments are still directly used:
When Clang emits IR for a function that has pass-by-value arguments, sometimes it emits an argument with byval attribute. A byval attribute is considered to be local to the function (just like alloca) and hence it can be easily determined that it does not alias other values. If in the IR there exists a memcpy from a byval argument to a local alloca, and then from that local alloca to another alloca, MemCpyOpt will optimize out the first memcpy because byval argument's content will not change. This causes issues because after a coroutine suspension, the byval argument may die outside of the function, and latter uses will lead to memory use-after-free.
This is only a problem for arguments with either byval attribute or noalias attribute, because only these two kinds are considered local. Arguments without these two attributes will be considered to alias coro_suspend and hence we won't have this problem. So we need to be able to deal with these two attributes in coroutines properly.
For noalias arguments, since coro_suspend may potentially change the value of any argument outside of the function, we simply shouldn't mark any argument in a coroutiune as noalias. This can be taken care of in CoroEarly pass.
For byval arguments, if such an argument needs to live across suspensions, we will have to copy their value content to the frame, not just the pointer.

Differential Revision: https://reviews.llvm.org/D104184

(cherry picked from commit 3522167efd80e2fef42a865cdf7481d60d062603)
---
 llvm/lib/Transforms/Coroutines/CoroEarly.cpp  |   9 ++
 llvm/lib/Transforms/Coroutines/CoroFrame.cpp  |  31 ++++-
 .../Transforms/Coroutines/coro-byval-param.ll | 127 ++++++++++++++++++
 .../Coroutines/coro-noalias-param.ll          |  40 ++++++
 4 files changed, 202 insertions(+), 5 deletions(-)
 create mode 100644 llvm/test/Transforms/Coroutines/coro-byval-param.ll
 create mode 100644 llvm/test/Transforms/Coroutines/coro-noalias-param.ll

diff --git a/llvm/lib/Transforms/Coroutines/CoroEarly.cpp b/llvm/lib/Transforms/Coroutines/CoroEarly.cpp
index 1660e41ba830..5e5e513cdfda 100644
--- a/llvm/lib/Transforms/Coroutines/CoroEarly.cpp
+++ b/llvm/lib/Transforms/Coroutines/CoroEarly.cpp
@@ -149,6 +149,7 @@ bool Lowerer::lowerEarlyIntrinsics(Function &F) {
   bool Changed = false;
   CoroIdInst *CoroId = nullptr;
   SmallVector<CoroFreeInst *, 4> CoroFrees;
+  bool HasCoroSuspend = false;
   for (auto IB = inst_begin(F), IE = inst_end(F); IB != IE;) {
     Instruction &I = *IB++;
     if (auto *CB = dyn_cast<CallBase>(&I)) {
@@ -163,6 +164,7 @@ bool Lowerer::lowerEarlyIntrinsics(Function &F) {
         // pass expects that there is at most one final suspend point.
         if (cast<CoroSuspendInst>(&I)->isFinal())
           CB->setCannotDuplicate();
+        HasCoroSuspend = true;
         break;
       case Intrinsic::coro_end_async:
       case Intrinsic::coro_end:
@@ -213,6 +215,13 @@ bool Lowerer::lowerEarlyIntrinsics(Function &F) {
   if (CoroId)
     for (CoroFreeInst *CF : CoroFrees)
       CF->setArgOperand(0, CoroId);
+  // Coroutine suspention could potentially lead to any argument modified
+  // outside of the function, hence arguments should not have noalias
+  // attributes.
+  if (HasCoroSuspend)
+    for (Argument &A : F.args())
+      if (A.hasNoAliasAttr())
+        A.removeAttr(Attribute::NoAlias);
   return Changed;
 }
 
diff --git a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp
index e53e7605b254..e1e0d50979dc 100644
--- a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp
+++ b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp
@@ -781,7 +781,13 @@ static StructType *buildFrameType(Function &F, coro::Shape &Shape,
         PromiseAlloca, DenseMap<Instruction *, llvm::Optional<APInt>>{}, false);
   // Create an entry for every spilled value.
   for (auto &S : FrameData.Spills) {
-    FieldIDType Id = B.addField(S.first->getType(), None);
+    Type *FieldType = S.first->getType();
+    // For byval arguments, we need to store the pointed value in the frame,
+    // instead of the pointer itself.
+    if (const Argument *A = dyn_cast<Argument>(S.first))
+      if (A->hasByValAttr())
+        FieldType = FieldType->getPointerElementType();
+    FieldIDType Id = B.addField(FieldType, None);
     FrameData.setFieldIndex(S.first, Id);
   }
 
@@ -1149,6 +1155,7 @@ static Instruction *insertSpills(const FrameDataInfo &FrameData,
     // Create a store instruction storing the value into the
     // coroutine frame.
     Instruction *InsertPt = nullptr;
+    bool NeedToCopyArgPtrValue = false;
     if (auto *Arg = dyn_cast<Argument>(Def)) {
       // For arguments, we will place the store instruction right after
       // the coroutine frame pointer instruction, i.e. bitcast of
@@ -1159,6 +1166,9 @@ static Instruction *insertSpills(const FrameDataInfo &FrameData,
       // from the coroutine function.
       Arg->getParent()->removeParamAttr(Arg->getArgNo(), Attribute::NoCapture);
 
+      if (Arg->hasByValAttr())
+        NeedToCopyArgPtrValue = true;
+
     } else if (auto *CSI = dyn_cast<AnyCoroSuspendInst>(Def)) {
       // Don't spill immediately after a suspend; splitting assumes
       // that the suspend will be followed by a branch.
@@ -1193,7 +1203,15 @@ static Instruction *insertSpills(const FrameDataInfo &FrameData,
     Builder.SetInsertPoint(InsertPt);
     auto *G = Builder.CreateConstInBoundsGEP2_32(
         FrameTy, FramePtr, 0, Index, Def->getName() + Twine(".spill.addr"));
-    Builder.CreateStore(Def, G);
+    if (NeedToCopyArgPtrValue) {
+      // For byval arguments, we need to store the pointed value in the frame,
+      // instead of the pointer itself.
+      auto *Value =
+          Builder.CreateLoad(Def->getType()->getPointerElementType(), Def);
+      Builder.CreateStore(Value, G);
+    } else {
+      Builder.CreateStore(Def, G);
+    }
 
     BasicBlock *CurrentBlock = nullptr;
     Value *CurrentReload = nullptr;
@@ -1207,9 +1225,12 @@ static Instruction *insertSpills(const FrameDataInfo &FrameData,
 
         auto *GEP = GetFramePointer(E.first);
         GEP->setName(E.first->getName() + Twine(".reload.addr"));
-        CurrentReload = Builder.CreateLoad(
-            FrameTy->getElementType(FrameData.getFieldIndex(E.first)), GEP,
-            E.first->getName() + Twine(".reload"));
+        if (NeedToCopyArgPtrValue)
+          CurrentReload = GEP;
+        else
+          CurrentReload = Builder.CreateLoad(
+              FrameTy->getElementType(FrameData.getFieldIndex(E.first)), GEP,
+              E.first->getName() + Twine(".reload"));
 
         TinyPtrVector<DbgDeclareInst *> DIs = FindDbgDeclareUses(Def);
         for (DbgDeclareInst *DDI : DIs) {
diff --git a/llvm/test/Transforms/Coroutines/coro-byval-param.ll b/llvm/test/Transforms/Coroutines/coro-byval-param.ll
new file mode 100644
index 000000000000..6c3c4582fc8b
--- /dev/null
+++ b/llvm/test/Transforms/Coroutines/coro-byval-param.ll
@@ -0,0 +1,127 @@
+; RUN: opt < %s -passes=coro-split -S | FileCheck %s
+%promise_type = type { i8 }
+%struct.A = type <{ i64, i64, i32, [4 x i8] }>
+
+; Function Attrs: noinline ssp uwtable mustprogress
+define %promise_type* @foo(%struct.A* nocapture readonly byval(%struct.A) align 8 %a1) #0 {
+entry:
+  %__promise = alloca %promise_type, align 1
+  %a2 = alloca %struct.A, align 8
+  %0 = getelementptr inbounds %promise_type, %promise_type* %__promise, i64 0, i32 0
+  %1 = call token @llvm.coro.id(i32 16, i8* nonnull %0, i8* bitcast (%promise_type* (%struct.A*)* @foo to i8*), i8* null)
+  %2 = call i1 @llvm.coro.alloc(token %1)
+  br i1 %2, label %coro.alloc, label %coro.init
+
+coro.alloc:                                       ; preds = %entry
+  %3 = call i64 @llvm.coro.size.i64()
+  %call = call noalias nonnull i8* @_Znwm(i64 %3) #9
+  br label %coro.init
+
+coro.init:                                        ; preds = %coro.alloc, %entry
+  %4 = phi i8* [ null, %entry ], [ %call, %coro.alloc ]
+  %5 = call i8* @llvm.coro.begin(token %1, i8* %4) #10
+  %6 = bitcast %struct.A* %a1 to i8*
+  call void @llvm.lifetime.start.p0i8(i64 1, i8* nonnull %0) #2
+  %call2 = call %promise_type* @_ZN4task12promise_type17get_return_objectEv(%promise_type* nonnull dereferenceable(1) %__promise)
+  call void @initial_suspend(%promise_type* nonnull dereferenceable(1) %__promise)
+  %7 = call token @llvm.coro.save(i8* null)
+  call fastcc void @_ZNSt12experimental13coroutines_v116coroutine_handleIN4task12promise_typeEE12from_addressEPv(i8* %5) #2
+  %8 = call i8 @llvm.coro.suspend(token %7, i1 false)
+  switch i8 %8, label %coro.ret [
+    i8 0, label %init.ready
+    i8 1, label %cleanup33
+  ]
+
+init.ready:                                       ; preds = %coro.init
+  %9 = bitcast %struct.A* %a2 to i8*
+  call void @llvm.lifetime.start.p0i8(i64 24, i8* nonnull %9) #2
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %9, i8* align 8 %6, i64 24, i1 false)
+  call void @llvm.lifetime.end.p0i8(i64 24, i8* nonnull %9) #2
+  call void @_ZN4task12promise_type13final_suspendEv(%promise_type* nonnull dereferenceable(1) %__promise) #2
+  %10 = call token @llvm.coro.save(i8* null)
+  call fastcc void @_ZNSt12experimental13coroutines_v116coroutine_handleIN4task12promise_typeEE12from_addressEPv(i8* %5) #2
+  %11 = call i8 @llvm.coro.suspend(token %10, i1 true) #10
+  %switch = icmp ult i8 %11, 2
+  br i1 %switch, label %cleanup33, label %coro.ret
+
+cleanup33:                                        ; preds = %init.ready, %coro.init
+  call void @llvm.lifetime.end.p0i8(i64 1, i8* nonnull %0) #2
+  %12 = call i8* @llvm.coro.free(token %1, i8* %5)
+  %.not = icmp eq i8* %12, null
+  br i1 %.not, label %coro.ret, label %coro.free
+
+coro.free:                                        ; preds = %cleanup33
+  call void @_ZdlPv(i8* nonnull %12) #2
+  br label %coro.ret
+
+coro.ret:                                         ; preds = %coro.free, %cleanup33, %init.ready, %coro.init
+  %13 = call i1 @llvm.coro.end(i8* null, i1 false) #10
+  ret %promise_type* %call2
+}
+
+; check that the frame contains the entire struct, instead of just the struct pointer
+; CHECK: %foo.Frame = type { void (%foo.Frame*)*, void (%foo.Frame*)*, %promise_type, %struct.A, i1 }
+
+; Function Attrs: argmemonly nounwind readonly
+declare token @llvm.coro.id(i32, i8* readnone, i8* nocapture readonly, i8*) #1
+
+; Function Attrs: nounwind
+declare i1 @llvm.coro.alloc(token) #2
+
+; Function Attrs: nobuiltin nofree allocsize(0)
+declare nonnull i8* @_Znwm(i64) local_unnamed_addr #3
+
+; Function Attrs: nounwind readnone
+declare i64 @llvm.coro.size.i64() #4
+
+; Function Attrs: nounwind
+declare i8* @llvm.coro.begin(token, i8* writeonly) #2
+
+; Function Attrs: argmemonly nofree nosync nounwind willreturn
+declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) #5
+
+; Function Attrs: argmemonly nofree nounwind willreturn
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i64, i1 immarg) #6
+
+; Function Attrs: noinline nounwind ssp uwtable willreturn mustprogress
+declare %promise_type* @_ZN4task12promise_type17get_return_objectEv(%promise_type* nonnull dereferenceable(1)) local_unnamed_addr #7 align 2
+
+; Function Attrs: noinline nounwind ssp uwtable willreturn mustprogress
+declare void @initial_suspend(%promise_type* nonnull dereferenceable(1)) local_unnamed_addr #7 align 2
+
+; Function Attrs: nounwind
+declare token @llvm.coro.save(i8*) #2
+
+; Function Attrs: noinline nounwind ssp uwtable willreturn mustprogress
+declare hidden fastcc void @_ZNSt12experimental13coroutines_v116coroutine_handleIN4task12promise_typeEE12from_addressEPv(i8*) unnamed_addr #7 align 2
+
+; Function Attrs: argmemonly nofree nosync nounwind willreturn
+declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture) #5
+
+; Function Attrs: nounwind
+declare i8 @llvm.coro.suspend(token, i1) #2
+
+; Function Attrs: noinline nounwind ssp uwtable willreturn mustprogress
+declare void @_ZN4task12promise_type13final_suspendEv(%promise_type* nonnull dereferenceable(1)) local_unnamed_addr #7 align 2
+
+; Function Attrs: nounwind
+declare i1 @llvm.coro.end(i8*, i1) #2
+
+; Function Attrs: nobuiltin nounwind
+declare void @_ZdlPv(i8*) local_unnamed_addr #8
+
+; Function Attrs: argmemonly nounwind readonly
+declare i8* @llvm.coro.free(token, i8* nocapture readonly) #1
+
+attributes #0 = { noinline ssp uwtable mustprogress "coroutine.presplit"="1" "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+cx8,+fxsr,+mmx,+sahf,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "tune-cpu"="generic" }
+attributes #1 = { argmemonly nounwind readonly }
+attributes #2 = { nounwind }
+attributes #3 = { nobuiltin nofree allocsize(0) "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+cx8,+fxsr,+mmx,+sahf,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "tune-cpu"="generic" }
+attributes #4 = { nounwind readnone }
+attributes #5 = { argmemonly nofree nosync nounwind willreturn }
+attributes #6 = { argmemonly nofree nounwind willreturn }
+attributes #7 = { noinline nounwind ssp uwtable willreturn mustprogress "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+cx8,+fxsr,+mmx,+sahf,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "tune-cpu"="generic" }
+attributes #8 = { nobuiltin nounwind "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+cx8,+fxsr,+mmx,+sahf,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "tune-cpu"="generic" }
+attributes #9 = { allocsize(0) }
+attributes #10 = { noduplicate }
+
diff --git a/llvm/test/Transforms/Coroutines/coro-noalias-param.ll b/llvm/test/Transforms/Coroutines/coro-noalias-param.ll
new file mode 100644
index 000000000000..0b9a70ad0366
--- /dev/null
+++ b/llvm/test/Transforms/Coroutines/coro-noalias-param.ll
@@ -0,0 +1,40 @@
+; RUN: opt < %s -S -passes=coro-early | FileCheck %s
+%struct.A = type <{ i64, i64, i32, [4 x i8] }>
+
+define void @f(%struct.A* nocapture readonly noalias align 8 %a) {
+  %id = call token @llvm.coro.id(i32 0, i8* null, i8* null, i8* null)
+  %size = call i32 @llvm.coro.size.i32()
+  %alloc = call i8* @malloc(i32 %size)
+  %hdl = call i8* @llvm.coro.begin(token %id, i8* %alloc)
+  call void @print(i32 0)
+  %s1 = call i8 @llvm.coro.suspend(token none, i1 false)
+  switch i8 %s1, label %suspend [i8 0, label %resume 
+                                 i8 1, label %cleanup]
+resume:
+  call void @print(i32 1)
+  br label %cleanup
+
+cleanup:
+  %mem = call i8* @llvm.coro.free(token %id, i8* %hdl)
+  call void @free(i8* %mem)
+  br label %suspend
+suspend:
+  call i1 @llvm.coro.end(i8* %hdl, i1 0)
+  ret void  
+}
+
+; check that the noalias attribute is removed from the argument
+; CHECK: define void @f(%struct.A* nocapture readonly align 8 %a)
+
+declare token @llvm.coro.id(i32, i8*, i8*, i8*)
+declare i8* @llvm.coro.begin(token, i8*)
+declare i8* @llvm.coro.free(token, i8*)
+declare i32 @llvm.coro.size.i32()
+declare i8  @llvm.coro.suspend(token, i1)
+declare void @llvm.coro.resume(i8*)
+declare void @llvm.coro.destroy(i8*)
+declare i1 @llvm.coro.end(i8*, i1) 
+
+declare noalias i8* @malloc(i32)
+declare void @print(i32)
+declare void @free(i8*)

From 02b775a5efb6127e289bb00d91b88a303d51c85a Mon Sep 17 00:00:00 2001
From: Jan Kratochvil <jan.kratochvil@redhat.com>
Date: Tue, 9 Mar 2021 08:31:23 +0100
Subject: [PATCH 305/318] [nfc] llvm-dwarfdump:
 DWARFAbbreviationDeclaration::AttributeSpec -> DWARFAttribute

`AttributeSpec` does not contain values while `DWARFAttribute` already
does. Therefore one no longer needs to pass `uint64_t *OffsetPtr`.

Differential Revision: https://reviews.llvm.org/D98194

(cherry picked from commit ba8907bf6f2cf5ca2f6d92a6dfe7bc9cf74f003f)
---
 llvm/lib/DebugInfo/DWARF/DWARFDie.cpp | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp b/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp
index 427f6f4942c3..c2760dd02471 100644
--- a/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp
+++ b/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp
@@ -69,7 +69,7 @@ static void dumpRanges(const DWARFObject &Obj, raw_ostream &OS,
   }
 }
 
-static void dumpLocation(raw_ostream &OS, DWARFFormValue &FormValue,
+static void dumpLocation(raw_ostream &OS, const DWARFFormValue &FormValue,
                          DWARFUnit *U, unsigned Indent,
                          DIDumpOptions DumpOpts) {
   DWARFContext &Ctx = U->getContext();
@@ -230,21 +230,22 @@ static void dumpTypeName(raw_ostream &OS, const DWARFDie &D) {
 }
 
 static void dumpAttribute(raw_ostream &OS, const DWARFDie &Die,
-                          uint64_t *OffsetPtr, dwarf::Attribute Attr,
-                          dwarf::Form Form, unsigned Indent,
+                          const DWARFAttribute &AttrValue, unsigned Indent,
                           DIDumpOptions DumpOpts) {
   if (!Die.isValid())
     return;
   const char BaseIndent[] = "            ";
   OS << BaseIndent;
   OS.indent(Indent + 2);
+  dwarf::Attribute Attr = AttrValue.Attr;
   WithColor(OS, HighlightColor::Attribute) << formatv("{0}", Attr);
 
+  dwarf::Form Form = AttrValue.Value.getForm();
   if (DumpOpts.Verbose || DumpOpts.ShowForm)
     OS << formatv(" [{0}]", Form);
 
   DWARFUnit *U = Die.getDwarfUnit();
-  DWARFFormValue FormValue = DWARFFormValue::createFromUnit(Form, U, OffsetPtr);
+  const DWARFFormValue &FormValue = AttrValue.Value;
 
   OS << "\t(";
 
@@ -631,15 +632,14 @@ void DWARFDie::dump(raw_ostream &OS, unsigned Indent,
         OS << '\n';
 
         // Dump all data in the DIE for the attributes.
-        for (const auto &AttrSpec : AbbrevDecl->attributes()) {
-          if (AttrSpec.Form == DW_FORM_implicit_const) {
+        for (const DWARFAttribute &AttrValue : attributes()) {
+          if (AttrValue.Value.getForm() == DW_FORM_implicit_const) {
             // We are dumping .debug_info section ,
             // implicit_const attribute values are not really stored here,
             // but in .debug_abbrev section. So we just skip such attrs.
             continue;
           }
-          dumpAttribute(OS, *this, &offset, AttrSpec.Attr, AttrSpec.Form,
-                        Indent, DumpOpts);
+          dumpAttribute(OS, *this, AttrValue, Indent, DumpOpts);
         }
 
         DWARFDie child = getFirstChild();

From e8a397203c67adbeae04763ce25c6a5ae76af52c Mon Sep 17 00:00:00 2001
From: Jan Kratochvil <jan.kratochvil@redhat.com>
Date: Tue, 9 Mar 2021 09:26:17 +0100
Subject: [PATCH 306/318] llvm-dwarfdump: Fix DWARF-5 DW_FORM_implicit_const
 (used by GCC)

Differential Revision: https://reviews.llvm.org/D98195

(cherry picked from commit 4289a7f1d78972e9f1fa173c8ee0f6b8b45223d7)
---
 .../DWARF/DWARFAbbreviationDeclaration.h      | 10 ++++++
 llvm/lib/DebugInfo/DWARF/DWARFDie.cpp         | 23 ++++++-------
 llvm/lib/DebugInfo/DWARF/DWARFFormValue.cpp   |  5 +++
 llvm/test/DebugInfo/implicit-const-test2.s    | 34 +++++++++++++++++++
 4 files changed, 60 insertions(+), 12 deletions(-)
 create mode 100644 llvm/test/DebugInfo/implicit-const-test2.s

diff --git a/llvm/include/llvm/DebugInfo/DWARF/DWARFAbbreviationDeclaration.h b/llvm/include/llvm/DebugInfo/DWARF/DWARFAbbreviationDeclaration.h
index 39ae53c4e7fe..cf4c827b9267 100644
--- a/llvm/include/llvm/DebugInfo/DWARF/DWARFAbbreviationDeclaration.h
+++ b/llvm/include/llvm/DebugInfo/DWARF/DWARFAbbreviationDeclaration.h
@@ -111,6 +111,16 @@ class DWARFAbbreviationDeclaration {
     return AttributeSpecs[idx].Attr;
   }
 
+  bool getAttrIsImplicitConstByIndex(uint32_t idx) const {
+    assert(idx < AttributeSpecs.size());
+    return AttributeSpecs[idx].isImplicitConst();
+  }
+
+  int64_t getAttrImplicitConstValueByIndex(uint32_t idx) const {
+    assert(idx < AttributeSpecs.size());
+    return AttributeSpecs[idx].getImplicitConstValue();
+  }
+
   /// Get the index of the specified attribute.
   ///
   /// Searches the this abbreviation declaration for the index of the specified
diff --git a/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp b/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp
index c2760dd02471..5a55f3a04148 100644
--- a/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp
+++ b/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp
@@ -632,15 +632,8 @@ void DWARFDie::dump(raw_ostream &OS, unsigned Indent,
         OS << '\n';
 
         // Dump all data in the DIE for the attributes.
-        for (const DWARFAttribute &AttrValue : attributes()) {
-          if (AttrValue.Value.getForm() == DW_FORM_implicit_const) {
-            // We are dumping .debug_info section ,
-            // implicit_const attribute values are not really stored here,
-            // but in .debug_abbrev section. So we just skip such attrs.
-            continue;
-          }
+        for (const DWARFAttribute &AttrValue : attributes())
           dumpAttribute(OS, *this, AttrValue, Indent, DumpOpts);
-        }
 
         DWARFDie child = getFirstChild();
         if (DumpOpts.ShowChildren && DumpOpts.ChildRecurseDepth > 0 && child) {
@@ -723,10 +716,16 @@ void DWARFDie::attribute_iterator::updateForIndex(
     // Add the previous byte size of any previous attribute value.
     AttrValue.Offset += AttrValue.ByteSize;
     uint64_t ParseOffset = AttrValue.Offset;
-    auto U = Die.getDwarfUnit();
-    assert(U && "Die must have valid DWARF unit");
-    AttrValue.Value = DWARFFormValue::createFromUnit(
-        AbbrDecl.getFormByIndex(Index), U, &ParseOffset);
+    if (AbbrDecl.getAttrIsImplicitConstByIndex(Index))
+      AttrValue.Value = DWARFFormValue::createFromSValue(
+          AbbrDecl.getFormByIndex(Index),
+          AbbrDecl.getAttrImplicitConstValueByIndex(Index));
+    else {
+      auto U = Die.getDwarfUnit();
+      assert(U && "Die must have valid DWARF unit");
+      AttrValue.Value = DWARFFormValue::createFromUnit(
+          AbbrDecl.getFormByIndex(Index), U, &ParseOffset);
+    }
     AttrValue.ByteSize = ParseOffset - AttrValue.Offset;
   } else {
     assert(Index == NumAttrs && "Indexes should be [0, NumAttrs) only");
diff --git a/llvm/lib/DebugInfo/DWARF/DWARFFormValue.cpp b/llvm/lib/DebugInfo/DWARF/DWARFFormValue.cpp
index 7a84605211fb..2559765876d9 100644
--- a/llvm/lib/DebugInfo/DWARF/DWARFFormValue.cpp
+++ b/llvm/lib/DebugInfo/DWARF/DWARFFormValue.cpp
@@ -168,6 +168,7 @@ bool DWARFFormValue::skipValue(dwarf::Form Form, DataExtractor DebugInfoData,
     case DW_FORM_line_strp:
     case DW_FORM_GNU_ref_alt:
     case DW_FORM_GNU_strp_alt:
+    case DW_FORM_implicit_const:
       if (Optional<uint8_t> FixedSize =
               dwarf::getFixedFormByteSize(Form, Params)) {
         *OffsetPtr += *FixedSize;
@@ -345,6 +346,9 @@ bool DWARFFormValue::extractValue(const DWARFDataExtractor &Data,
     case DW_FORM_ref_sig8:
       Value.uval = Data.getU64(OffsetPtr, &Err);
       break;
+    case DW_FORM_implicit_const:
+      // Value has been already set by DWARFFormValue::createFromSValue.
+      break;
     default:
       // DWARFFormValue::skipValue() will have caught this and caused all
       // DWARF DIEs to fail to be parsed, so this code is not be reachable.
@@ -482,6 +486,7 @@ void DWARFFormValue::dump(raw_ostream &OS, DIDumpOptions DumpOpts) const {
     break;
 
   case DW_FORM_sdata:
+  case DW_FORM_implicit_const:
     OS << Value.sval;
     break;
   case DW_FORM_udata:
diff --git a/llvm/test/DebugInfo/implicit-const-test2.s b/llvm/test/DebugInfo/implicit-const-test2.s
new file mode 100644
index 000000000000..dd50592afde9
--- /dev/null
+++ b/llvm/test/DebugInfo/implicit-const-test2.s
@@ -0,0 +1,34 @@
+# REQUIRES: x86-registered-target
+
+# RUN: llvm-mc --filetype=obj --triple=x86_64-pc-linux %s -o %t.o -g
+
+# RUN: llvm-dwarfdump -v %t.o | FileCheck %s
+
+# CHECK:      [1] DW_TAG_compile_unit	DW_CHILDREN_no
+# CHECK-NEXT: DW_AT_language	DW_FORM_implicit_const	29
+
+# CHECK:      0x0000000c: DW_TAG_compile_unit [1]  
+# CHECK-NEXT: DW_AT_language [DW_FORM_implicit_const]	(DW_LANG_C11)
+
+	.section	.debug_info,"",@progbits
+.Ldebug_info0:
+	.long	.Ldebug_info0_end - .Ldebug_info0_start	# Length of Compilation Unit Info
+.Ldebug_info0_start:
+	.value	0x5	# DWARF version number
+	.byte	0x1	# DW_UT_compile
+	.byte	0x8	# Pointer Size (in bytes)
+	.long	.Ldebug_abbrev0	# Offset Into Abbrev. Section
+	.uleb128 0x1	# (DIE DW_TAG_compile_unit)
+			# DW_AT_language
+.Ldebug_info0_end:
+	.section	.debug_abbrev,"",@progbits
+.Ldebug_abbrev0:
+	.uleb128 0x1	# (abbrev code)
+	.uleb128 0x11	# (TAG: DW_TAG_compile_unit)
+	.byte	0x0	# DW_children_no
+	.uleb128 0x13	# (DW_AT_language)
+	.uleb128 0x21   # (DW_FORM_implicit_const)
+	.sleb128 0x1d
+	.byte	0
+	.byte	0
+	.byte	0

From b6ff4dd2e99e86390321fdd43a22c93d0659fe2a Mon Sep 17 00:00:00 2001
From: Sean Fertile <sd.fertile@gmail.com>
Date: Wed, 17 Feb 2021 09:22:43 -0500
Subject: [PATCH 307/318] [PowerPC] Handle FP physical register in inline asm
 constraint.

Do not defer to the base class when the register constraint is a
physical fpr. The base class will select SPILLTOVSRRC as the register
class and register allocation will fail on subtargets without VSX
registers.

Differential Revision: https://reviews.llvm.org/D91629

(cherry picked from commit 4e127bce2d1133ba95a551d69bd0e8fc3b4f9e71)
---
 llvm/lib/Target/PowerPC/PPCISelLowering.cpp   | 41 ++++++++++++++-----
 .../PowerPC/inline-asm-physical-fpr-spe.ll    | 23 +++++++++++
 .../PowerPC/inline-asm-physical-fpr.ll        | 26 ++++++++++++
 3 files changed, 80 insertions(+), 10 deletions(-)
 create mode 100644 llvm/test/CodeGen/PowerPC/inline-asm-physical-fpr-spe.ll
 create mode 100644 llvm/test/CodeGen/PowerPC/inline-asm-physical-fpr.ll

diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 7833bfc1d1b6..26dc3afc899e 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -15154,17 +15154,38 @@ PPCTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
       return std::make_pair(0U, &PPC::LRRCRegClass);
   }
 
-  // If we name a VSX register, we can't defer to the base class because it
-  // will not recognize the correct register (their names will be VSL{0-31}
-  // and V{0-31} so they won't match). So we match them here.
-  if (Constraint.size() > 3 && Constraint[1] == 'v' && Constraint[2] == 's') {
-    int VSNum = atoi(Constraint.data() + 3);
-    assert(VSNum >= 0 && VSNum <= 63 &&
-           "Attempted to access a vsr out of range");
-    if (VSNum < 32)
-      return std::make_pair(PPC::VSL0 + VSNum, &PPC::VSRCRegClass);
-    return std::make_pair(PPC::V0 + VSNum - 32, &PPC::VSRCRegClass);
+  // Handle special cases of physical registers that are not properly handled
+  // by the base class.
+  if (Constraint[0] == '{' && Constraint[Constraint.size() - 1] == '}') {
+    // If we name a VSX register, we can't defer to the base class because it
+    // will not recognize the correct register (their names will be VSL{0-31}
+    // and V{0-31} so they won't match). So we match them here.
+    if (Constraint.size() > 3 && Constraint[1] == 'v' && Constraint[2] == 's') {
+      int VSNum = atoi(Constraint.data() + 3);
+      assert(VSNum >= 0 && VSNum <= 63 &&
+             "Attempted to access a vsr out of range");
+      if (VSNum < 32)
+        return std::make_pair(PPC::VSL0 + VSNum, &PPC::VSRCRegClass);
+      return std::make_pair(PPC::V0 + VSNum - 32, &PPC::VSRCRegClass);
+    }
+
+    // For float registers, we can't defer to the base class as it will match
+    // the SPILLTOVSRRC class.
+    if (Constraint.size() > 3 && Constraint[1] == 'f') {
+      int RegNum = atoi(Constraint.data() + 2);
+      if (RegNum > 31 || RegNum < 0)
+        report_fatal_error("Invalid floating point register number");
+      if (VT == MVT::f32 || VT == MVT::i32)
+        return Subtarget.hasSPE()
+                   ? std::make_pair(PPC::R0 + RegNum, &PPC::GPRCRegClass)
+                   : std::make_pair(PPC::F0 + RegNum, &PPC::F4RCRegClass);
+      if (VT == MVT::f64 || VT == MVT::i64)
+        return Subtarget.hasSPE()
+                   ? std::make_pair(PPC::S0 + RegNum, &PPC::SPERCRegClass)
+                   : std::make_pair(PPC::F0 + RegNum, &PPC::F8RCRegClass);
+    }
   }
+
   std::pair<unsigned, const TargetRegisterClass *> R =
       TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
 
diff --git a/llvm/test/CodeGen/PowerPC/inline-asm-physical-fpr-spe.ll b/llvm/test/CodeGen/PowerPC/inline-asm-physical-fpr-spe.ll
new file mode 100644
index 000000000000..94147501878f
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/inline-asm-physical-fpr-spe.ll
@@ -0,0 +1,23 @@
+; RUN: llc -verify-machineinstrs < %s -mtriple=powerpc-unknown-linux-gnu \
+; RUN:          -mattr=+spe |  FileCheck %s
+
+define i32 @test_f32(float %x) {
+; CHECK-LABEL: test_f32:
+; CHECK:         #APP
+; CHECK-NEXT:    efsctsi 31, 3
+; CHECK-NEXT:    #NO_APP
+entry:
+  %0 = call i32 asm sideeffect "efsctsi $0, $1", "={f31},f"(float %x)
+  ret i32 %0
+}
+
+define i32 @test_f64(double %x) {
+; CHECK-LABEL: test_f64:
+; CHECK:         #APP
+; CHECK-NEXT:    efdctsi 0, 3
+; CHECK-NEXT:    #NO_APP
+entry:
+  %0 = call i32 asm sideeffect "efdctsi $0, $1", "={f0},d"(double %x)
+  ret i32 %0
+}
+
diff --git a/llvm/test/CodeGen/PowerPC/inline-asm-physical-fpr.ll b/llvm/test/CodeGen/PowerPC/inline-asm-physical-fpr.ll
new file mode 100644
index 000000000000..132b5d150054
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/inline-asm-physical-fpr.ll
@@ -0,0 +1,26 @@
+; RUN: llc -mcpu=pwr7 -mattr=-altivec -verify-machineinstrs \
+; RUN:     -mtriple=powerpc-unknown-aix < %s  | FileCheck %s
+
+; RUN: llc -mcpu=pwr7 -mattr=-altivec -verify-machineinstrs \
+; RUN:     -mtriple=powerpc64-unknown-aix < %s | FileCheck %s
+
+
+define dso_local double @test_double(double %a, double %b) {
+entry:
+  %0 = tail call double asm "fadd. $0,$1,$2\0A", "={f31},d,d,0"(double %a, double %b, double 0.000000e+00)
+  ret double %0
+}
+
+; CHECK-LABEL: test_double
+; CHECK:         #APP
+; CHECK-NEXT:    fadd. 31,1,2
+
+define dso_local signext i32 @test_int(double %a, double %b) {
+entry:
+  %0 = tail call i32 asm "fadd. $0,$1,$2\0A", "={f0},d,d,0"(double %a, double %b, i32 0)
+  ret i32 %0
+}
+
+; CHECK-LABEL: test_int
+; CHECK:         #APP
+; CHECK-NEXT:    fadd. 0,1,2

From 07234c7d6bc246925710b88a1f9552f678587165 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Christoffer=20Lern=C3=B6?= <christoffer.lerno@gmail.com>
Date: Fri, 19 Mar 2021 18:55:52 -0400
Subject: [PATCH 308/318] Add type attributes to LLVM C API

The LLVM C API is missing type attributes as is needed by attributes
such as sret and byval. This patch adds three missing wrapper
functions.

Bugzilla: https://bugs.llvm.org/show_bug.cgi?id=48249

https://reviews.llvm.org/D97763
(cherry picked from commit 528f6f7d617757addac9b51dd5bcc1ab1352e9be)
---
 llvm/include/llvm-c/Core.h | 12 ++++++++++++
 llvm/lib/IR/Core.cpp       | 16 ++++++++++++++++
 2 files changed, 28 insertions(+)

diff --git a/llvm/include/llvm-c/Core.h b/llvm/include/llvm-c/Core.h
index a78df16ca404..2901ab715810 100644
--- a/llvm/include/llvm-c/Core.h
+++ b/llvm/include/llvm-c/Core.h
@@ -604,6 +604,17 @@ unsigned LLVMGetEnumAttributeKind(LLVMAttributeRef A);
  */
 uint64_t LLVMGetEnumAttributeValue(LLVMAttributeRef A);
 
+/**
+ * Create a type attribute
+ */
+LLVMAttributeRef LLVMCreateTypeAttribute(LLVMContextRef C, unsigned KindID,
+                                         LLVMTypeRef type_ref);
+
+/**
+ * Get the type attribute's value.
+ */
+LLVMTypeRef LLVMGetTypeAttributeValue(LLVMAttributeRef A);
+
 /**
  * Create a string attribute.
  */
@@ -626,6 +637,7 @@ const char *LLVMGetStringAttributeValue(LLVMAttributeRef A, unsigned *Length);
  */
 LLVMBool LLVMIsEnumAttribute(LLVMAttributeRef A);
 LLVMBool LLVMIsStringAttribute(LLVMAttributeRef A);
+LLVMBool LLVMIsTypeAttribute(LLVMAttributeRef A);
 
 /**
  * Obtain a Type from a context by its registered name.
diff --git a/llvm/lib/IR/Core.cpp b/llvm/lib/IR/Core.cpp
index 90ba69069bae..039b34ace6ab 100644
--- a/llvm/lib/IR/Core.cpp
+++ b/llvm/lib/IR/Core.cpp
@@ -164,6 +164,18 @@ uint64_t LLVMGetEnumAttributeValue(LLVMAttributeRef A) {
   return Attr.getValueAsInt();
 }
 
+LLVMAttributeRef LLVMCreateTypeAttribute(LLVMContextRef C, unsigned KindID,
+                                         LLVMTypeRef type_ref) {
+  auto &Ctx = *unwrap(C);
+  auto AttrKind = (Attribute::AttrKind)KindID;
+  return wrap(Attribute::get(Ctx, AttrKind, unwrap(type_ref)));
+}
+
+LLVMTypeRef LLVMGetTypeAttributeValue(LLVMAttributeRef A) {
+  auto Attr = unwrap(A);
+  return wrap(Attr.getValueAsType());
+}
+
 LLVMAttributeRef LLVMCreateStringAttribute(LLVMContextRef C,
                                            const char *K, unsigned KLength,
                                            const char *V, unsigned VLength) {
@@ -194,6 +206,10 @@ LLVMBool LLVMIsStringAttribute(LLVMAttributeRef A) {
   return unwrap(A).isStringAttribute();
 }
 
+LLVMBool LLVMIsTypeAttribute(LLVMAttributeRef A) {
+  return unwrap(A).isTypeAttribute();
+}
+
 char *LLVMGetDiagInfoDescription(LLVMDiagnosticInfoRef DI) {
   std::string MsgStorage;
   raw_string_ostream Stream(MsgStorage);

From fd27379463917761a5e6135cfdb0b2d163a72ee3 Mon Sep 17 00:00:00 2001
From: David Green <david.green@arm.com>
Date: Wed, 9 Jun 2021 20:48:42 +0100
Subject: [PATCH 309/318] [ARM] MVE VPT block tests with debug info. NFC

---
 .../CodeGen/Thumb2/mve-vpt-block-debug.mir    | 114 ++++++++++++++++++
 1 file changed, 114 insertions(+)
 create mode 100644 llvm/test/CodeGen/Thumb2/mve-vpt-block-debug.mir

diff --git a/llvm/test/CodeGen/Thumb2/mve-vpt-block-debug.mir b/llvm/test/CodeGen/Thumb2/mve-vpt-block-debug.mir
new file mode 100644
index 000000000000..cb36f48934f1
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/mve-vpt-block-debug.mir
@@ -0,0 +1,114 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -run-pass arm-mve-vpt %s -o - | FileCheck %s
+
+--- |
+  target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
+  target triple = "thumbv8.1m.main-none-unknown-eabihf"
+
+  define <4 x i32> @test(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) #0 !dbg !5 {
+  entry:
+    call void @llvm.dbg.value(metadata <4 x i32> %x, metadata !17, metadata !DIExpression()), !dbg !21
+    call void @llvm.dbg.value(metadata <4 x i32> %y, metadata !18, metadata !DIExpression()), !dbg !21
+    call void @llvm.dbg.value(metadata <4 x i32> %z, metadata !19, metadata !DIExpression()), !dbg !21
+    %0 = icmp sle <4 x i32> %x, %y, !dbg !22
+    call void @llvm.dbg.value(metadata i32 undef, metadata !20, metadata !DIExpression()), !dbg !21
+    %1 = tail call <4 x i32> @llvm.arm.mve.add.predicated.v4i32.v4i1(<4 x i32> %x, <4 x i32> %y, <4 x i1> %0, <4 x i32> %z), !dbg !23
+    call void @llvm.dbg.value(metadata <4 x i32> %1, metadata !19, metadata !DIExpression()), !dbg !21
+    %2 = icmp sgt <4 x i32> %x, %y, !dbg !24
+    call void @llvm.dbg.value(metadata i32 undef, metadata !20, metadata !DIExpression()), !dbg !21
+    %3 = tail call <4 x i32> @llvm.arm.mve.add.predicated.v4i32.v4i1(<4 x i32> %x, <4 x i32> %y, <4 x i1> %2, <4 x i32> %1), !dbg !25
+    call void @llvm.dbg.value(metadata <4 x i32> %3, metadata !19, metadata !DIExpression()), !dbg !21
+    ret <4 x i32> %3, !dbg !26
+  }
+
+  declare <4 x i32> @llvm.arm.mve.add.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>) #1
+  declare void @llvm.dbg.value(metadata, metadata, metadata) #2
+
+  attributes #0 = { "target-features"="+fullfp16,+lob,+mve.fp" }
+  attributes #1 = { nounwind readnone "target-features"="+fullfp16,+lob,+mve.fp" }
+  attributes #2 = { nofree nosync nounwind readnone speculatable willreturn "target-features"="+fullfp16,+lob,+mve.fp" }
+
+  !llvm.dbg.cu = !{!0}
+  !llvm.module.flags = !{!3, !4}
+
+  !0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 13.0.0 (https://github.com/llvm/llvm-project 921572a18dc9b97c259bda2ce8130f04b2ebe3ed)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, splitDebugInlining: false, nameTableKind: None)
+  !1 = !DIFile(filename: "tmp.c", directory: "/work/llvm-project/build")
+  !2 = !{}
+  !3 = !{i32 7, !"Dwarf Version", i32 4}
+  !4 = !{i32 2, !"Debug Info Version", i32 3}
+  !5 = distinct !DISubprogram(name: "test", scope: !1, file: !1, line: 2, type: !6, scopeLine: 2, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !16)
+  !6 = !DISubroutineType(types: !7)
+  !7 = !{!8, !8, !8, !8}
+  !8 = !DIDerivedType(tag: DW_TAG_typedef, name: "int32x4_t", file: !9, line: 28, baseType: !10)
+  !9 = !DIFile(filename: "lib/clang/13.0.0/include/arm_mve.h", directory: "/work/llvm-project/build")
+  !10 = !DICompositeType(tag: DW_TAG_array_type, baseType: !11, size: 128, flags: DIFlagVector, elements: !14)
+  !11 = !DIDerivedType(tag: DW_TAG_typedef, name: "int32_t", file: !12, line: 58, baseType: !13)
+  !12 = !DIFile(filename: "pb-rel/testabletools/include/stdint.h", directory: "/work")
+  !13 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+  !14 = !{!15}
+  !15 = !DISubrange(count: 4)
+  !16 = !{!17, !18, !19, !20}
+  !17 = !DILocalVariable(name: "x", arg: 1, scope: !5, file: !1, line: 2, type: !8)
+  !18 = !DILocalVariable(name: "y", arg: 2, scope: !5, file: !1, line: 2, type: !8)
+  !19 = !DILocalVariable(name: "z", arg: 3, scope: !5, file: !1, line: 2, type: !8)
+  !20 = !DILocalVariable(name: "p", scope: !5, file: !1, line: 3, type: !13)
+  !21 = !DILocation(line: 0, scope: !5)
+  !22 = !DILocation(line: 3, column: 11, scope: !5)
+  !23 = !DILocation(line: 4, column: 7, scope: !5)
+  !24 = !DILocation(line: 5, column: 7, scope: !5)
+  !25 = !DILocation(line: 6, column: 7, scope: !5)
+  !26 = !DILocation(line: 7, column: 3, scope: !5)
+
+...
+---
+name:            test
+tracksRegLiveness: true
+liveins:
+  - { reg: '$q0', virtual-reg: '' }
+  - { reg: '$q1', virtual-reg: '' }
+  - { reg: '$q2', virtual-reg: '' }
+body:             |
+  bb.0.entry:
+    liveins: $q0, $q1, $q2
+
+    ; CHECK-LABEL: name: test
+    ; CHECK: liveins: $q0, $q1, $q2
+    ; CHECK: DBG_VALUE $q0, $noreg, !17, !DIExpression(), debug-location !21
+    ; CHECK: DBG_VALUE $q0, $noreg, !17, !DIExpression(), debug-location !21
+    ; CHECK: DBG_VALUE $q1, $noreg, !18, !DIExpression(), debug-location !21
+    ; CHECK: DBG_VALUE $q1, $noreg, !18, !DIExpression(), debug-location !21
+    ; CHECK: DBG_VALUE $q2, $noreg, !19, !DIExpression(), debug-location !21
+    ; CHECK: DBG_VALUE $q2, $noreg, !19, !DIExpression(), debug-location !21
+    ; CHECK: DBG_VALUE $noreg, $noreg, !20, !DIExpression(), debug-location !21
+    ; CHECK: BUNDLE implicit-def $vpr, implicit-def $q2, implicit-def $d4, implicit-def $s8, implicit-def $s9, implicit-def $d5, implicit-def $s10, implicit-def $s11, implicit $q1, implicit $q0, implicit killed $q2, debug-location !23 {
+    ; CHECK:   MVE_VPTv4s32 8, renamable $q1, renamable $q0, 10, implicit-def $vpr, debug-location !23
+    ; CHECK:   renamable $q2 = MVE_VADDi32 renamable $q0, renamable $q1, 1, internal renamable $vpr, killed renamable $q2, debug-location !23
+    ; CHECK: }
+    ; CHECK: DBG_VALUE $noreg, $noreg, !20, !DIExpression(), debug-location !21
+    ; CHECK: DBG_VALUE $q2, $noreg, !19, !DIExpression(), debug-location !21
+    ; CHECK: renamable $vpr = MVE_VPNOT killed renamable $vpr, 0, $noreg, debug-location !24
+    ; CHECK: BUNDLE implicit-def $q2, implicit-def $d4, implicit-def $s8, implicit-def $s9, implicit-def $d5, implicit-def $s10, implicit-def $s11, implicit killed $vpr, implicit killed $q0, implicit killed $q1, implicit killed $q2, debug-location !25 {
+    ; CHECK:   MVE_VPST 8, implicit $vpr, debug-location !25
+    ; CHECK:   renamable $q2 = MVE_VADDi32 killed renamable $q0, killed renamable $q1, 1, killed renamable $vpr, killed renamable $q2, debug-location !25
+    ; CHECK: }
+    ; CHECK: DBG_VALUE $q2, $noreg, !19, !DIExpression(), debug-location !21
+    ; CHECK: $q0 = MVE_VORR killed $q2, killed $q2, 0, $noreg, undef $q0, debug-location !26
+    ; CHECK: tBX_RET 14 /* CC::al */, $noreg, implicit $q0, debug-location !26
+    DBG_VALUE $q0, $noreg, !17, !DIExpression(), debug-location !21
+    DBG_VALUE $q0, $noreg, !17, !DIExpression(), debug-location !21
+    DBG_VALUE $q1, $noreg, !18, !DIExpression(), debug-location !21
+    DBG_VALUE $q1, $noreg, !18, !DIExpression(), debug-location !21
+    DBG_VALUE $q2, $noreg, !19, !DIExpression(), debug-location !21
+    DBG_VALUE $q2, $noreg, !19, !DIExpression(), debug-location !21
+    renamable $vpr = MVE_VCMPs32 renamable $q1, renamable $q0, 10, 0, $noreg, debug-location !22
+    DBG_VALUE $noreg, $noreg, !20, !DIExpression(), debug-location !21
+    renamable $q2 = MVE_VADDi32 renamable $q0, renamable $q1, 1, renamable $vpr, killed renamable $q2, debug-location !23
+    DBG_VALUE $noreg, $noreg, !20, !DIExpression(), debug-location !21
+    DBG_VALUE $q2, $noreg, !19, !DIExpression(), debug-location !21
+    renamable $vpr = MVE_VPNOT killed renamable $vpr, 0, $noreg, debug-location !24
+    renamable $q2 = MVE_VADDi32 killed renamable $q0, killed renamable $q1, 1, killed renamable $vpr, killed renamable $q2, debug-location !25
+    DBG_VALUE $q2, $noreg, !19, !DIExpression(), debug-location !21
+    $q0 = MVE_VORR killed $q2, killed $q2, 0, $noreg, undef $q0, debug-location !26
+    tBX_RET 14 /* CC::al */, $noreg, implicit $q0, debug-location !26
+
+...

From c7381b628d63b4423617163ee1116e64f49d2265 Mon Sep 17 00:00:00 2001
From: David Green <david.green@arm.com>
Date: Thu, 10 Jun 2021 14:49:04 +0100
Subject: [PATCH 310/318] [ARM] Skip debug during vpt block creation

Debug info is currently preventing VPT block creation, leading to
different codegen. This patch attempts to skip any debug instructions
during vpt block creation, making sure they do not interfere.

Differential Revision: https://reviews.llvm.org/D103610
---
 llvm/lib/Target/ARM/MVEVPTBlockPass.cpp          | 13 ++++++++++++-
 llvm/test/CodeGen/Thumb2/mve-vpt-block-debug.mir | 16 ++++++----------
 2 files changed, 18 insertions(+), 11 deletions(-)

diff --git a/llvm/lib/Target/ARM/MVEVPTBlockPass.cpp b/llvm/lib/Target/ARM/MVEVPTBlockPass.cpp
index 9a710b784fd1..c7f451cba14f 100644
--- a/llvm/lib/Target/ARM/MVEVPTBlockPass.cpp
+++ b/llvm/lib/Target/ARM/MVEVPTBlockPass.cpp
@@ -107,6 +107,12 @@ static bool StepOverPredicatedInstrs(MachineBasicBlock::instr_iterator &Iter,
   NumInstrsSteppedOver = 0;
 
   while (Iter != EndIter) {
+    if (Iter->isDebugInstr()) {
+      // Skip debug instructions
+      ++Iter;
+      continue;
+    }
+
     NextPred = getVPTInstrPredicate(*Iter, PredReg);
     assert(NextPred != ARMVCC::Else &&
            "VPT block pass does not expect Else preds");
@@ -170,6 +176,8 @@ CreateVPTBlock(MachineBasicBlock::instr_iterator &Iter,
   LLVM_DEBUG(for (MachineBasicBlock::instr_iterator AddedInstIter =
                       std::next(BlockBeg);
                   AddedInstIter != Iter; ++AddedInstIter) {
+    if (AddedInstIter->isDebugInstr())
+      continue;
     dbgs() << "  adding: ";
     AddedInstIter->dump();
   });
@@ -197,7 +205,7 @@ CreateVPTBlock(MachineBasicBlock::instr_iterator &Iter,
     if (!IsVPRDefinedOrKilledByBlock(Iter, VPNOTBlockEndIter))
       break;
 
-    LLVM_DEBUG(dbgs() << "  removing VPNOT: "; Iter->dump(););
+    LLVM_DEBUG(dbgs() << "  removing VPNOT: "; Iter->dump());
 
     // Record the new size of the block
     BlockSize += ElseInstCnt;
@@ -211,6 +219,9 @@ CreateVPTBlock(MachineBasicBlock::instr_iterator &Iter,
     // Note that we are using "Iter" to iterate over the block so we can update
     // it at the same time.
     for (; Iter != VPNOTBlockEndIter; ++Iter) {
+      if (Iter->isDebugInstr())
+        continue;
+
       // Find the register in which the predicate is
       int OpIdx = findFirstVPTPredOperandIdx(*Iter);
       assert(OpIdx != -1);
diff --git a/llvm/test/CodeGen/Thumb2/mve-vpt-block-debug.mir b/llvm/test/CodeGen/Thumb2/mve-vpt-block-debug.mir
index cb36f48934f1..ce540e506131 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vpt-block-debug.mir
+++ b/llvm/test/CodeGen/Thumb2/mve-vpt-block-debug.mir
@@ -80,18 +80,14 @@ body:             |
     ; CHECK: DBG_VALUE $q2, $noreg, !19, !DIExpression(), debug-location !21
     ; CHECK: DBG_VALUE $q2, $noreg, !19, !DIExpression(), debug-location !21
     ; CHECK: DBG_VALUE $noreg, $noreg, !20, !DIExpression(), debug-location !21
-    ; CHECK: BUNDLE implicit-def $vpr, implicit-def $q2, implicit-def $d4, implicit-def $s8, implicit-def $s9, implicit-def $d5, implicit-def $s10, implicit-def $s11, implicit $q1, implicit $q0, implicit killed $q2, debug-location !23 {
-    ; CHECK:   MVE_VPTv4s32 8, renamable $q1, renamable $q0, 10, implicit-def $vpr, debug-location !23
+    ; CHECK: BUNDLE implicit-def dead $vpr, implicit-def $q2, implicit-def $d4, implicit-def $s8, implicit-def $s9, implicit-def $d5, implicit-def $s10, implicit-def $s11, implicit killed $q1, implicit killed $q0, implicit killed $q2, debug-location !23 {
+    ; CHECK:   MVE_VPTv4s32 12, renamable $q1, renamable $q0, 10, implicit-def $vpr, debug-location !23
     ; CHECK:   renamable $q2 = MVE_VADDi32 renamable $q0, renamable $q1, 1, internal renamable $vpr, killed renamable $q2, debug-location !23
+    ; CHECK:   DBG_VALUE $noreg, $noreg, !20, !DIExpression(), debug-location !21
+    ; CHECK:   DBG_VALUE internal $q2, $noreg, !19, !DIExpression(), debug-location !21
+    ; CHECK:   renamable $q2 = MVE_VADDi32 killed renamable $q0, killed renamable $q1, 2, internal killed renamable $vpr, internal killed renamable $q2, debug-location !25
+    ; CHECK:   DBG_VALUE internal $q2, $noreg, !19, !DIExpression(), debug-location !21
     ; CHECK: }
-    ; CHECK: DBG_VALUE $noreg, $noreg, !20, !DIExpression(), debug-location !21
-    ; CHECK: DBG_VALUE $q2, $noreg, !19, !DIExpression(), debug-location !21
-    ; CHECK: renamable $vpr = MVE_VPNOT killed renamable $vpr, 0, $noreg, debug-location !24
-    ; CHECK: BUNDLE implicit-def $q2, implicit-def $d4, implicit-def $s8, implicit-def $s9, implicit-def $d5, implicit-def $s10, implicit-def $s11, implicit killed $vpr, implicit killed $q0, implicit killed $q1, implicit killed $q2, debug-location !25 {
-    ; CHECK:   MVE_VPST 8, implicit $vpr, debug-location !25
-    ; CHECK:   renamable $q2 = MVE_VADDi32 killed renamable $q0, killed renamable $q1, 1, killed renamable $vpr, killed renamable $q2, debug-location !25
-    ; CHECK: }
-    ; CHECK: DBG_VALUE $q2, $noreg, !19, !DIExpression(), debug-location !21
     ; CHECK: $q0 = MVE_VORR killed $q2, killed $q2, 0, $noreg, undef $q0, debug-location !26
     ; CHECK: tBX_RET 14 /* CC::al */, $noreg, implicit $q0, debug-location !26
     DBG_VALUE $q0, $noreg, !17, !DIExpression(), debug-location !21

From a3543fd9d47054596fed913b8ddc68285200c821 Mon Sep 17 00:00:00 2001
From: Victor Campos <victor.campos@arm.com>
Date: Fri, 19 Mar 2021 11:19:32 +0000
Subject: [PATCH 311/318] [ARM] Handle debug instrs in ARM Low Overhead Loop
 pass

In function ConvertVPTBlocks(), it is assumed that every instruction
within a vector-predicated block is predicated. This is false for debug
instructions, used by LLVM.

Because of this, an assertion failure is reached when an input contains
debug instructions inside VPT blocks. In non-assert builds, an out of
bounds memory access took place.

The present patch properly covers the case of debug instructions.

Reviewed By: dmgreen

Differential Revision: https://reviews.llvm.org/D99075
---
 llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp   |  20 +-
 .../LowOverheadLoops/skip-vpt-debug.mir       | 330 ++++++++++++++++++
 2 files changed, 343 insertions(+), 7 deletions(-)
 create mode 100644 llvm/test/CodeGen/Thumb2/LowOverheadLoops/skip-vpt-debug.mir

diff --git a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
index 61a924078f29..8dc532058492 100644
--- a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
+++ b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
@@ -1467,14 +1467,15 @@ MachineInstr* ARMLowOverheadLoops::ExpandLoopStart(LowOverheadLoop &LoLoop) {
 
 void ARMLowOverheadLoops::ConvertVPTBlocks(LowOverheadLoop &LoLoop) {
   auto RemovePredicate = [](MachineInstr *MI) {
+    if (MI->isDebugInstr())
+      return;
     LLVM_DEBUG(dbgs() << "ARM Loops: Removing predicate from: " << *MI);
-    if (int PIdx = llvm::findFirstVPTPredOperandIdx(*MI)) {
-      assert(MI->getOperand(PIdx).getImm() == ARMVCC::Then &&
-             "Expected Then predicate!");
-      MI->getOperand(PIdx).setImm(ARMVCC::None);
-      MI->getOperand(PIdx+1).setReg(0);
-    } else
-      llvm_unreachable("trying to unpredicate a non-predicated instruction");
+    int PIdx = llvm::findFirstVPTPredOperandIdx(*MI);
+    assert(PIdx >= 1 && "Trying to unpredicate a non-predicated instruction");
+    assert(MI->getOperand(PIdx).getImm() == ARMVCC::Then &&
+           "Expected Then predicate!");
+    MI->getOperand(PIdx).setImm(ARMVCC::None);
+    MI->getOperand(PIdx + 1).setReg(0);
   };
 
   for (auto &Block : LoLoop.getVPTBlocks()) {
@@ -1518,8 +1519,13 @@ void ARMLowOverheadLoops::ConvertVPTBlocks(LowOverheadLoop &LoLoop) {
         // - Insert a new vpst to predicate the instruction(s) that following
         //   the divergent vpr def.
         MachineInstr *Divergent = VPTState::getDivergent(Block);
+        MachineBasicBlock *MBB = Divergent->getParent();
         auto DivergentNext = ++MachineBasicBlock::iterator(Divergent);
+        while (DivergentNext != MBB->end() && DivergentNext->isDebugInstr())
+          ++DivergentNext;
+
         bool DivergentNextIsPredicated =
+            DivergentNext != MBB->end() &&
             getVPTInstrPredicate(*DivergentNext) != ARMVCC::None;
 
         for (auto I = ++MachineBasicBlock::iterator(VPST), E = DivergentNext;
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/skip-vpt-debug.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/skip-vpt-debug.mir
new file mode 100644
index 000000000000..8637ab3f1856
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/skip-vpt-debug.mir
@@ -0,0 +1,330 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -run-pass=arm-low-overhead-loops %s -o - --verify-machineinstrs | FileCheck %s
+--- |
+  ; ModuleID = 'skip-vpt-debug.ll'
+  source_filename = "skip-vpt-debug.c"
+  target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
+  target triple = "thumbv8.1m.main-arm-none-eabihf"
+
+  ; Function Attrs: nofree norecurse nounwind optsize
+  define hidden void @arm_max_no_idx_f32(float* nocapture readonly %pSrc, i32 %blockSize, float* nocapture %pResult) local_unnamed_addr #0 !dbg !13 {
+  entry:
+    call void @llvm.dbg.value(metadata float* %pSrc, metadata !24, metadata !DIExpression()), !dbg !29
+    call void @llvm.dbg.value(metadata i32 %blockSize, metadata !25, metadata !DIExpression()), !dbg !29
+    call void @llvm.dbg.value(metadata float* %pResult, metadata !26, metadata !DIExpression()), !dbg !29
+    call void @llvm.dbg.value(metadata float 0x3810000000000000, metadata !27, metadata !DIExpression()), !dbg !29
+    %cmp.not7 = icmp eq i32 %blockSize, 0, !dbg !30
+    br i1 %cmp.not7, label %while.end, label %vector.ph, !dbg !31
+
+  vector.ph:                                        ; preds = %entry
+    %n.rnd.up = add i32 %blockSize, 3, !dbg !31
+    %n.vec = and i32 %n.rnd.up, -4, !dbg !31
+    %0 = add i32 %n.vec, -4, !dbg !31
+    %1 = lshr i32 %0, 2, !dbg !31
+    %2 = add nuw nsw i32 %1, 1, !dbg !31
+    %3 = call i32 @llvm.start.loop.iterations.i32(i32 %2), !dbg !31
+    br label %vector.body, !dbg !31
+
+  vector.body:                                      ; preds = %vector.body, %vector.ph
+    %lsr.iv1 = phi float* [ %scevgep, %vector.body ], [ %pSrc, %vector.ph ]
+    %vec.phi = phi <4 x float> [ <float 0x3810000000000000, float 0x3810000000000000, float 0x3810000000000000, float 0x3810000000000000>, %vector.ph ], [ %10, %vector.body ]
+    %4 = phi i32 [ %3, %vector.ph ], [ %11, %vector.body ]
+    %5 = phi i32 [ %blockSize, %vector.ph ], [ %7, %vector.body ]
+    %lsr.iv12 = bitcast float* %lsr.iv1 to <4 x float>*
+    %6 = call <4 x i1> @llvm.arm.mve.vctp32(i32 %5)
+    %7 = sub i32 %5, 4
+    %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %lsr.iv12, i32 4, <4 x i1> %6, <4 x float> poison), !dbg !32, !tbaa !34
+    %8 = fcmp nnan ninf nsz olt <4 x float> %vec.phi, %wide.masked.load, !dbg !38
+    %9 = and <4 x i1> %6, %8, !dbg !40
+    %10 = select <4 x i1> %9, <4 x float> %wide.masked.load, <4 x float> %vec.phi, !dbg !40
+    %scevgep = getelementptr float, float* %lsr.iv1, i32 4
+    %11 = call i32 @llvm.loop.decrement.reg.i32(i32 %4, i32 1)
+    %12 = icmp ne i32 %11, 0
+    br i1 %12, label %vector.body, label %middle.block, !llvm.loop !41
+
+  middle.block:                                     ; preds = %vector.body
+    %13 = call nnan ninf nsz float @llvm.vector.reduce.fmax.v4f32(<4 x float> %10), !dbg !31
+    br label %while.end, !dbg !45
+
+  while.end:                                        ; preds = %middle.block, %entry
+    %maxValue.0.lcssa = phi float [ 0x3810000000000000, %entry ], [ %13, %middle.block ], !dbg !29
+    store float %maxValue.0.lcssa, float* %pResult, align 4, !dbg !45, !tbaa !34
+    ret void, !dbg !46
+  }
+
+  ; Function Attrs: nofree nosync nounwind readnone speculatable willreturn
+  declare void @llvm.dbg.value(metadata, metadata, metadata) #1
+
+  ; Function Attrs: nofree nosync nounwind readnone willreturn
+  declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32) #2
+
+  ; Function Attrs: argmemonly nofree nosync nounwind readonly willreturn
+  declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32 immarg, <4 x i1>, <4 x float>) #3
+
+  ; Function Attrs: nofree nosync nounwind readnone willreturn
+  declare float @llvm.vector.reduce.fmax.v4f32(<4 x float>) #2
+
+  ; Function Attrs: noduplicate nofree nosync nounwind willreturn
+  declare i32 @llvm.start.loop.iterations.i32(i32) #4
+
+  ; Function Attrs: noduplicate nofree nosync nounwind willreturn
+  declare i32 @llvm.loop.decrement.reg.i32(i32, i32) #4
+
+  ; Function Attrs: nounwind readnone
+  declare <4 x i1> @llvm.arm.mve.vctp32(i32) #5
+
+  attributes #0 = { nofree norecurse nounwind optsize "denormal-fp-math"="preserve-sign,preserve-sign" "denormal-fp-math-f32"="ieee,ieee" "frame-pointer"="none" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-m55" "target-features"="+armv8.1-m.main,+dsp,+fp-armv8d16,+fp-armv8d16sp,+fp16,+fp64,+fullfp16,+hwdiv,+lob,+mve,+mve.fp,+ras,+thumb-mode,+vfp2,+vfp2sp,+vfp3d16,+vfp3d16sp,+vfp4d16,+vfp4d16sp,-aes,-bf16,-cdecp0,-cdecp1,-cdecp2,-cdecp3,-cdecp4,-cdecp5,-cdecp6,-cdecp7,-crc,-crypto,-dotprod,-fp16fml,-hwdiv-arm,-i8mm,-sb,-sha2" }
+  attributes #1 = { nofree nosync nounwind readnone speculatable willreturn }
+  attributes #2 = { nofree nosync nounwind readnone willreturn }
+  attributes #3 = { argmemonly nofree nosync nounwind readonly willreturn }
+  attributes #4 = { noduplicate nofree nosync nounwind willreturn }
+  attributes #5 = { nounwind readnone }
+
+  !llvm.dbg.cu = !{!0}
+  !llvm.module.flags = !{!3, !4, !5, !6, !7, !8, !9, !10, !11}
+  !llvm.ident = !{!12}
+
+  !0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "Component: ARM Compiler 6.17.0.0 (permissive) Tool: armclang [00000000]", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, splitDebugInlining: false, nameTableKind: None)
+  !1 = !DIFile(filename: "skip-vpt-debug.c", directory: "/home/vicspe01")
+  !2 = !{}
+  !3 = !{i32 7, !"Dwarf Version", i32 4}
+  !4 = !{i32 2, !"Debug Info Version", i32 3}
+  !5 = !{i32 1, !"wchar_size", i32 4}
+  !6 = !{i32 1, !"static_rwdata", i32 1}
+  !7 = !{i32 1, !"enumsize_buildattr", i32 2}
+  !8 = !{i32 1, !"armlib_unavailable", i32 0}
+  !9 = !{i32 1, !"branch-target-enforcement", i32 0}
+  !10 = !{i32 1, !"sign-return-address", i32 0}
+  !11 = !{i32 1, !"sign-return-address-all", i32 0}
+  !12 = !{!"Component: ARM Compiler 6.17.0.0 (permissive) Tool: armclang [00000000]"}
+  !13 = distinct !DISubprogram(name: "arm_max_no_idx_f32", scope: !1, file: !1, line: 5, type: !14, scopeLine: 6, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !23)
+  !14 = !DISubroutineType(types: !15)
+  !15 = !{null, !16, !20, !22}
+  !16 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !17, size: 32)
+  !17 = !DIDerivedType(tag: DW_TAG_const_type, baseType: !18)
+  !18 = !DIDerivedType(tag: DW_TAG_typedef, name: "float32_t", file: !1, line: 1, baseType: !19)
+  !19 = !DIBasicType(name: "float", size: 32, encoding: DW_ATE_float)
+  !20 = !DIDerivedType(tag: DW_TAG_typedef, name: "uint32_t", file: !1, line: 2, baseType: !21)
+  !21 = !DIBasicType(name: "unsigned int", size: 32, encoding: DW_ATE_unsigned)
+  !22 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !18, size: 32)
+  !23 = !{!24, !25, !26, !27, !28}
+  !24 = !DILocalVariable(name: "pSrc", arg: 1, scope: !13, file: !1, line: 5, type: !16)
+  !25 = !DILocalVariable(name: "blockSize", arg: 2, scope: !13, file: !1, line: 5, type: !20)
+  !26 = !DILocalVariable(name: "pResult", arg: 3, scope: !13, file: !1, line: 6, type: !22)
+  !27 = !DILocalVariable(name: "maxValue", scope: !13, file: !1, line: 7, type: !18)
+  !28 = !DILocalVariable(name: "newVal", scope: !13, file: !1, line: 8, type: !18)
+  !29 = !DILocation(line: 0, scope: !13)
+  !30 = !DILocation(line: 10, column: 20, scope: !13)
+  !31 = !DILocation(line: 10, column: 3, scope: !13)
+  !32 = !DILocation(line: 11, column: 14, scope: !33)
+  !33 = distinct !DILexicalBlock(scope: !13, file: !1, line: 10, column: 26)
+  !34 = !{!35, !35, i64 0}
+  !35 = !{!"float", !36, i64 0}
+  !36 = !{!"omnipotent char", !37, i64 0}
+  !37 = !{!"Simple C/C++ TBAA"}
+  !38 = !DILocation(line: 12, column: 18, scope: !39)
+  !39 = distinct !DILexicalBlock(scope: !33, file: !1, line: 12, column: 9)
+  !40 = !DILocation(line: 12, column: 9, scope: !33)
+  !41 = distinct !{!41, !31, !42, !43, !44}
+  !42 = !DILocation(line: 15, column: 3, scope: !13)
+  !43 = !{!"llvm.loop.mustprogress"}
+  !44 = !{!"llvm.loop.isvectorized", i32 1}
+  !45 = !DILocation(line: 16, column: 12, scope: !13)
+  !46 = !DILocation(line: 17, column: 1, scope: !13)
+
+...
+---
+name:            arm_max_no_idx_f32
+alignment:       4
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+failedISel:      false
+tracksRegLiveness: true
+hasWinCFI:       false
+registers:       []
+liveins:
+  - { reg: '$r0', virtual-reg: '' }
+  - { reg: '$r1', virtual-reg: '' }
+  - { reg: '$r2', virtual-reg: '' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       8
+  offsetAdjustment: 0
+  maxAlignment:    4
+  adjustsStack:    false
+  hasCalls:        false
+  stackProtector:  ''
+  maxCallFrameSize: 0
+  cvBytesOfCalleeSavedRegisters: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+  localFrameSize:  0
+  savePoint:       ''
+  restorePoint:    ''
+fixedStack:      []
+stack:
+  - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4,
+      stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4,
+      stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+callSites:       []
+debugValueSubstitutions: []
+constants:
+  - id:              0
+    value:           float 0x3810000000000000
+    alignment:       4
+    isTargetSpecific: false
+machineFunctionInfo: {}
+body:             |
+  ; CHECK-LABEL: name: arm_max_no_idx_f32
+  ; CHECK: bb.0.entry:
+  ; CHECK:   successors: %bb.4(0x30000000), %bb.1(0x50000000)
+  ; CHECK:   liveins: $lr, $r0, $r1, $r2, $r7
+  ; CHECK:   DBG_VALUE $r0, $noreg, !24, !DIExpression(), debug-location !29
+  ; CHECK:   DBG_VALUE $r1, $noreg, !25, !DIExpression(), debug-location !29
+  ; CHECK:   DBG_VALUE $r1, $noreg, !25, !DIExpression(), debug-location !29
+  ; CHECK:   DBG_VALUE $r2, $noreg, !26, !DIExpression(), debug-location !29
+  ; CHECK:   DBG_VALUE $r2, $noreg, !26, !DIExpression(), debug-location !29
+  ; CHECK:   frame-setup tPUSH 14 /* CC::al */, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp
+  ; CHECK:   frame-setup CFI_INSTRUCTION def_cfa_offset 8
+  ; CHECK:   frame-setup CFI_INSTRUCTION offset $lr, -4
+  ; CHECK:   frame-setup CFI_INSTRUCTION offset $r7, -8
+  ; CHECK:   DBG_VALUE float 0x3810000000000000, $noreg, !27, !DIExpression(), debug-location !29
+  ; CHECK:   DBG_VALUE $r0, $noreg, !24, !DIExpression(), debug-location !29
+  ; CHECK:   tCBZ renamable $r1, %bb.4, debug-location !31
+  ; CHECK: bb.1.vector.ph:
+  ; CHECK:   successors: %bb.2(0x80000000)
+  ; CHECK:   liveins: $r0, $r1, $r2
+  ; CHECK:   DBG_VALUE float 0x3810000000000000, $noreg, !27, !DIExpression(), debug-location !29
+  ; CHECK:   DBG_VALUE $r2, $noreg, !26, !DIExpression(), debug-location !29
+  ; CHECK:   DBG_VALUE $r1, $noreg, !25, !DIExpression(), debug-location !29
+  ; CHECK:   DBG_VALUE $r0, $noreg, !24, !DIExpression(), debug-location !29
+  ; CHECK:   renamable $q0 = MVE_VMOVimmi32 1152, 0, $noreg, undef renamable $q0
+  ; CHECK:   $lr = MVE_DLSTP_32 killed renamable $r1, debug-location !31
+  ; CHECK: bb.2.vector.body (align 4):
+  ; CHECK:   successors: %bb.2(0x7c000000), %bb.3(0x04000000)
+  ; CHECK:   liveins: $lr, $q0, $r0, $r2
+  ; CHECK:   DBG_VALUE float 0x3810000000000000, $noreg, !27, !DIExpression(), debug-location !29
+  ; CHECK:   DBG_VALUE $r2, $noreg, !26, !DIExpression(), debug-location !29
+  ; CHECK:   renamable $r0, renamable $q1 = MVE_VLDRWU32_post killed renamable $r0, 16, 0, $noreg, debug-location !32 :: (load 16 from %ir.lsr.iv12, align 4, !tbaa !34)
+  ; CHECK:   DBG_VALUE $r0, $noreg, !24, !DIExpression(DW_OP_LLVM_entry_value, 1), debug-location !29
+  ; CHECK:   MVE_VPTv4f32 8, renamable $q1, renamable $q0, 12, implicit-def $vpr, debug-location !40
+  ; CHECK:   renamable $q0 = MVE_VORR killed renamable $q1, killed renamable $q1, 1, killed renamable $vpr, killed renamable $q0, debug-location !40
+  ; CHECK:   DBG_VALUE $r1, $noreg, !25, !DIExpression(DW_OP_LLVM_entry_value, 1), debug-location !29
+  ; CHECK:   $lr = MVE_LETP killed renamable $lr, %bb.2
+  ; CHECK: bb.3.middle.block:
+  ; CHECK:   successors: %bb.5(0x80000000)
+  ; CHECK:   liveins: $q0, $r2
+  ; CHECK:   DBG_VALUE float 0x3810000000000000, $noreg, !27, !DIExpression(), debug-location !29
+  ; CHECK:   DBG_VALUE $r2, $noreg, !26, !DIExpression(), debug-location !29
+  ; CHECK:   renamable $s4 = nnan ninf nsz VFP_VMAXNMS renamable $s2, renamable $s3, debug-location !31
+  ; CHECK:   renamable $s0 = nnan ninf nsz VFP_VMAXNMS killed renamable $s0, killed renamable $s1, implicit killed $q0, debug-location !31
+  ; CHECK:   renamable $s0 = nnan ninf nsz VFP_VMAXNMS killed renamable $s0, killed renamable $s4, debug-location !31
+  ; CHECK:   tB %bb.5, 14 /* CC::al */, $noreg
+  ; CHECK: bb.4:
+  ; CHECK:   successors: %bb.5(0x80000000)
+  ; CHECK:   liveins: $r2
+  ; CHECK:   DBG_VALUE float 0x3810000000000000, $noreg, !27, !DIExpression(), debug-location !29
+  ; CHECK:   DBG_VALUE $r2, $noreg, !26, !DIExpression(), debug-location !29
+  ; CHECK:   DBG_VALUE $r1, $noreg, !25, !DIExpression(), debug-location !29
+  ; CHECK:   DBG_VALUE $r0, $noreg, !24, !DIExpression(), debug-location !29
+  ; CHECK:   renamable $s0 = VLDRS %const.0, 0, 14 /* CC::al */, $noreg :: (load 4 from constant-pool)
+  ; CHECK: bb.5.while.end:
+  ; CHECK:   liveins: $r2, $s0
+  ; CHECK:   DBG_VALUE float 0x3810000000000000, $noreg, !27, !DIExpression(), debug-location !29
+  ; CHECK:   DBG_VALUE $r2, $noreg, !26, !DIExpression(), debug-location !29
+  ; CHECK:   VSTRS killed renamable $s0, killed renamable $r2, 0, 14 /* CC::al */, $noreg, debug-location !45 :: (store 4 into %ir.pResult, !tbaa !34)
+  ; CHECK:   frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc, debug-location !46
+  ; CHECK: bb.6 (align 4):
+  ; CHECK:   CONSTPOOL_ENTRY 0, %const.0, 4
+  bb.0.entry:
+    successors: %bb.4(0x30000000), %bb.1(0x50000000)
+    liveins: $r0, $r1, $r2, $r7, $lr
+
+    DBG_VALUE $r0, $noreg, !24, !DIExpression(), debug-location !29
+    DBG_VALUE $r1, $noreg, !25, !DIExpression(), debug-location !29
+    DBG_VALUE $r1, $noreg, !25, !DIExpression(), debug-location !29
+    DBG_VALUE $r2, $noreg, !26, !DIExpression(), debug-location !29
+    DBG_VALUE $r2, $noreg, !26, !DIExpression(), debug-location !29
+    frame-setup tPUSH 14 /* CC::al */, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp
+    frame-setup CFI_INSTRUCTION def_cfa_offset 8
+    frame-setup CFI_INSTRUCTION offset $lr, -4
+    frame-setup CFI_INSTRUCTION offset $r7, -8
+    DBG_VALUE float 0x3810000000000000, $noreg, !27, !DIExpression(), debug-location !29
+    DBG_VALUE $r0, $noreg, !24, !DIExpression(), debug-location !29
+    tCBZ renamable $r1, %bb.4, debug-location !31
+
+  bb.1.vector.ph:
+    successors: %bb.2(0x80000000)
+    liveins: $r0, $r1, $r2
+
+    DBG_VALUE float 0x3810000000000000, $noreg, !27, !DIExpression(), debug-location !29
+    DBG_VALUE $r2, $noreg, !26, !DIExpression(), debug-location !29
+    DBG_VALUE $r1, $noreg, !25, !DIExpression(), debug-location !29
+    DBG_VALUE $r0, $noreg, !24, !DIExpression(), debug-location !29
+    renamable $r3, dead $cpsr = tADDi3 renamable $r1, 3, 14 /* CC::al */, $noreg, debug-location !31
+    renamable $r3 = t2BICri killed renamable $r3, 3, 14 /* CC::al */, $noreg, $noreg, debug-location !31
+    renamable $r12 = t2SUBri killed renamable $r3, 4, 14 /* CC::al */, $noreg, $noreg, debug-location !31
+    renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg
+    renamable $r3 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg, debug-location !31
+    renamable $q0 = MVE_VMOVimmi32 1152, 0, $noreg, undef renamable $q0
+    renamable $lr = t2DoLoopStartTP killed renamable $r3, renamable $r1, debug-location !31
+
+  bb.2.vector.body (align 4):
+    successors: %bb.2(0x7c000000), %bb.3(0x04000000)
+    liveins: $lr, $q0, $r0, $r1, $r2
+
+    DBG_VALUE float 0x3810000000000000, $noreg, !27, !DIExpression(), debug-location !29
+    DBG_VALUE $r2, $noreg, !26, !DIExpression(), debug-location !29
+    renamable $vpr = MVE_VCTP32 renamable $r1, 0, $noreg
+    MVE_VPST 2, implicit $vpr, debug-location !32
+    renamable $r0, renamable $q1 = MVE_VLDRWU32_post killed renamable $r0, 16, 1, renamable $vpr, debug-location !32 :: (load 16 from %ir.lsr.iv12, align 4, !tbaa !34)
+    DBG_VALUE $r0, $noreg, !24, !DIExpression(DW_OP_LLVM_entry_value, 1), debug-location !29
+    renamable $vpr = MVE_VCMPf32 renamable $q1, renamable $q0, 12, 1, killed renamable $vpr, debug-location !40
+    renamable $q0 = MVE_VORR killed renamable $q1, renamable $q1, 1, killed renamable $vpr, killed renamable $q0, debug-location !40
+    renamable $r1, dead $cpsr = tSUBi8 killed renamable $r1, 4, 14 /* CC::al */, $noreg
+    DBG_VALUE $r1, $noreg, !25, !DIExpression(DW_OP_LLVM_entry_value, 1), debug-location !29
+    renamable $lr = t2LoopEndDec killed renamable $lr, %bb.2, implicit-def dead $cpsr
+    tB %bb.3, 14 /* CC::al */, $noreg
+
+  bb.3.middle.block:
+    successors: %bb.5(0x80000000)
+    liveins: $q0, $r2
+
+    DBG_VALUE float 0x3810000000000000, $noreg, !27, !DIExpression(), debug-location !29
+    DBG_VALUE $r2, $noreg, !26, !DIExpression(), debug-location !29
+    renamable $s4 = nnan ninf nsz VFP_VMAXNMS renamable $s2, renamable $s3, debug-location !31
+    renamable $s0 = nnan ninf nsz VFP_VMAXNMS killed renamable $s0, killed renamable $s1, implicit $q0, debug-location !31
+    renamable $s0 = nnan ninf nsz VFP_VMAXNMS killed renamable $s0, killed renamable $s4, debug-location !31
+    tB %bb.5, 14 /* CC::al */, $noreg
+
+  bb.4:
+    successors: %bb.5(0x80000000)
+    liveins: $r2
+
+    DBG_VALUE float 0x3810000000000000, $noreg, !27, !DIExpression(), debug-location !29
+    DBG_VALUE $r2, $noreg, !26, !DIExpression(), debug-location !29
+    DBG_VALUE $r1, $noreg, !25, !DIExpression(), debug-location !29
+    DBG_VALUE $r0, $noreg, !24, !DIExpression(), debug-location !29
+    renamable $s0 = VLDRS %const.0, 0, 14 /* CC::al */, $noreg :: (load 4 from constant-pool)
+
+  bb.5.while.end:
+    liveins: $r2, $s0
+
+    DBG_VALUE float 0x3810000000000000, $noreg, !27, !DIExpression(), debug-location !29
+    DBG_VALUE $r2, $noreg, !26, !DIExpression(), debug-location !29
+    VSTRS killed renamable $s0, killed renamable $r2, 0, 14 /* CC::al */, $noreg, debug-location !45 :: (store 4 into %ir.pResult, !tbaa !34)
+    frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc, debug-location !46
+
+  bb.6 (align 4):
+    CONSTPOOL_ENTRY 0, %const.0, 4
+
+...

From 0f3fec4618e40f54ec2a042cb603cdedd253312c Mon Sep 17 00:00:00 2001
From: David Green <david.green@arm.com>
Date: Sun, 30 May 2021 18:02:14 +0100
Subject: [PATCH 312/318] [ARM] Guard against loop variant gather ptr operands

This ensures that the operands of any gather/scatter instructions that
we attempt to push out of the loop are invariant, preventing invalid IR
from being generated.
---
 .../Target/ARM/MVEGatherScatterLowering.cpp   |  3 +-
 .../Thumb2/mve-gather-optimisation-deep.ll    | 44 +++++++++++++++++++
 2 files changed, 46 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp b/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp
index 81f113b8302f..039f6f2053d8 100644
--- a/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp
+++ b/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp
@@ -960,7 +960,8 @@ bool MVEGatherScatterLowering::optimiseOffsets(Value *Offsets, BasicBlock *BB,
   // Get the value that is added to/multiplied with the phi
   Value *OffsSecondOperand = Offs->getOperand(OffsSecondOp);
 
-  if (IncrementPerRound->getType() != OffsSecondOperand->getType())
+  if (IncrementPerRound->getType() != OffsSecondOperand->getType() ||
+      !L->isLoopInvariant(OffsSecondOperand))
     // Something has gone wrong, abort
     return false;
 
diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-optimisation-deep.ll b/llvm/test/CodeGen/Thumb2/mve-gather-optimisation-deep.ll
index 4c5bcd836c37..4e971542bf75 100644
--- a/llvm/test/CodeGen/Thumb2/mve-gather-optimisation-deep.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-gather-optimisation-deep.ll
@@ -187,4 +187,48 @@ end:
   ret void;
 }
 
+define arm_aapcs_vfpcc void @invariant_add(i32* noalias nocapture readonly %data, i32* noalias nocapture %dst, i32 %n.vec) {
+; CHECK-LABEL: @invariant_add(
+; CHECK-NEXT:  vector.ph:
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 2, i32 4, i32 6>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[L0:%.*]] = mul <4 x i32> [[VEC_IND]], <i32 3, i32 3, i32 3, i32 3>
+; CHECK-NEXT:    [[L1:%.*]] = add <4 x i32> [[L0]], [[VEC_IND]]
+; CHECK-NEXT:    [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0i32.v4i32(i32* [[DATA:%.*]], <4 x i32> [[L1]], i32 32, i32 2, i32 1)
+; CHECK-NEXT:    [[L3:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i32 [[INDEX]]
+; CHECK-NEXT:    [[L4:%.*]] = bitcast i32* [[L3]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[TMP0]], <4 x i32>* [[L4]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 8, i32 8, i32 8, i32 8>
+; CHECK-NEXT:    [[L5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC:%.*]]
+; CHECK-NEXT:    br i1 [[L5]], label [[END:%.*]], label [[VECTOR_BODY]]
+; CHECK:       end:
+; CHECK-NEXT:    ret void
+;
+
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %vec.ind = phi <4 x i32> [ <i32 0, i32 2, i32 4, i32 6>, %vector.ph ], [ %vec.ind.next, %vector.body ]
+  %l0 = mul <4 x i32> %vec.ind, <i32 3, i32 3, i32 3, i32 3>
+  %l1 = add <4 x i32> %l0, %vec.ind
+  %l2 = getelementptr inbounds i32, i32* %data, <4 x i32> %l1
+  %wide.masked.gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %l2, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
+  %l3 = getelementptr inbounds i32, i32* %dst, i32 %index
+  %l4 = bitcast i32* %l3 to <4 x i32>*
+  store <4 x i32> %wide.masked.gather, <4 x i32>* %l4, align 4
+  %index.next = add i32 %index, 4
+  %vec.ind.next = add <4 x i32> %vec.ind, <i32 8, i32 8, i32 8, i32 8>
+  %l5 = icmp eq i32 %index.next, %n.vec
+  br i1 %l5, label %end, label %vector.body
+
+end:
+  ret void;
+}
+
+
 declare <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*>, i32, <4 x i1>, <4 x i32>)

From e2e2057132c1360d014235a087d4f678efc56420 Mon Sep 17 00:00:00 2001
From: David Green <david.green@arm.com>
Date: Mon, 26 Apr 2021 10:04:33 +0100
Subject: [PATCH 313/318] [ARM] Ensure loop invariant active.lane.mask operands

CGP can move instructions like a ptrtoint into a loop, but the
MVETailPredication when converting them will currently assume invariant
trip counts. This tries to ensure the operands are loop invariant, and
bails if not.

Differential Revision: https://reviews.llvm.org/D100550
---
 llvm/lib/Target/ARM/MVETailPredication.cpp    |   4 +
 .../Thumb2/mve-tailpred-loopinvariant.ll      | 145 ++++++++++++++++++
 2 files changed, 149 insertions(+)
 create mode 100644 llvm/test/CodeGen/Thumb2/mve-tailpred-loopinvariant.ll

diff --git a/llvm/lib/Target/ARM/MVETailPredication.cpp b/llvm/lib/Target/ARM/MVETailPredication.cpp
index b705208660df..cccac5595288 100644
--- a/llvm/lib/Target/ARM/MVETailPredication.cpp
+++ b/llvm/lib/Target/ARM/MVETailPredication.cpp
@@ -205,6 +205,10 @@ bool MVETailPredication::IsSafeActiveMask(IntrinsicInst *ActiveLaneMask,
     EnableTailPredication == TailPredication::ForceEnabled;
 
   Value *ElemCount = ActiveLaneMask->getOperand(1);
+  bool Changed = false;
+  if (!L->makeLoopInvariant(ElemCount, Changed))
+    return false;
+
   auto *EC= SE->getSCEV(ElemCount);
   auto *TC = SE->getSCEV(TripCount);
   int VectorWidth =
diff --git a/llvm/test/CodeGen/Thumb2/mve-tailpred-loopinvariant.ll b/llvm/test/CodeGen/Thumb2/mve-tailpred-loopinvariant.ll
new file mode 100644
index 000000000000..2bd3e51772bf
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/mve-tailpred-loopinvariant.ll
@@ -0,0 +1,145 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s
+
+; This test has an instruction that gets sunk into the loop, that is a
+; active.lane.mask operand. (%exitcount.ptrcnt.to.int = ptrtoint). We
+; need to make sure it is loop invariant.
+
+define i32 @a(i32* readnone %b, i8* %c) {
+; CHECK-LABEL: a:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r4, lr}
+; CHECK-NEXT:    push {r4, lr}
+; CHECK-NEXT:    cmp r0, r1
+; CHECK-NEXT:    it ls
+; CHECK-NEXT:    popls {r4, pc}
+; CHECK-NEXT:  .LBB0_1: @ %while.body.preheader
+; CHECK-NEXT:    subs r0, r0, r1
+; CHECK-NEXT:    movs r3, #1
+; CHECK-NEXT:    add.w r2, r0, #15
+; CHECK-NEXT:    mov r12, r1
+; CHECK-NEXT:    bic r2, r2, #15
+; CHECK-NEXT:    subs r2, #16
+; CHECK-NEXT:    add.w lr, r3, r2, lsr #4
+; CHECK-NEXT:    movs r2, #0
+; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:  .LBB0_2: @ %vector.body
+; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    adds r3, r1, r2
+; CHECK-NEXT:    vctp.8 r0
+; CHECK-NEXT:    vmov.8 q0[0], r3
+; CHECK-NEXT:    adds r4, r3, #1
+; CHECK-NEXT:    vmov.8 q0[1], r4
+; CHECK-NEXT:    adds r4, r3, #2
+; CHECK-NEXT:    vmov.8 q0[2], r4
+; CHECK-NEXT:    adds r4, r3, #3
+; CHECK-NEXT:    vmov.8 q0[3], r4
+; CHECK-NEXT:    adds r4, r3, #4
+; CHECK-NEXT:    vmov.8 q0[4], r4
+; CHECK-NEXT:    adds r4, r3, #5
+; CHECK-NEXT:    vmov.8 q0[5], r4
+; CHECK-NEXT:    adds r4, r3, #6
+; CHECK-NEXT:    vmov.8 q0[6], r4
+; CHECK-NEXT:    adds r4, r3, #7
+; CHECK-NEXT:    vmov.8 q0[7], r4
+; CHECK-NEXT:    add.w r4, r3, #8
+; CHECK-NEXT:    vmov.8 q0[8], r4
+; CHECK-NEXT:    add.w r4, r3, #9
+; CHECK-NEXT:    vmov.8 q0[9], r4
+; CHECK-NEXT:    add.w r4, r3, #10
+; CHECK-NEXT:    vmov.8 q0[10], r4
+; CHECK-NEXT:    add.w r4, r3, #11
+; CHECK-NEXT:    vmov.8 q0[11], r4
+; CHECK-NEXT:    add.w r4, r3, #12
+; CHECK-NEXT:    vmov.8 q0[12], r4
+; CHECK-NEXT:    add.w r4, r3, #13
+; CHECK-NEXT:    vmov.8 q0[13], r4
+; CHECK-NEXT:    add.w r4, r3, #14
+; CHECK-NEXT:    adds r2, #16
+; CHECK-NEXT:    subs r0, #16
+; CHECK-NEXT:    vmov.8 q0[14], r4
+; CHECK-NEXT:    adds r3, #15
+; CHECK-NEXT:    vmov.8 q0[15], r3
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vstrbt.8 q0, [r12], #16
+; CHECK-NEXT:    le lr, .LBB0_2
+; CHECK-NEXT:  @ %bb.3: @ %while.end
+; CHECK-NEXT:    pop {r4, pc}
+entry:
+  %0 = bitcast i32* %b to i8*
+  %cmp3 = icmp ugt i8* %0, %c
+  br i1 %cmp3, label %while.body.preheader, label %while.end
+
+while.body.preheader:                             ; preds = %entry
+  %c5 = ptrtoint i8* %c to i32
+  %1 = sub i32 0, %c5
+  %uglygep = getelementptr i8, i8* %0, i32 %1
+  %exitcount.ptrcnt.to.int = ptrtoint i8* %uglygep to i32
+  %n.rnd.up = add i32 %exitcount.ptrcnt.to.int, 15
+  %n.vec = and i32 %n.rnd.up, -16
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %while.body.preheader
+  %index = phi i32 [ 0, %while.body.preheader ], [ %index.next, %vector.body ]
+  %next.gep = getelementptr i8, i8* %c, i32 %index
+  %2 = or i32 %index, 1
+  %next.gep7 = getelementptr i8, i8* %c, i32 %2
+  %3 = or i32 %index, 2
+  %next.gep8 = getelementptr i8, i8* %c, i32 %3
+  %4 = or i32 %index, 3
+  %next.gep9 = getelementptr i8, i8* %c, i32 %4
+  %5 = or i32 %index, 4
+  %next.gep10 = getelementptr i8, i8* %c, i32 %5
+  %6 = or i32 %index, 5
+  %next.gep11 = getelementptr i8, i8* %c, i32 %6
+  %7 = or i32 %index, 6
+  %next.gep12 = getelementptr i8, i8* %c, i32 %7
+  %8 = or i32 %index, 7
+  %next.gep13 = getelementptr i8, i8* %c, i32 %8
+  %9 = or i32 %index, 8
+  %next.gep14 = getelementptr i8, i8* %c, i32 %9
+  %10 = or i32 %index, 9
+  %next.gep15 = getelementptr i8, i8* %c, i32 %10
+  %11 = or i32 %index, 10
+  %next.gep16 = getelementptr i8, i8* %c, i32 %11
+  %12 = or i32 %index, 11
+  %next.gep17 = getelementptr i8, i8* %c, i32 %12
+  %13 = or i32 %index, 12
+  %next.gep18 = getelementptr i8, i8* %c, i32 %13
+  %14 = or i32 %index, 13
+  %next.gep19 = getelementptr i8, i8* %c, i32 %14
+  %15 = or i32 %index, 14
+  %next.gep20 = getelementptr i8, i8* %c, i32 %15
+  %16 = or i32 %index, 15
+  %next.gep21 = getelementptr i8, i8* %c, i32 %16
+  %17 = insertelement <16 x i8*> poison, i8* %next.gep, i32 0
+  %18 = insertelement <16 x i8*> %17, i8* %next.gep7, i32 1
+  %19 = insertelement <16 x i8*> %18, i8* %next.gep8, i32 2
+  %20 = insertelement <16 x i8*> %19, i8* %next.gep9, i32 3
+  %21 = insertelement <16 x i8*> %20, i8* %next.gep10, i32 4
+  %22 = insertelement <16 x i8*> %21, i8* %next.gep11, i32 5
+  %23 = insertelement <16 x i8*> %22, i8* %next.gep12, i32 6
+  %24 = insertelement <16 x i8*> %23, i8* %next.gep13, i32 7
+  %25 = insertelement <16 x i8*> %24, i8* %next.gep14, i32 8
+  %26 = insertelement <16 x i8*> %25, i8* %next.gep15, i32 9
+  %27 = insertelement <16 x i8*> %26, i8* %next.gep16, i32 10
+  %28 = insertelement <16 x i8*> %27, i8* %next.gep17, i32 11
+  %29 = insertelement <16 x i8*> %28, i8* %next.gep18, i32 12
+  %30 = insertelement <16 x i8*> %29, i8* %next.gep19, i32 13
+  %31 = insertelement <16 x i8*> %30, i8* %next.gep20, i32 14
+  %32 = insertelement <16 x i8*> %31, i8* %next.gep21, i32 15
+  %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %exitcount.ptrcnt.to.int)
+  %33 = ptrtoint <16 x i8*> %32 to <16 x i32>
+  %34 = trunc <16 x i32> %33 to <16 x i8>
+  %35 = bitcast i8* %next.gep to <16 x i8>*
+  call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %34, <16 x i8>* %35, i32 1, <16 x i1> %active.lane.mask)
+  %index.next = add i32 %index, 16
+  %36 = icmp eq i32 %index.next, %n.vec
+  br i1 %36, label %while.end, label %vector.body
+
+while.end:                                        ; preds = %vector.body, %entry
+  ret i32 undef
+}
+
+declare <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32, i32)
+declare void @llvm.masked.store.v16i8.p0v16i8(<16 x i8>, <16 x i8>*, i32 immarg, <16 x i1>)

From 1a8f0b969c4e77e32fe88b9e5de257fe96a3307d Mon Sep 17 00:00:00 2001
From: David Green <david.green@arm.com>
Date: Sat, 22 May 2021 13:38:00 +0100
Subject: [PATCH 314/318] [ARM] Clean up some tests, removing dead
 instructions. NFC

---
 .../cond-vector-reduce-mve-codegen.ll         |  82 ++-------
 .../LowOverheadLoops/extending-loads.ll       |  44 -----
 .../Thumb2/LowOverheadLoops/fast-fp-loops.ll  |  22 ---
 .../LowOverheadLoops/mve-tail-data-types.ll   | 121 --------------
 .../CodeGen/Thumb2/LowOverheadLoops/nested.ll |  30 ----
 .../Thumb2/LowOverheadLoops/reductions.ll     |  10 --
 .../LowOverheadLoops/tail-pred-basic.ll       |  45 -----
 .../LowOverheadLoops/tail-pred-const.ll       |  40 -----
 .../tail-pred-disabled-in-loloops.ll          |   4 -
 .../tail-pred-intrinsic-add-sat.ll            |   2 -
 .../tail-pred-intrinsic-fabs.ll               |   1 -
 .../tail-pred-intrinsic-round.ll              |   6 -
 .../tail-pred-intrinsic-sub-sat.ll            |   2 -
 .../LowOverheadLoops/tail-pred-reduce.ll      |  25 ---
 .../LowOverheadLoops/tail-pred-widen.ll       |  28 +---
 .../varying-outer-2d-reduction.ll             |   1 -
 .../LowOverheadLoops/vector-arith-codegen.ll  |  89 +---------
 .../vector-reduce-mve-tail.ll                 |   9 -
 llvm/test/CodeGen/Thumb2/mve-fma-loops.ll     | 156 ++----------------
 .../CodeGen/Thumb2/mve-gather-increment.ll    |   2 -
 .../Thumb2/mve-gather-optimisation-deep.ll    |  29 ++--
 .../Thumb2/mve-gather-scatter-optimisation.ll |  11 --
 .../CodeGen/Thumb2/mve-vecreduce-loops.ll     |  15 --
 23 files changed, 39 insertions(+), 735 deletions(-)

diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll
index 29ecf00c556f..e36e219ebaf1 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll
@@ -48,22 +48,13 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %N, 3
   %n.vec = and i32 %n.rnd.up, -4
-  %trip.count.minus.1 = add i32 %N, -1
-  %broadcast.splatinsert11 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
-  %broadcast.splat12 = shufflevector <4 x i32> %broadcast.splatinsert11, <4 x i32> undef, <4 x i32> zeroinitializer
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
   %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %add, %vector.body ]
-  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
-  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
-  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
   %tmp = getelementptr inbounds i32, i32* %a, i32 %index
-
-;  %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat12
   %tmp1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
-
   %tmp2 = bitcast i32* %tmp to <4 x i32>*
   %wide.masked.load.a = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
   %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
@@ -147,22 +138,13 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %N, 3
   %n.vec = and i32 %n.rnd.up, -4
-  %trip.count.minus.1 = add i32 %N, -1
-  %broadcast.splatinsert11 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
-  %broadcast.splat12 = shufflevector <4 x i32> %broadcast.splatinsert11, <4 x i32> undef, <4 x i32> zeroinitializer
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
   %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %add, %vector.body ]
-  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
-  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
-  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
   %tmp = getelementptr inbounds i32, i32* %a, i32 %index
-
-;  %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat12
   %tmp1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
-
   %tmp2 = bitcast i32* %tmp to <4 x i32>*
   %wide.masked.load.a = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
   %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
@@ -205,13 +187,12 @@ define dso_local i32 @and_mul_reduce_add(i32* noalias nocapture readonly %a, i32
 ; CHECK-NEXT:    cmp.w r12, #0
 ; CHECK-NEXT:    beq .LBB2_4
 ; CHECK-NEXT:  @ %bb.1: @ %vector.ph
-; CHECK-NEXT:    add.w r4, r12, #3
-; CHECK-NEXT:    vmov.i32 q1, #0x0
-; CHECK-NEXT:    bic r4, r4, #3
-; CHECK-NEXT:    sub.w lr, r4, #4
+; CHECK-NEXT:    add.w lr, r12, #3
 ; CHECK-NEXT:    movs r4, #1
+; CHECK-NEXT:    bic lr, lr, #3
+; CHECK-NEXT:    vmov.i32 q1, #0x0
+; CHECK-NEXT:    sub.w lr, lr, #4
 ; CHECK-NEXT:    add.w lr, r4, lr, lsr #2
-; CHECK-NEXT:    movs r4, #0
 ; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB2_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
@@ -222,12 +203,11 @@ define dso_local i32 @and_mul_reduce_add(i32* noalias nocapture readonly %a, i32
 ; CHECK-NEXT:    vldrwt.u32 q2, [r0], #16
 ; CHECK-NEXT:    vstr p0, [sp] @ 4-byte Spill
 ; CHECK-NEXT:    vsub.i32 q1, q2, q1
-; CHECK-NEXT:    adds r4, #4
+; CHECK-NEXT:    sub.w r12, r12, #4
 ; CHECK-NEXT:    vpsttt
 ; CHECK-NEXT:    vcmpt.i32 eq, q1, zr
 ; CHECK-NEXT:    vldrwt.u32 q1, [r3], #16
 ; CHECK-NEXT:    vldrwt.u32 q2, [r2], #16
-; CHECK-NEXT:    sub.w r12, r12, #4
 ; CHECK-NEXT:    vmul.i32 q1, q2, q1
 ; CHECK-NEXT:    vadd.i32 q1, q1, q0
 ; CHECK-NEXT:    le lr, .LBB2_2
@@ -249,22 +229,13 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %N, 3
   %n.vec = and i32 %n.rnd.up, -4
-  %trip.count.minus.1 = add i32 %N, -1
-  %broadcast.splatinsert11 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
-  %broadcast.splat12 = shufflevector <4 x i32> %broadcast.splatinsert11, <4 x i32> undef, <4 x i32> zeroinitializer
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
   %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %add, %vector.body ]
-  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
-  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
-  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
   %tmp = getelementptr inbounds i32, i32* %a, i32 %index
-
-;  %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat12
   %tmp1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
-
   %tmp2 = bitcast i32* %tmp to <4 x i32>*
   %wide.masked.load.a = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
   %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
@@ -304,13 +275,12 @@ define dso_local i32 @or_mul_reduce_add(i32* noalias nocapture readonly %a, i32*
 ; CHECK-NEXT:    cmp.w r12, #0
 ; CHECK-NEXT:    beq .LBB3_4
 ; CHECK-NEXT:  @ %bb.1: @ %vector.ph
-; CHECK-NEXT:    add.w r4, r12, #3
-; CHECK-NEXT:    vmov.i32 q1, #0x0
-; CHECK-NEXT:    bic r4, r4, #3
-; CHECK-NEXT:    sub.w lr, r4, #4
+; CHECK-NEXT:    add.w lr, r12, #3
 ; CHECK-NEXT:    movs r4, #1
+; CHECK-NEXT:    bic lr, lr, #3
+; CHECK-NEXT:    vmov.i32 q1, #0x0
+; CHECK-NEXT:    sub.w lr, lr, #4
 ; CHECK-NEXT:    add.w lr, r4, lr, lsr #2
-; CHECK-NEXT:    movs r4, #0
 ; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB3_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
@@ -326,9 +296,8 @@ define dso_local i32 @or_mul_reduce_add(i32* noalias nocapture readonly %a, i32*
 ; CHECK-NEXT:    vcmpt.i32 ne, q1, zr
 ; CHECK-NEXT:    vldrwe.u32 q1, [r3], #16
 ; CHECK-NEXT:    vldrwe.u32 q2, [r2], #16
-; CHECK-NEXT:    adds r4, #4
-; CHECK-NEXT:    vmul.i32 q1, q2, q1
 ; CHECK-NEXT:    sub.w r12, r12, #4
+; CHECK-NEXT:    vmul.i32 q1, q2, q1
 ; CHECK-NEXT:    vadd.i32 q1, q1, q0
 ; CHECK-NEXT:    le lr, .LBB3_2
 ; CHECK-NEXT:  @ %bb.3: @ %middle.block
@@ -348,22 +317,13 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %N, 3
   %n.vec = and i32 %n.rnd.up, -4
-  %trip.count.minus.1 = add i32 %N, -1
-  %broadcast.splatinsert11 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
-  %broadcast.splat12 = shufflevector <4 x i32> %broadcast.splatinsert11, <4 x i32> undef, <4 x i32> zeroinitializer
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
   %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %add, %vector.body ]
-  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
-  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
-  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
   %tmp = getelementptr inbounds i32, i32* %a, i32 %index
-
-;  %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat12
   %tmp1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
-
   %tmp2 = bitcast i32* %tmp to <4 x i32>*
   %wide.masked.load.a = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
   %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
@@ -402,11 +362,9 @@ define dso_local void @continue_on_zero(i32* noalias nocapture %arg, i32* noalia
 ; CHECK-NEXT:    it eq
 ; CHECK-NEXT:    popeq {r7, pc}
 ; CHECK-NEXT:  .LBB4_1: @ %bb3
-; CHECK-NEXT:    movs r3, #0
 ; CHECK-NEXT:    dlstp.32 lr, r2
 ; CHECK-NEXT:  .LBB4_2: @ %bb9
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    adds r3, #4
 ; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
 ; CHECK-NEXT:    vpt.i32 ne, q0, zr
 ; CHECK-NEXT:    vldrwt.u32 q1, [r0]
@@ -423,21 +381,12 @@ bb:
 bb3:                                              ; preds = %bb
   %tmp4 = add i32 %arg2, 3
   %tmp5 = and i32 %tmp4, -4
-  %tmp6 = add i32 %arg2, -1
-  %tmp7 = insertelement <4 x i32> undef, i32 %tmp6, i32 0
-  %tmp8 = shufflevector <4 x i32> %tmp7, <4 x i32> undef, <4 x i32> zeroinitializer
   br label %bb9
 
 bb9:                                              ; preds = %bb9, %bb3
   %tmp10 = phi i32 [ 0, %bb3 ], [ %tmp25, %bb9 ]
-  %tmp11 = insertelement <4 x i32> undef, i32 %tmp10, i32 0
-  %tmp12 = shufflevector <4 x i32> %tmp11, <4 x i32> undef, <4 x i32> zeroinitializer
-  %tmp13 = add <4 x i32> %tmp12, <i32 0, i32 1, i32 2, i32 3>
   %tmp14 = getelementptr inbounds i32, i32* %arg1, i32 %tmp10
-
-  ; %tmp15 = icmp ule <4 x i32> %tmp13, %tmp8
   %tmp15 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %tmp10, i32 %arg2)
-
   %tmp16 = bitcast i32* %tmp14 to <4 x i32>*
   %tmp17 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp16, i32 4, <4 x i1> %tmp15, <4 x i32> undef)
   %tmp18 = icmp ne <4 x i32> %tmp17, zeroinitializer
@@ -464,7 +413,6 @@ define dso_local arm_aapcs_vfpcc void @range_test(i32* noalias nocapture %arg, i
 ; CHECK-NEXT:    it eq
 ; CHECK-NEXT:    popeq {r7, pc}
 ; CHECK-NEXT:  .LBB5_1: @ %bb4
-; CHECK-NEXT:    mov.w r12, #0
 ; CHECK-NEXT:    dlstp.32 lr, r3
 ; CHECK-NEXT:  .LBB5_2: @ %bb12
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
@@ -472,7 +420,6 @@ define dso_local arm_aapcs_vfpcc void @range_test(i32* noalias nocapture %arg, i
 ; CHECK-NEXT:    vptt.i32 ne, q0, zr
 ; CHECK-NEXT:    vcmpt.s32 le, q0, r2
 ; CHECK-NEXT:    vldrwt.u32 q1, [r1], #16
-; CHECK-NEXT:    add.w r12, r12, #4
 ; CHECK-NEXT:    vmul.i32 q0, q1, q0
 ; CHECK-NEXT:    vpst
 ; CHECK-NEXT:    vstrwt.32 q0, [r0], #16
@@ -486,23 +433,14 @@ bb:
 bb4:                                              ; preds = %bb
   %tmp5 = add i32 %arg3, 3
   %tmp6 = and i32 %tmp5, -4
-  %tmp7 = add i32 %arg3, -1
-  %tmp8 = insertelement <4 x i32> undef, i32 %tmp7, i32 0
-  %tmp9 = shufflevector <4 x i32> %tmp8, <4 x i32> undef, <4 x i32> zeroinitializer
   %tmp10 = insertelement <4 x i32> undef, i32 %arg2, i32 0
   %tmp11 = shufflevector <4 x i32> %tmp10, <4 x i32> undef, <4 x i32> zeroinitializer
   br label %bb12
 
 bb12:                                             ; preds = %bb12, %bb4
   %tmp13 = phi i32 [ 0, %bb4 ], [ %tmp30, %bb12 ]
-  %tmp14 = insertelement <4 x i32> undef, i32 %tmp13, i32 0
-  %tmp15 = shufflevector <4 x i32> %tmp14, <4 x i32> undef, <4 x i32> zeroinitializer
-  %tmp16 = add <4 x i32> %tmp15, <i32 0, i32 1, i32 2, i32 3>
   %tmp17 = getelementptr inbounds i32, i32* %arg, i32 %tmp13
-
-  ; %tmp18 = icmp ule <4 x i32> %tmp16, %tmp9
   %tmp18= call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %tmp13, i32 %arg3)
-
   %tmp19 = bitcast i32* %tmp17 to <4 x i32>*
   %tmp20 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp19, i32 4, <4 x i1> %tmp18, <4 x i32> undef)
   %tmp21 = icmp ne <4 x i32> %tmp20, zeroinitializer
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/extending-loads.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/extending-loads.ll
index 2627965913eb..01564487b576 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/extending-loads.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/extending-loads.ll
@@ -9,11 +9,9 @@ define dso_local arm_aapcs_vfpcc void @sext_i8(i16* noalias nocapture %a, i8* no
 ; CHECK-NEXT:    it eq
 ; CHECK-NEXT:    popeq {r7, pc}
 ; CHECK-NEXT:  .LBB0_1: @ %vector.ph
-; CHECK-NEXT:    movs r3, #0
 ; CHECK-NEXT:    dlstp.16 lr, r2
 ; CHECK-NEXT:  .LBB0_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    adds r3, #8
 ; CHECK-NEXT:    vldrb.s16 q0, [r1], #8
 ; CHECK-NEXT:    vldrh.u16 q1, [r0]
 ; CHECK-NEXT:    vadd.i16 q0, q1, q0
@@ -28,21 +26,12 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %N, 7
   %n.vec = and i32 %n.rnd.up, -8
-  %trip.count.minus.1 = add i32 %N, -1
-  %broadcast.splatinsert10 = insertelement <8 x i32> undef, i32 %trip.count.minus.1, i32 0
-  %broadcast.splat11 = shufflevector <8 x i32> %broadcast.splatinsert10, <8 x i32> undef, <8 x i32> zeroinitializer
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %broadcast.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0
-  %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer
-  %induction = or <8 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %0 = getelementptr inbounds i8, i8* %b, i32 %index
-
-  ; %1 = icmp ule <8 x i32> %induction, %broadcast.splat11
   %1 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %N)
-
   %2 = bitcast i8* %0 to <8 x i8>*
   %wide.masked.load = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %2, i32 1, <8 x i1> %1, <8 x i8> undef)
   %3 = sext <8 x i8> %wide.masked.load to <8 x i16>
@@ -69,11 +58,9 @@ define dso_local arm_aapcs_vfpcc void @zext_i8(i16* noalias nocapture %a, i8* no
 ; CHECK-NEXT:    it eq
 ; CHECK-NEXT:    popeq {r7, pc}
 ; CHECK-NEXT:  .LBB1_1: @ %vector.ph
-; CHECK-NEXT:    movs r3, #0
 ; CHECK-NEXT:    dlstp.16 lr, r2
 ; CHECK-NEXT:  .LBB1_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    adds r3, #8
 ; CHECK-NEXT:    vldrb.u16 q0, [r1], #8
 ; CHECK-NEXT:    vldrh.u16 q1, [r0]
 ; CHECK-NEXT:    vadd.i16 q0, q1, q0
@@ -88,21 +75,12 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %N, 7
   %n.vec = and i32 %n.rnd.up, -8
-  %trip.count.minus.1 = add i32 %N, -1
-  %broadcast.splatinsert10 = insertelement <8 x i32> undef, i32 %trip.count.minus.1, i32 0
-  %broadcast.splat11 = shufflevector <8 x i32> %broadcast.splatinsert10, <8 x i32> undef, <8 x i32> zeroinitializer
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %broadcast.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0
-  %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer
-  %induction = or <8 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %0 = getelementptr inbounds i8, i8* %b, i32 %index
-
-  ; %1 = icmp ule <8 x i32> %induction, %broadcast.splat11
   %1 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %N)
-
   %2 = bitcast i8* %0 to <8 x i8>*
   %wide.masked.load = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %2, i32 1, <8 x i1> %1, <8 x i8> undef)
   %3 = zext <8 x i8> %wide.masked.load to <8 x i16>
@@ -129,11 +107,9 @@ define dso_local arm_aapcs_vfpcc void @sext_i16(i32* noalias nocapture %a, i16*
 ; CHECK-NEXT:    it eq
 ; CHECK-NEXT:    popeq {r7, pc}
 ; CHECK-NEXT:  .LBB2_1: @ %vector.ph
-; CHECK-NEXT:    movs r3, #0
 ; CHECK-NEXT:    dlstp.32 lr, r2
 ; CHECK-NEXT:  .LBB2_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    adds r3, #4
 ; CHECK-NEXT:    vldrh.s32 q0, [r1], #8
 ; CHECK-NEXT:    vldrw.u32 q1, [r0]
 ; CHECK-NEXT:    vadd.i32 q0, q1, q0
@@ -148,21 +124,12 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %N, 3
   %n.vec = and i32 %n.rnd.up, -4
-  %trip.count.minus.1 = add i32 %N, -1
-  %broadcast.splatinsert8 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
-  %broadcast.splat9 = shufflevector <4 x i32> %broadcast.splatinsert8, <4 x i32> undef, <4 x i32> zeroinitializer
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
-  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
-  %induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
   %0 = getelementptr inbounds i16, i16* %b, i32 %index
-
-  ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat9
   %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
-
   %2 = bitcast i16* %0 to <4 x i16>*
   %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %2, i32 2, <4 x i1> %1, <4 x i16> undef)
   %3 = sext <4 x i16> %wide.masked.load to <4 x i32>
@@ -189,11 +156,9 @@ define dso_local arm_aapcs_vfpcc void @zext_i16(i32* noalias nocapture %a, i16*
 ; CHECK-NEXT:    it eq
 ; CHECK-NEXT:    popeq {r7, pc}
 ; CHECK-NEXT:  .LBB3_1: @ %vector.ph
-; CHECK-NEXT:    movs r3, #0
 ; CHECK-NEXT:    dlstp.32 lr, r2
 ; CHECK-NEXT:  .LBB3_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    adds r3, #4
 ; CHECK-NEXT:    vldrh.u32 q0, [r1], #8
 ; CHECK-NEXT:    vldrw.u32 q1, [r0]
 ; CHECK-NEXT:    vadd.i32 q0, q1, q0
@@ -208,21 +173,12 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %N, 3
   %n.vec = and i32 %n.rnd.up, -4
-  %trip.count.minus.1 = add i32 %N, -1
-  %broadcast.splatinsert8 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
-  %broadcast.splat9 = shufflevector <4 x i32> %broadcast.splatinsert8, <4 x i32> undef, <4 x i32> zeroinitializer
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
-  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
-  %induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
   %0 = getelementptr inbounds i16, i16* %b, i32 %index
-
-  ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat9
   %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
-
   %2 = bitcast i16* %0 to <4 x i16>*
   %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %2, i32 2, <4 x i1> %1, <4 x i16> undef)
   %3 = zext <4 x i16> %wide.masked.load to <4 x i32>
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll
index 273e1d5dd360..03e56812e273 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll
@@ -34,11 +34,9 @@ define arm_aapcs_vfpcc void @fast_float_mul(float* nocapture %a, float* nocaptur
 ; CHECK-NEXT:    movs r3, #0
 ; CHECK-NEXT:    b .LBB0_8
 ; CHECK-NEXT:  .LBB0_4: @ %vector.ph
-; CHECK-NEXT:    mov.w r12, #0
 ; CHECK-NEXT:    dlstp.32 lr, r3
 ; CHECK-NEXT:  .LBB0_5: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    add.w r12, r12, #4
 ; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
 ; CHECK-NEXT:    vldrw.u32 q1, [r2], #16
 ; CHECK-NEXT:    vmul.f32 q0, q1, q0
@@ -122,21 +120,12 @@ for.body.preheader.new:                           ; preds = %for.body.preheader
 vector.ph:                                        ; preds = %vector.memcheck
   %n.rnd.up = add i32 %N, 3
   %n.vec = and i32 %n.rnd.up, -4
-  %trip.count.minus.1 = add i32 %N, -1
-  %broadcast.splatinsert21 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
-  %broadcast.splat22 = shufflevector <4 x i32> %broadcast.splatinsert21, <4 x i32> undef, <4 x i32> zeroinitializer
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
-  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
-  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
   %2 = getelementptr inbounds float, float* %b, i32 %index
-
-  ; %3 = icmp ule <4 x i32> %induction, %broadcast.splat22
   %3 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
-
   %4 = bitcast float* %2 to <4 x float>*
   %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %3, <4 x float> undef)
   %5 = getelementptr inbounds float, float* %c, i32 %index
@@ -225,12 +214,10 @@ define arm_aapcs_vfpcc float @fast_float_mac(float* nocapture readonly %b, float
 ; CHECK-NEXT:    vmov.i32 q0, #0x0
 ; CHECK-NEXT:    subs r3, #4
 ; CHECK-NEXT:    add.w lr, r12, r3, lsr #2
-; CHECK-NEXT:    movs r3, #0
 ; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB1_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vctp.32 r2
-; CHECK-NEXT:    adds r3, #4
 ; CHECK-NEXT:    subs r2, #4
 ; CHECK-NEXT:    vmov q1, q0
 ; CHECK-NEXT:    vpstt
@@ -262,22 +249,13 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %N, 3
   %n.vec = and i32 %n.rnd.up, -4
-  %trip.count.minus.1 = add i32 %N, -1
-  %broadcast.splatinsert11 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
-  %broadcast.splat12 = shufflevector <4 x i32> %broadcast.splatinsert11, <4 x i32> undef, <4 x i32> zeroinitializer
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
   %vec.phi = phi <4 x float> [ zeroinitializer, %vector.ph ], [ %6, %vector.body ]
-  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
-  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
-  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
   %0 = getelementptr inbounds float, float* %b, i32 %index
-
-;  %1 = icmp ule <4 x i32> %induction, %broadcast.splat12
   %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
-
   %2 = bitcast float* %0 to <4 x float>*
   %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
   %3 = getelementptr inbounds float, float* %c, i32 %index
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll
index e2136c5c9483..77ae179851b3 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll
@@ -16,12 +16,10 @@ define arm_aapcs_vfpcc i32 @test_acc_scalar_char(i8 zeroext %a, i8* nocapture re
 ; CHECK-NEXT:    sub.w r12, r3, #4
 ; CHECK-NEXT:    movs r3, #1
 ; CHECK-NEXT:    add.w lr, r3, r12, lsr #2
-; CHECK-NEXT:    movs r3, #0
 ; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB0_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vctp.32 r2
-; CHECK-NEXT:    adds r3, #4
 ; CHECK-NEXT:    subs r2, #4
 ; CHECK-NEXT:    vmov q1, q0
 ; CHECK-NEXT:    vpst
@@ -40,9 +38,6 @@ vector.ph:                                        ; preds = %entry
   %conv = zext i8 %a to i32
   %n.rnd.up = add i32 %N, 3
   %n.vec = and i32 %n.rnd.up, -4
-  %trip.count.minus.1 = add i32 %N, -1
-  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
-  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
   %broadcast.splatinsert12 = insertelement <4 x i32> undef, i32 %conv, i32 0
   %broadcast.splat13 = shufflevector <4 x i32> %broadcast.splatinsert12, <4 x i32> undef, <4 x i32> zeroinitializer
   br label %vector.body
@@ -50,14 +45,8 @@ vector.ph:                                        ; preds = %entry
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
   %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %5, %vector.body ]
-  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
-  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
-  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
   %0 = getelementptr inbounds i8, i8* %b, i32 %index
-
-; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
   %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
-
   %2 = bitcast i8* %0 to <4 x i8>*
   %wide.masked.load = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %2, i32 1, <4 x i1> %1, <4 x i8> undef)
   %3 = zext <4 x i8> %wide.masked.load to <4 x i32>
@@ -92,12 +81,10 @@ define arm_aapcs_vfpcc i32 @test_acc_scalar_short(i16 signext %a, i16* nocapture
 ; CHECK-NEXT:    sub.w r12, r3, #4
 ; CHECK-NEXT:    movs r3, #1
 ; CHECK-NEXT:    add.w lr, r3, r12, lsr #2
-; CHECK-NEXT:    movs r3, #0
 ; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB1_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vctp.32 r2
-; CHECK-NEXT:    adds r3, #4
 ; CHECK-NEXT:    subs r2, #4
 ; CHECK-NEXT:    vmov q1, q0
 ; CHECK-NEXT:    vpst
@@ -116,9 +103,6 @@ vector.ph:                                        ; preds = %entry
   %conv = sext i16 %a to i32
   %n.rnd.up = add i32 %N, 3
   %n.vec = and i32 %n.rnd.up, -4
-  %trip.count.minus.1 = add i32 %N, -1
-  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
-  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
   %broadcast.splatinsert12 = insertelement <4 x i32> undef, i32 %conv, i32 0
   %broadcast.splat13 = shufflevector <4 x i32> %broadcast.splatinsert12, <4 x i32> undef, <4 x i32> zeroinitializer
   br label %vector.body
@@ -126,14 +110,8 @@ vector.ph:                                        ; preds = %entry
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
   %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %5, %vector.body ]
-  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
-  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
-  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
   %0 = getelementptr inbounds i16, i16* %b, i32 %index
-
-;  %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
   %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
-
   %2 = bitcast i16* %0 to <4 x i16>*
   %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %2, i32 2, <4 x i1> %1, <4 x i16> undef)
   %3 = sext <4 x i16> %wide.masked.load to <4 x i32>
@@ -168,12 +146,10 @@ define arm_aapcs_vfpcc i32 @test_acc_scalar_uchar(i8 zeroext %a, i8* nocapture r
 ; CHECK-NEXT:    sub.w r12, r3, #4
 ; CHECK-NEXT:    movs r3, #1
 ; CHECK-NEXT:    add.w lr, r3, r12, lsr #2
-; CHECK-NEXT:    movs r3, #0
 ; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB2_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vctp.32 r2
-; CHECK-NEXT:    adds r3, #4
 ; CHECK-NEXT:    subs r2, #4
 ; CHECK-NEXT:    vmov q1, q0
 ; CHECK-NEXT:    vpst
@@ -192,9 +168,6 @@ vector.ph:                                        ; preds = %entry
   %conv = zext i8 %a to i32
   %n.rnd.up = add i32 %N, 3
   %n.vec = and i32 %n.rnd.up, -4
-  %trip.count.minus.1 = add i32 %N, -1
-  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
-  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
   %broadcast.splatinsert12 = insertelement <4 x i32> undef, i32 %conv, i32 0
   %broadcast.splat13 = shufflevector <4 x i32> %broadcast.splatinsert12, <4 x i32> undef, <4 x i32> zeroinitializer
   br label %vector.body
@@ -202,14 +175,8 @@ vector.ph:                                        ; preds = %entry
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
   %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %5, %vector.body ]
-  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
-  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
-  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
   %0 = getelementptr inbounds i8, i8* %b, i32 %index
-
-;  %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
   %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
-
   %2 = bitcast i8* %0 to <4 x i8>*
   %wide.masked.load = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %2, i32 1, <4 x i1> %1, <4 x i8> undef)
   %3 = zext <4 x i8> %wide.masked.load to <4 x i32>
@@ -244,12 +211,10 @@ define arm_aapcs_vfpcc i32 @test_acc_scalar_ushort(i16 signext %a, i16* nocaptur
 ; CHECK-NEXT:    sub.w r12, r3, #4
 ; CHECK-NEXT:    movs r3, #1
 ; CHECK-NEXT:    add.w lr, r3, r12, lsr #2
-; CHECK-NEXT:    movs r3, #0
 ; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB3_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vctp.32 r2
-; CHECK-NEXT:    adds r3, #4
 ; CHECK-NEXT:    subs r2, #4
 ; CHECK-NEXT:    vmov q1, q0
 ; CHECK-NEXT:    vpst
@@ -268,9 +233,6 @@ vector.ph:                                        ; preds = %entry
   %conv = sext i16 %a to i32
   %n.rnd.up = add i32 %N, 3
   %n.vec = and i32 %n.rnd.up, -4
-  %trip.count.minus.1 = add i32 %N, -1
-  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
-  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
   %broadcast.splatinsert12 = insertelement <4 x i32> undef, i32 %conv, i32 0
   %broadcast.splat13 = shufflevector <4 x i32> %broadcast.splatinsert12, <4 x i32> undef, <4 x i32> zeroinitializer
   br label %vector.body
@@ -278,14 +240,8 @@ vector.ph:                                        ; preds = %entry
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
   %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %5, %vector.body ]
-  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
-  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
-  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
   %0 = getelementptr inbounds i16, i16* %b, i32 %index
-
-;  %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
   %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
-
   %2 = bitcast i16* %0 to <4 x i16>*
   %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %2, i32 2, <4 x i1> %1, <4 x i16> undef)
   %3 = zext <4 x i16> %wide.masked.load to <4 x i32>
@@ -320,12 +276,10 @@ define arm_aapcs_vfpcc i32 @test_acc_scalar_int(i32 %a, i32* nocapture readonly
 ; CHECK-NEXT:    sub.w r12, r3, #4
 ; CHECK-NEXT:    movs r3, #1
 ; CHECK-NEXT:    add.w lr, r3, r12, lsr #2
-; CHECK-NEXT:    movs r3, #0
 ; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB4_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vctp.32 r2
-; CHECK-NEXT:    adds r3, #4
 ; CHECK-NEXT:    subs r2, #4
 ; CHECK-NEXT:    vmov q1, q0
 ; CHECK-NEXT:    vpst
@@ -343,9 +297,6 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %N, 3
   %n.vec = and i32 %n.rnd.up, -4
-  %trip.count.minus.1 = add i32 %N, -1
-  %broadcast.splatinsert9 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
-  %broadcast.splat10 = shufflevector <4 x i32> %broadcast.splatinsert9, <4 x i32> undef, <4 x i32> zeroinitializer
   %broadcast.splatinsert11 = insertelement <4 x i32> undef, i32 %a, i32 0
   %broadcast.splat12 = shufflevector <4 x i32> %broadcast.splatinsert11, <4 x i32> undef, <4 x i32> zeroinitializer
   br label %vector.body
@@ -353,14 +304,8 @@ vector.ph:                                        ; preds = %entry
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
   %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %4, %vector.body ]
-  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
-  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
-  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
   %0 = getelementptr inbounds i32, i32* %b, i32 %index
-
-;  %1 = icmp ule <4 x i32> %induction, %broadcast.splat10
   %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
-
   %2 = bitcast i32* %0 to <4 x i32>*
   %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %2, i32 4, <4 x i1> %1, <4 x i32> undef)
   %3 = mul nsw <4 x i32> %wide.masked.load, %broadcast.splat12
@@ -413,11 +358,9 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_char(i8* nocapture readonly
 ; CHECK-NEXT:    mov.w r12, #0
 ; CHECK-NEXT:    b .LBB5_8
 ; CHECK-NEXT:  .LBB5_4: @ %vector.ph
-; CHECK-NEXT:    movs r7, #0
 ; CHECK-NEXT:    dlstp.32 lr, r12
 ; CHECK-NEXT:  .LBB5_5: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    adds r7, #4
 ; CHECK-NEXT:    vldrb.u32 q0, [r0], #4
 ; CHECK-NEXT:    vldrb.u32 q1, [r1], #4
 ; CHECK-NEXT:    vmlas.u32 q1, q0, r2
@@ -500,23 +443,14 @@ for.body.preheader.new:                           ; preds = %for.body.preheader
 vector.ph:                                        ; preds = %for.body.lr.ph
   %n.rnd.up = add i32 %N, 3
   %n.vec = and i32 %n.rnd.up, -4
-  %trip.count.minus.1 = add i32 %N, -1
-  %broadcast.splatinsert19 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
-  %broadcast.splat20 = shufflevector <4 x i32> %broadcast.splatinsert19, <4 x i32> undef, <4 x i32> zeroinitializer
   %broadcast.splatinsert22 = insertelement <4 x i32> undef, i32 %conv3, i32 0
   %broadcast.splat23 = shufflevector <4 x i32> %broadcast.splatinsert22, <4 x i32> undef, <4 x i32> zeroinitializer
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
-  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
-  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
   %2 = getelementptr inbounds i8, i8* %a, i32 %index
-
-  ; %3 = icmp ule <4 x i32> %induction, %broadcast.splat20
   %3 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
-
   %4 = bitcast i8* %2 to <4 x i8>*
   %wide.masked.load = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %4, i32 1, <4 x i1> %3, <4 x i8> undef)
   %5 = zext <4 x i8> %wide.masked.load to <4 x i32>
@@ -620,11 +554,9 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_short(i16* nocapture readon
 ; CHECK-NEXT:    it eq
 ; CHECK-NEXT:    popeq {r4, pc}
 ; CHECK-NEXT:  .LBB6_1: @ %vector.ph
-; CHECK-NEXT:    movs r4, #0
 ; CHECK-NEXT:    dlstp.32 lr, r12
 ; CHECK-NEXT:  .LBB6_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    adds r4, #4
 ; CHECK-NEXT:    vldrh.s32 q0, [r0], #8
 ; CHECK-NEXT:    vldrh.s32 q1, [r1], #8
 ; CHECK-NEXT:    vmlas.u32 q1, q0, r2
@@ -640,23 +572,14 @@ vector.ph:                                        ; preds = %entry
   %conv3 = sext i16 %c to i32
   %n.rnd.up = add i32 %N, 3
   %n.vec = and i32 %n.rnd.up, -4
-  %trip.count.minus.1 = add i32 %N, -1
-  %broadcast.splatinsert12 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
-  %broadcast.splat13 = shufflevector <4 x i32> %broadcast.splatinsert12, <4 x i32> undef, <4 x i32> zeroinitializer
   %broadcast.splatinsert15 = insertelement <4 x i32> undef, i32 %conv3, i32 0
   %broadcast.splat16 = shufflevector <4 x i32> %broadcast.splatinsert15, <4 x i32> undef, <4 x i32> zeroinitializer
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
-  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
-  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
   %0 = getelementptr inbounds i16, i16* %a, i32 %index
-
-;  %1 = icmp ule <4 x i32> %induction, %broadcast.splat13
   %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
-
   %2 = bitcast i16* %0 to <4 x i16>*
   %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %2, i32 2, <4 x i1> %1, <4 x i16> undef)
   %3 = sext <4 x i16> %wide.masked.load to <4 x i32>
@@ -711,11 +634,9 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_uchar(i8* nocapture readonl
 ; CHECK-NEXT:    mov.w r12, #0
 ; CHECK-NEXT:    b .LBB7_8
 ; CHECK-NEXT:  .LBB7_4: @ %vector.ph
-; CHECK-NEXT:    movs r7, #0
 ; CHECK-NEXT:    dlstp.32 lr, r12
 ; CHECK-NEXT:  .LBB7_5: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    adds r7, #4
 ; CHECK-NEXT:    vldrb.u32 q0, [r0], #4
 ; CHECK-NEXT:    vldrb.u32 q1, [r1], #4
 ; CHECK-NEXT:    vmlas.u32 q1, q0, r2
@@ -798,23 +719,14 @@ for.body.preheader.new:                           ; preds = %for.body.preheader
 vector.ph:                                        ; preds = %for.body.lr.ph
   %n.rnd.up = add i32 %N, 3
   %n.vec = and i32 %n.rnd.up, -4
-  %trip.count.minus.1 = add i32 %N, -1
-  %broadcast.splatinsert19 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
-  %broadcast.splat20 = shufflevector <4 x i32> %broadcast.splatinsert19, <4 x i32> undef, <4 x i32> zeroinitializer
   %broadcast.splatinsert22 = insertelement <4 x i32> undef, i32 %conv3, i32 0
   %broadcast.splat23 = shufflevector <4 x i32> %broadcast.splatinsert22, <4 x i32> undef, <4 x i32> zeroinitializer
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
-  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
-  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
   %2 = getelementptr inbounds i8, i8* %a, i32 %index
-
-;  %3 = icmp ule <4 x i32> %induction, %broadcast.splat20
   %3 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
-
   %4 = bitcast i8* %2 to <4 x i8>*
   %wide.masked.load = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %4, i32 1, <4 x i1> %3, <4 x i8> undef)
   %5 = zext <4 x i8> %wide.masked.load to <4 x i32>
@@ -918,11 +830,9 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_ushort(i16* nocapture reado
 ; CHECK-NEXT:    it eq
 ; CHECK-NEXT:    popeq {r4, pc}
 ; CHECK-NEXT:  .LBB8_1: @ %vector.ph
-; CHECK-NEXT:    movs r4, #0
 ; CHECK-NEXT:    dlstp.32 lr, r12
 ; CHECK-NEXT:  .LBB8_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    adds r4, #4
 ; CHECK-NEXT:    vldrh.u32 q0, [r0], #8
 ; CHECK-NEXT:    vldrh.u32 q1, [r1], #8
 ; CHECK-NEXT:    vmlas.u32 q1, q0, r2
@@ -938,23 +848,14 @@ vector.ph:                                        ; preds = %entry
   %conv3 = sext i16 %c to i32
   %n.rnd.up = add i32 %N, 3
   %n.vec = and i32 %n.rnd.up, -4
-  %trip.count.minus.1 = add i32 %N, -1
-  %broadcast.splatinsert12 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
-  %broadcast.splat13 = shufflevector <4 x i32> %broadcast.splatinsert12, <4 x i32> undef, <4 x i32> zeroinitializer
   %broadcast.splatinsert15 = insertelement <4 x i32> undef, i32 %conv3, i32 0
   %broadcast.splat16 = shufflevector <4 x i32> %broadcast.splatinsert15, <4 x i32> undef, <4 x i32> zeroinitializer
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
-  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
-  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
   %0 = getelementptr inbounds i16, i16* %a, i32 %index
-
-;  %1 = icmp ule <4 x i32> %induction, %broadcast.splat13
   %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
-
   %2 = bitcast i16* %0 to <4 x i16>*
   %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %2, i32 2, <4 x i1> %1, <4 x i16> undef)
   %3 = zext <4 x i16> %wide.masked.load to <4 x i32>
@@ -1009,11 +910,9 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_int(i32* nocapture readonly
 ; CHECK-NEXT:    mov.w r12, #0
 ; CHECK-NEXT:    b .LBB9_8
 ; CHECK-NEXT:  .LBB9_4: @ %vector.ph
-; CHECK-NEXT:    movs r7, #0
 ; CHECK-NEXT:    dlstp.32 lr, r12
 ; CHECK-NEXT:  .LBB9_5: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    adds r7, #4
 ; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
 ; CHECK-NEXT:    vldrw.u32 q1, [r1], #16
 ; CHECK-NEXT:    vmlas.u32 q1, q0, r2
@@ -1095,23 +994,14 @@ for.body.preheader.new:                           ; preds = %for.body.preheader
 vector.ph:                                        ; preds = %vector.memcheck
   %n.rnd.up = add i32 %N, 3
   %n.vec = and i32 %n.rnd.up, -4
-  %trip.count.minus.1 = add i32 %N, -1
-  %broadcast.splatinsert21 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
-  %broadcast.splat22 = shufflevector <4 x i32> %broadcast.splatinsert21, <4 x i32> undef, <4 x i32> zeroinitializer
   %broadcast.splatinsert24 = insertelement <4 x i32> undef, i32 %c, i32 0
   %broadcast.splat25 = shufflevector <4 x i32> %broadcast.splatinsert24, <4 x i32> undef, <4 x i32> zeroinitializer
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
-  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
-  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
   %2 = getelementptr inbounds i32, i32* %a, i32 %index
-
-;  %3 = icmp ule <4 x i32> %induction, %broadcast.splat22
   %3 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
-
   %4 = bitcast i32* %2 to <4 x i32>*
   %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %4, i32 4, <4 x i1> %3, <4 x i32> undef)
   %5 = getelementptr inbounds i32, i32* %b, i32 %index
@@ -1202,11 +1092,9 @@ define dso_local arm_aapcs_vfpcc void @test_v8i8_to_v8i16(i16* noalias nocapture
 ; CHECK-NEXT:    it eq
 ; CHECK-NEXT:    popeq {r7, pc}
 ; CHECK-NEXT:  .LBB10_1: @ %vector.ph
-; CHECK-NEXT:    mov.w r12, #0
 ; CHECK-NEXT:    dlstp.16 lr, r3
 ; CHECK-NEXT:  .LBB10_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    add.w r12, r12, #8
 ; CHECK-NEXT:    vldrb.u16 q0, [r1], #8
 ; CHECK-NEXT:    vldrb.u16 q1, [r2], #8
 ; CHECK-NEXT:    vmul.i16 q0, q1, q0
@@ -1221,21 +1109,12 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %N, 7
   %n.vec = and i32 %n.rnd.up, -8
-  %trip.count.minus.1 = add i32 %N, -1
-  %broadcast.splatinsert12 = insertelement <8 x i32> undef, i32 %trip.count.minus.1, i32 0
-  %broadcast.splat13 = shufflevector <8 x i32> %broadcast.splatinsert12, <8 x i32> undef, <8 x i32> zeroinitializer
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %broadcast.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0
-  %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer
-  %induction = add <8 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %0 = getelementptr inbounds i8, i8* %b, i32 %index
-
-;  %1 = icmp ule <8 x i32> %induction, %broadcast.splat13
   %1 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %N)
-
   %2 = bitcast i8* %0 to <8 x i8>*
   %wide.masked.load = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %2, i32 1, <8 x i1> %1, <8 x i8> undef)
   %3 = zext <8 x i8> %wide.masked.load to <8 x i16>
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/nested.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/nested.ll
index c8d38032a6a4..6b71a070d651 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/nested.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/nested.ll
@@ -9,9 +9,6 @@ define void @mat_vec_sext_i16(i16** nocapture readonly %A, i16* nocapture readon
 ; CHECK:       for.cond1.preheader.us.preheader:
 ; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i32 [[N]], 3
 ; CHECK-NEXT:    [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -4
-; CHECK-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = add i32 [[N]], -1
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT28:%.*]] = insertelement <4 x i32> undef, i32 [[TRIP_COUNT_MINUS_1]], i32 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT29:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT28]], <4 x i32> undef, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TT:%.*]] = add i32 [[N_VEC]], -4
 ; CHECK-NEXT:    [[TT1:%.*]] = lshr i32 [[TT]], 2
 ; CHECK-NEXT:    [[TT2:%.*]] = add nuw nsw i32 [[TT1]], 1
@@ -30,9 +27,6 @@ define void @mat_vec_sext_i16(i16** nocapture readonly %A, i16* nocapture readon
 ; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ [[TT4]], [[FOR_COND1_PREHEADER_US]] ], [ [[TT14:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TT5:%.*]] = phi i32 [ [[START]], [[FOR_COND1_PREHEADER_US]] ], [ [[TT15:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = phi i32 [ [[N]], [[FOR_COND1_PREHEADER_US]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[INDEX]], i32 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3>
 ; CHECK-NEXT:    [[TT6:%.*]] = getelementptr inbounds i16, i16* [[TT3]], i32 [[INDEX]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[TMP0]])
 ; CHECK-NEXT:    [[TMP2]] = sub i32 [[TMP0]], 4
@@ -66,9 +60,6 @@ entry:
 for.cond1.preheader.us.preheader:                 ; preds = %entry
   %n.rnd.up = add i32 %N, 3
   %n.vec = and i32 %n.rnd.up, -4
-  %trip.count.minus.1 = add i32 %N, -1
-  %broadcast.splatinsert28 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
-  %broadcast.splat29 = shufflevector <4 x i32> %broadcast.splatinsert28, <4 x i32> undef, <4 x i32> zeroinitializer
   %tt = add i32 %n.vec, -4
   %tt1 = lshr i32 %tt, 2
   %tt2 = add nuw nsw i32 %tt1, 1
@@ -88,14 +79,8 @@ vector.body:                                      ; preds = %vector.body, %for.c
   %index = phi i32 [ 0, %for.cond1.preheader.us ], [ %index.next, %vector.body ]
   %vec.phi = phi <4 x i32> [ %tt4, %for.cond1.preheader.us ], [ %tt14, %vector.body ]
   %tt5 = phi i32 [ %start, %for.cond1.preheader.us ], [ %tt15, %vector.body ]
-  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
-  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
-  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
   %tt6 = getelementptr inbounds i16, i16* %tt3, i32 %index
-
-  ; %tt7 = icmp ule <4 x i32> %induction, %broadcast.splat29
   %tt7 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
-
   %tt8 = bitcast i16* %tt6 to <4 x i16>*
   %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %tt8, i32 2, <4 x i1> %tt7, <4 x i16> undef)
   %tt9 = sext <4 x i16> %wide.masked.load to <4 x i32>
@@ -130,9 +115,6 @@ define void @mat_vec_i32(i32** nocapture readonly %A, i32* nocapture readonly %B
 ; CHECK:       for.cond1.preheader.us.preheader:
 ; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i32 [[N]], 3
 ; CHECK-NEXT:    [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -4
-; CHECK-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = add i32 [[N]], -1
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT27:%.*]] = insertelement <4 x i32> undef, i32 [[TRIP_COUNT_MINUS_1]], i32 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT28:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT27]], <4 x i32> undef, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TT:%.*]] = add i32 [[N_VEC]], -4
 ; CHECK-NEXT:    [[TT1:%.*]] = lshr i32 [[TT]], 2
 ; CHECK-NEXT:    [[TT2:%.*]] = add nuw nsw i32 [[TT1]], 1
@@ -151,9 +133,6 @@ define void @mat_vec_i32(i32** nocapture readonly %A, i32* nocapture readonly %B
 ; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ [[TT4]], [[FOR_COND1_PREHEADER_US]] ], [ [[TT12:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TT5:%.*]] = phi i32 [ [[START]], [[FOR_COND1_PREHEADER_US]] ], [ [[TT13:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = phi i32 [ [[N]], [[FOR_COND1_PREHEADER_US]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[INDEX]], i32 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3>
 ; CHECK-NEXT:    [[TT6:%.*]] = getelementptr inbounds i32, i32* [[TT3]], i32 [[INDEX]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[TMP0]])
 ; CHECK-NEXT:    [[TMP2]] = sub i32 [[TMP0]], 4
@@ -185,9 +164,6 @@ entry:
 for.cond1.preheader.us.preheader:                 ; preds = %entry
   %n.rnd.up = add i32 %N, 3
   %n.vec = and i32 %n.rnd.up, -4
-  %trip.count.minus.1 = add i32 %N, -1
-  %broadcast.splatinsert27 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
-  %broadcast.splat28 = shufflevector <4 x i32> %broadcast.splatinsert27, <4 x i32> undef, <4 x i32> zeroinitializer
   %tt = add i32 %n.vec, -4
   %tt1 = lshr i32 %tt, 2
   %tt2 = add nuw nsw i32 %tt1, 1
@@ -207,14 +183,8 @@ vector.body:                                      ; preds = %vector.body, %for.c
   %index = phi i32 [ 0, %for.cond1.preheader.us ], [ %index.next, %vector.body ]
   %vec.phi = phi <4 x i32> [ %tt4, %for.cond1.preheader.us ], [ %tt12, %vector.body ]
   %tt5 = phi i32 [ %start, %for.cond1.preheader.us ], [ %tt13, %vector.body ]
-  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
-  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
-  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
   %tt6 = getelementptr inbounds i32, i32* %tt3, i32 %index
-
-  ; %tt7 = icmp ule <4 x i32> %induction, %broadcast.splat28
   %tt7 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
-
   %tt8 = bitcast i32* %tt6 to <4 x i32>*
   %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tt8, i32 4, <4 x i1> %tt7, <4 x i32> undef)
   %tt9 = getelementptr inbounds i32, i32* %B, i32 %index
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll
index 778b50150128..7469a01500d9 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll
@@ -30,7 +30,6 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %N, 15
   %n.vec = and i32 %n.rnd.up, -16
-  %trip.count.minus.1 = add i32 %N, -1
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
@@ -100,7 +99,6 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %N, 7
   %n.vec = and i32 %n.rnd.up, -8
-  %trip.count.minus.1 = add i32 %N, -1
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
@@ -172,7 +170,6 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %N, 15
   %n.vec = and i32 %n.rnd.up, -16
-  %trip.count.minus.1 = add i32 %N, -1
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
@@ -242,7 +239,6 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %N, 7
   %n.vec = and i32 %n.rnd.up, -8
-  %trip.count.minus.1 = add i32 %N, -1
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
@@ -314,7 +310,6 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %N, 15
   %n.vec = and i32 %n.rnd.up, -16
-  %trip.count.minus.1 = add i32 %N, -1
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
@@ -384,7 +379,6 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %N, 7
   %n.vec = and i32 %n.rnd.up, -8
-  %trip.count.minus.1 = add i32 %N, -1
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
@@ -481,7 +475,6 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %N, 3
   %n.vec = and i32 %n.rnd.up, -4
-  %trip.count.minus.1 = add i32 %N, -1
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
@@ -510,7 +503,6 @@ middle.block:                                     ; preds = %vector.body
 vector.ph47:                                      ; preds = %middle.block
   %n.rnd.up48 = add i32 %N, 3
   %n.vec50 = and i32 %n.rnd.up48, -4
-  %trip.count.minus.154 = add i32 %N, -1
   %i11 = insertelement <4 x i32> <i32 undef, i32 0, i32 0, i32 0>, i32 %i10, i32 0
   br label %vector.body46
 
@@ -594,7 +586,6 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %N, 7
   %n.vec = and i32 %n.rnd.up, -8
-  %trip.count.minus.1 = add i32 %N, -1
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
@@ -719,7 +710,6 @@ lor.end:                                          ; preds = %entry, %lor.rhs
 vector.ph:                                        ; preds = %lor.end
   %n.rnd.up = add i32 %4, 3
   %n.vec = and i32 %n.rnd.up, -4
-  %trip.count.minus.1 = add i32 %4, -1
   %5 = insertelement <4 x i32> <i32 undef, i32 0, i32 0, i32 0>, i32 %0, i32 0
   br label %vector.body
 
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-basic.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-basic.ll
index 1492a01a272e..d10cbffe2dd2 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-basic.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-basic.ll
@@ -22,23 +22,14 @@ entry:
   br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
 
 vector.ph:                                        ; preds = %entry
-  %trip.count.minus.1 = add i32 %N, -1
-  %broadcast.splatinsert10 = insertelement <16 x i32> undef, i32 %trip.count.minus.1, i32 0
-  %broadcast.splat11 = shufflevector <16 x i32> %broadcast.splatinsert10, <16 x i32> undef, <16 x i32> zeroinitializer
   %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
   %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
-  %broadcast.splatinsert = insertelement <16 x i32> undef, i32 %index, i32 0
-  %broadcast.splat = shufflevector <16 x i32> %broadcast.splatinsert, <16 x i32> undef, <16 x i32> zeroinitializer
-  %induction = or <16 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   %tmp = getelementptr inbounds i8, i8* %a, i32 %index
-
-;  %tmp1 = icmp ule <16 x i32> %induction, %broadcast.splat11
   %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %N)
-
   %tmp2 = bitcast i8* %tmp to <16 x i8>*
   %wide.masked.load = tail call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %tmp2, i32 4, <16 x i1> %active.lane.mask, <16 x i8> undef)
   %tmp3 = getelementptr inbounds i8, i8* %b, i32 %index
@@ -79,23 +70,14 @@ entry:
   br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
 
 vector.ph:                                        ; preds = %entry
-  %trip.count.minus.1 = add i32 %N, -1
-  %broadcast.splatinsert10 = insertelement <8 x i32> undef, i32 %trip.count.minus.1, i32 0
-  %broadcast.splat11 = shufflevector <8 x i32> %broadcast.splatinsert10, <8 x i32> undef, <8 x i32> zeroinitializer
   %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
   %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
-  %broadcast.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0
-  %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer
-  %induction = add <8 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %tmp = getelementptr inbounds i16, i16* %a, i32 %index
-
-;  %tmp1 = icmp ule <8 x i32> %induction, %broadcast.splat11
   %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %N)
-
   %tmp2 = bitcast i16* %tmp to <8 x i16>*
   %wide.masked.load = tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp2, i32 4, <8 x i1> %active.lane.mask, <8 x i16> undef)
   %tmp3 = getelementptr inbounds i16, i16* %b, i32 %index
@@ -135,20 +117,13 @@ entry:
   br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
 
 vector.ph:                                        ; preds = %entry
-  %trip.count.minus.1 = add i32 %N, -1
-  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
-  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
   %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
   %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
-  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
-  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
-  %induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
   %tmp = getelementptr inbounds i32, i32* %a, i32 %index
- ; %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
   %tmp2 = bitcast i32* %tmp to <4 x i32>*
   %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
   %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
@@ -190,20 +165,13 @@ entry:
   br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
 
 vector.ph:                                        ; preds = %entry
-  %trip.count.minus.1 = add i32 %N, -1
-  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
-  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
   %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
   %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
-  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
-  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
-  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
   %tmp = getelementptr inbounds i32, i32* %a, i32 %index
-;  %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
   %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
   %tmp2 = bitcast i32* %tmp to <4 x i32>*
   %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
@@ -262,10 +230,7 @@ vector.body:                                      ; preds = %vector.body, %vecto
   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
   %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
   %tmp = getelementptr inbounds i32, i32* %a, i32 %index
-
-;  %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
   %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
-
   %wrong = icmp ult <4 x i32> %induction, %broadcast.splat11
   %tmp2 = bitcast i32* %tmp to <4 x i32>*
   %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
@@ -321,10 +286,7 @@ vector.body:                                      ; preds = %vector.body, %vecto
   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
   %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
   %tmp = getelementptr inbounds i32, i32* %a, i32 %index
-
-;  %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
   %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
-
   %wrong = icmp ult <4 x i32> %induction, %broadcast.splat11
   %tmp2 = bitcast i32* %tmp to <4 x i32>*
   %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
@@ -370,7 +332,6 @@ entry:
 
 
 vector.ph:
-  %trip.count.minus.1 = add i32 %N, -1
   %scevgep = getelementptr i32, i32* %A, i32 8
   %scevgep30 = getelementptr i32, i32* %C, i32 8
   %scevgep37 = getelementptr i32, i32* %B, i32 8
@@ -459,9 +420,7 @@ vector.body:                                      ; preds = %vector.body, %vecto
   %lsr.iv13 = bitcast i32* %lsr.iv to <4 x i32>*
   %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
   %lsr.iv1719 = bitcast i32* %lsr.iv17 to <4 x i32>*
-
   %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 42)
-
   %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv13, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
   %wide.masked.load12 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
   %7 = add nsw <4 x i32> %wide.masked.load12, %wide.masked.load
@@ -495,7 +454,6 @@ entry:
   br i1 %cmp8, label %vector.ph, label %for.cond.cleanup
 
 vector.ph:                                        ; preds = %entry
-  %trip.count.minus.1 = add i32 %N, -1
   %start = call i32 @llvm.start.loop.iterations.i32(i32 %5)
   br label %vector.body
 
@@ -509,9 +467,7 @@ vector.body:                                      ; preds = %vector.body, %vecto
   %lsr.iv13 = bitcast i32* %lsr.iv to <4 x i32>*
   %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
   %lsr.iv1719 = bitcast i32* %lsr.iv17 to <4 x i32>*
-
   %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %index)
-
   %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv13, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
   %wide.masked.load12 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
   %7 = add nsw <4 x i32> %wide.masked.load12, %wide.masked.load
@@ -546,7 +502,6 @@ entry:
   br i1 %cmp8, label %vector.ph, label %for.cond.cleanup
 
 vector.ph:                                        ; preds = %entry
-  %trip.count.minus.1 = add i32 %N, -1
   %start = call i32 @llvm.start.loop.iterations.i32(i32 %5)
   br label %vector.body
 
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-const.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-const.ll
index 4682f1d36f31..1c173e9dfd1d 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-const.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-const.ll
@@ -10,22 +10,17 @@ define dso_local void @foo(i32* noalias nocapture %A, i32* noalias nocapture rea
 ; CHECK-NEXT:    [[LSR_IV14:%.*]] = phi i32* [ [[SCEVGEP15:%.*]], [[VECTOR_BODY]] ], [ [[A:%.*]], [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    [[LSR_IV11:%.*]] = phi i32* [ [[SCEVGEP12:%.*]], [[VECTOR_BODY]] ], [ [[C:%.*]], [[ENTRY]] ]
 ; CHECK-NEXT:    [[LSR_IV:%.*]] = phi i32* [ [[SCEVGEP:%.*]], [[VECTOR_BODY]] ], [ [[B:%.*]], [[ENTRY]] ]
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = phi i32 [ [[START]], [[ENTRY]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP1:%.*]] = phi i32 [ 32003, [[ENTRY]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[LSR_IV1416:%.*]] = bitcast i32* [[LSR_IV14]] to <4 x i32>*
 ; CHECK-NEXT:    [[LSR_IV1113:%.*]] = bitcast i32* [[LSR_IV11]] to <4 x i32>*
 ; CHECK-NEXT:    [[LSR_IV10:%.*]] = bitcast i32* [[LSR_IV]] to <4 x i32>*
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[INDEX]], i32 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3>
 ; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[TMP1]])
 ; CHECK-NEXT:    [[TMP3]] = sub i32 [[TMP1]], 4
 ; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[LSR_IV10]], i32 4, <4 x i1> [[TMP2]], <4 x i32> undef)
 ; CHECK-NEXT:    [[WIDE_MASKED_LOAD9:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[LSR_IV1113]], i32 4, <4 x i1> [[TMP2]], <4 x i32> undef)
 ; CHECK-NEXT:    [[TMP4:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD9]], [[WIDE_MASKED_LOAD]]
 ; CHECK-NEXT:    call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[TMP4]], <4 x i32>* [[LSR_IV1416]], i32 4, <4 x i1> [[TMP2]])
-; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
 ; CHECK-NEXT:    [[SCEVGEP]] = getelementptr i32, i32* [[LSR_IV]], i32 4
 ; CHECK-NEXT:    [[SCEVGEP12]] = getelementptr i32, i32* [[LSR_IV11]], i32 4
 ; CHECK-NEXT:    [[SCEVGEP15]] = getelementptr i32, i32* [[LSR_IV14]], i32 4
@@ -48,13 +43,7 @@ vector.body:
   %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
   %lsr.iv1113 = bitcast i32* %lsr.iv11 to <4 x i32>*
   %lsr.iv10 = bitcast i32* %lsr.iv to <4 x i32>*
-  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
-  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
-  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
-
-  ; %1 = icmp ult <4 x i32> %induction, <i32 32002, i32 32002, i32 32002, i32 32002>
   %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 32003)
-
   %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv10, i32 4, <4 x i1> %1, <4 x i32> undef)
   %wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1113, i32 4, <4 x i1> %1, <4 x i32> undef)
   %2 = add nsw <4 x i32> %wide.masked.load9, %wide.masked.load
@@ -244,11 +233,7 @@ vector.body:
   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
   %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
-
-; Non-uniform constant vector here. This can't be represented with
-; @llvm.get.active.lane.mask, but let's keep this test as a sanity check:
   %1 = icmp ult <4 x i32> %induction, <i32 0, i32 3200, i32 32002, i32 32002>
-
   %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv10, i32 4, <4 x i1> %1, <4 x i32> undef)
   %wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1113, i32 4, <4 x i1> %1, <4 x i32> undef)
   %2 = add nsw <4 x i32> %wide.masked.load9, %wide.masked.load
@@ -285,13 +270,8 @@ vector.body:
   %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
   %lsr.iv1113 = bitcast i32* %lsr.iv11 to <4 x i32>*
   %lsr.iv10 = bitcast i32* %lsr.iv to <4 x i32>*
-  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
-  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
-  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
-
 ; BTC = UINT_MAX, and scalar trip count BTC + 1 would overflow:
   %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 4294967295)
-
   %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv10, i32 4, <4 x i1> %1, <4 x i32> undef)
   %wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1113, i32 4, <4 x i1> %1, <4 x i32> undef)
   %2 = add nsw <4 x i32> %wide.masked.load9, %wide.masked.load
@@ -328,12 +308,7 @@ vector.body:
   %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
   %lsr.iv1113 = bitcast i32* %lsr.iv11 to <4 x i32>*
   %lsr.iv10 = bitcast i32* %lsr.iv to <4 x i32>*
-  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
-  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
-  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
-
   %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 32003)
-
   %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv10, i32 4, <4 x i1> %1, <4 x i32> undef)
   %wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1113, i32 4, <4 x i1> %1, <4 x i32> undef)
   %2 = add nsw <4 x i32> %wide.masked.load9, %wide.masked.load
@@ -371,13 +346,8 @@ vector.body:
   %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
   %lsr.iv1113 = bitcast i32* %lsr.iv11 to <4 x i32>*
   %lsr.iv10 = bitcast i32* %lsr.iv to <4 x i32>*
-  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
-  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
-  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
-
 ; The induction variable %N is not an IV:
   %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %N, i32 32003)
-
   %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv10, i32 4, <4 x i1> %1, <4 x i32> undef)
   %wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1113, i32 4, <4 x i1> %1, <4 x i32> undef)
   %2 = add nsw <4 x i32> %wide.masked.load9, %wide.masked.load
@@ -414,12 +384,7 @@ vector.body:
   %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
   %lsr.iv1113 = bitcast i32* %lsr.iv11 to <4 x i32>*
   %lsr.iv10 = bitcast i32* %lsr.iv to <4 x i32>*
-  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
-  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
-  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
-
   %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 32003)
-
   %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv10, i32 4, <4 x i1> %1, <4 x i32> undef)
   %wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1113, i32 4, <4 x i1> %1, <4 x i32> undef)
   %2 = add nsw <4 x i32> %wide.masked.load9, %wide.masked.load
@@ -460,9 +425,6 @@ vector.body:
   %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
   %lsr.iv1113 = bitcast i32* %lsr.iv11 to <4 x i32>*
   %lsr.iv10 = bitcast i32* %lsr.iv to <4 x i32>*
-  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
-  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
-  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
   %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 32003)
   %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv10, i32 4, <4 x i1> %1, <4 x i32> undef)
   %wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1113, i32 4, <4 x i1> %1, <4 x i32> undef)
@@ -514,10 +476,8 @@ vector.body:                                      ; preds = %vector.body, %vecto
   %lsr.iv3840 = bitcast i32* %lsr.iv38 to <4 x i32>*
   %lsr.iv3335 = bitcast i32* %lsr.iv33 to <4 x i32>*
   %lsr.iv2830 = bitcast i32* %lsr.iv28 to <4 x i32>*
-
 ; It's using %j.025, the induction variable from its outer loop:
   %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %j.025, i32 4096)
-
   %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv3840, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
   %wide.masked.load27 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv3335, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
   %1 = add nsw <4 x i32> %wide.masked.load27, %wide.masked.load
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-disabled-in-loloops.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-disabled-in-loloops.ll
index a2361f518636..73865945cdc3 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-disabled-in-loloops.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-disabled-in-loloops.ll
@@ -82,7 +82,6 @@ entry:
   br i1 %cmp8, label %vector.ph, label %for.cond.cleanup
 
 vector.ph:                                        ; preds = %entry
-  %trip.count.minus.1 = add i32 %N, -1
   %start = call i32 @llvm.start.loop.iterations.i32(i32 %5)
   br label %vector.body
 
@@ -92,13 +91,10 @@ vector.body:                                      ; preds = %vector.body, %vecto
   %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %vector.ph ]
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
   %6 = phi i32 [ %start, %vector.ph ], [ %8, %vector.body ]
-
   %lsr.iv13 = bitcast i32* %lsr.iv to <4 x i32>*
   %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
   %lsr.iv1719 = bitcast i32* %lsr.iv17 to <4 x i32>*
-
   %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
-
   %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv13, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
   %wide.masked.load12 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
   %7 = add nsw <4 x i32> %wide.masked.load12, %wide.masked.load
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-intrinsic-add-sat.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-intrinsic-add-sat.ll
index c0b2a036f371..14f1d0c00204 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-intrinsic-add-sat.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-intrinsic-add-sat.ll
@@ -27,7 +27,6 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %blockSize, 7
   %n.vec = and i32 %n.rnd.up, -8
-  %trip.count.minus.1 = add i32 %blockSize, -1
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
@@ -77,7 +76,6 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %blockSize, 7
   %n.vec = and i32 %n.rnd.up, -8
-  %trip.count.minus.1 = add i32 %blockSize, -1
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-intrinsic-fabs.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-intrinsic-fabs.ll
index 5ad6d9112308..66216022d647 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-intrinsic-fabs.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-intrinsic-fabs.ll
@@ -26,7 +26,6 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %blockSize, 3
   %n.vec = and i32 %n.rnd.up, -4
-  %trip.count.minus.1 = add i32 %blockSize, -1
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-intrinsic-round.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-intrinsic-round.ll
index bd927fdcf859..024857b65802 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-intrinsic-round.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-intrinsic-round.ll
@@ -26,7 +26,6 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %n, 3
   %n.vec = and i32 %n.rnd.up, -4
-  %trip.count.minus.1 = add i32 %n, -1
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
@@ -72,7 +71,6 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %n, 3
   %n.vec = and i32 %n.rnd.up, -4
-  %trip.count.minus.1 = add i32 %n, -1
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
@@ -118,7 +116,6 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %n, 3
   %n.vec = and i32 %n.rnd.up, -4
-  %trip.count.minus.1 = add i32 %n, -1
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
@@ -164,7 +161,6 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %n, 3
   %n.vec = and i32 %n.rnd.up, -4
-  %trip.count.minus.1 = add i32 %n, -1
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
@@ -210,7 +206,6 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %n, 3
   %n.vec = and i32 %n.rnd.up, -4
-  %trip.count.minus.1 = add i32 %n, -1
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
@@ -260,7 +255,6 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %n, 3
   %n.vec = and i32 %n.rnd.up, -4
-  %trip.count.minus.1 = add i32 %n, -1
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-intrinsic-sub-sat.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-intrinsic-sub-sat.ll
index 98d48d49539c..0e51661e8f58 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-intrinsic-sub-sat.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-intrinsic-sub-sat.ll
@@ -27,7 +27,6 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %blockSize, 7
   %n.vec = and i32 %n.rnd.up, -8
-  %trip.count.minus.1 = add i32 %blockSize, -1
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
@@ -77,7 +76,6 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %blockSize, 7
   %n.vec = and i32 %n.rnd.up, -8
-  %trip.count.minus.1 = add i32 %blockSize, -1
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-reduce.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-reduce.ll
index ef79f27ce5dc..01bbd3ac28e2 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-reduce.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-reduce.ll
@@ -18,8 +18,6 @@ vector.ph:
   %tmp = add i32 %N, -1
   %n.rnd.up = add i32 %tmp, 8
   %n.vec = and i32 %n.rnd.up, -8
-  %broadcast.splatinsert1 = insertelement <8 x i32> undef, i32 %tmp, i32 0
-  %broadcast.splat2 = shufflevector <8 x i32> %broadcast.splatinsert1, <8 x i32> undef, <8 x i32> zeroinitializer
   %0 = add i32 %n.vec, -8
   %1 = lshr i32 %0, 3
   %2 = add i32 %1, 1
@@ -30,14 +28,8 @@ vector.body:                                      ; preds = %vector.body, %vecto
   %index = phi i32 [ 0, %vector.ph], [ %index.next, %vector.body ]
   %vec.phi = phi <8 x i16> [ zeroinitializer, %vector.ph], [ %tmp8, %vector.body ]
   %3 = phi i32 [ %start, %vector.ph], [ %4, %vector.body ]
-  %broadcast.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0
-  %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer
-  %induction = add <8 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %tmp2 = getelementptr inbounds i16, i16* %A, i32 %index
-
-  ; %tmp3 = icmp ule <8 x i32> %induction, %broadcast.splat2
   %tmp3 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %N)
-
   %tmp4 = bitcast i16* %tmp2 to <8 x i16>*
   %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp4, i32 4, <8 x i1> %tmp3, <8 x i16> undef)
   %tmp5 = getelementptr inbounds i16, i16* %B, i32 %index
@@ -87,8 +79,6 @@ vector.ph:
   %tmp = add i32 %N, -1
   %n.rnd.up = add nuw nsw i32 %tmp, 8
   %n.vec = and i32 %n.rnd.up, -8
-  %broadcast.splatinsert1 = insertelement <8 x i32> undef, i32 %tmp, i32 0
-  %broadcast.splat2 = shufflevector <8 x i32> %broadcast.splatinsert1, <8 x i32> undef, <8 x i32> zeroinitializer
   %broadcast.splatinsert3 = insertelement <8 x i16> undef, i16 %B, i32 0
   %broadcast.splat4 = shufflevector <8 x i16> %broadcast.splatinsert3, <8 x i16> undef, <8 x i32> zeroinitializer
   %0 = add i32 %n.vec, -8
@@ -101,14 +91,8 @@ vector.body:                                      ; preds = %vector.body, %vecto
   %index = phi i32 [ 0, %vector.ph], [ %index.next, %vector.body ]
   %vec.phi = phi <8 x i16> [ zeroinitializer, %vector.ph], [ %tmp6, %vector.body ]
   %3 = phi i32 [ %start, %vector.ph], [ %4, %vector.body ]
-  %broadcast.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0
-  %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer
-  %induction = add <8 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %tmp2 = getelementptr inbounds i16, i16* %A, i32 %index
-
-  ; %tmp3 = icmp ule <8 x i32> %induction, %broadcast.splat2
   %tmp3 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %N)
-
   %tmp4 = bitcast i16* %tmp2 to <8 x i16>*
   %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp4, i32 4, <8 x i1> %tmp3, <8 x i16> undef)
   %tmp5 = add <8 x i16> %vec.phi, %broadcast.splat4
@@ -151,8 +135,6 @@ entry:
   %tmp = add i32 %N, -1
   %n.rnd.up = add nuw nsw i32 %tmp, 8
   %n.vec = and i32 %n.rnd.up, -8
-  %broadcast.splatinsert1 = insertelement <8 x i32> undef, i32 %tmp, i32 0
-  %broadcast.splat2 = shufflevector <8 x i32> %broadcast.splatinsert1, <8 x i32> undef, <8 x i32> zeroinitializer
   %broadcast.splatinsert3 = insertelement <8 x i16> undef, i16 %B, i32 0
   %broadcast.splat4 = shufflevector <8 x i16> %broadcast.splatinsert3, <8 x i16> undef, <8 x i32> zeroinitializer
   %0 = add i32 %n.vec, -8
@@ -165,14 +147,8 @@ vector.body:                                      ; preds = %vector.body, %vecto
   %index = phi i32 [ 0, %entry], [ %index.next, %vector.body ]
   %vec.phi = phi <8 x i16> [ zeroinitializer, %entry], [ %tmp6, %vector.body ]
   %3 = phi i32 [ %start, %entry ], [ %4, %vector.body ]
-  %broadcast.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0
-  %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer
-  %induction = add <8 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %tmp2 = getelementptr inbounds i16, i16* %A, i32 %index
-
-  ; %tmp3 = icmp ule <8 x i32> %induction, %broadcast.splat2
   %tmp3 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %N)
-
   %tmp4 = bitcast i16* %tmp2 to <8 x i16>*
   %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp4, i32 4, <8 x i1> %tmp3, <8 x i16> undef)
   %tmp5 = add <8 x i16> %vec.phi, %broadcast.splat4
@@ -227,7 +203,6 @@ for.body:
   br i1 %cmp433, label %vector.ph, label %for.end
 
 vector.ph:                                        ; preds = %for.body
-  %trip.count.minus.1 = add i32 %8, -1
   %start = call i32 @llvm.start.loop.iterations.i32(i32 %7)
   br label %vector.body
 
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-widen.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-widen.ll
index 939d3cc5e558..c9b2905755ed 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-widen.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-widen.ll
@@ -14,23 +14,14 @@ entry:
   br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
 
 vector.ph:                                        ; preds = %entry
-  %trip.count.minus.1 = add i32 %N, -1
-  %broadcast.splatinsert10 = insertelement <8 x i32> undef, i32 %trip.count.minus.1, i32 0
-  %broadcast.splat11 = shufflevector <8 x i32> %broadcast.splatinsert10, <8 x i32> undef, <8 x i32> zeroinitializer
   %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
   %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
-  %broadcast.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0
-  %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer
-  %induction = add <8 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %tmp = getelementptr inbounds i16, i16* %a, i32 %index
-
-  ; %tmp1 = icmp ule <8 x i32> %induction, %broadcast.splat11
-   %tmp1 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %N)
-
+  %tmp1 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %N)
   %tmp2 = bitcast i16* %tmp to <8 x i16>*
   %wide.masked.load = tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp2, i32 4, <8 x i1> %tmp1, <8 x i16> undef)
   %tmp3 = getelementptr inbounds i16, i16* %b, i32 %index
@@ -72,8 +63,6 @@ entry:
 
 vector.ph:                                        ; preds = %entry
   %trip.count.minus.1 = add i32 %N, -1
-  %broadcast.splatinsert10 = insertelement <8 x i32> undef, i32 %trip.count.minus.1, i32 0
-  %broadcast.splat11 = shufflevector <8 x i32> %broadcast.splatinsert10, <8 x i32> undef, <8 x i32> zeroinitializer
   %broadcast.splatinsert10.store = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
   %broadcast.splat11.store = shufflevector <4 x i32> %broadcast.splatinsert10.store, <4 x i32> undef, <4 x i32> zeroinitializer
   %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
@@ -83,14 +72,8 @@ vector.body:                                      ; preds = %vector.body, %vecto
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
   %store.idx = phi i32 [ 0, %vector.ph ], [ %store.idx.next, %vector.body ]
   %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
-  %broadcast.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0
-  %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer
-  %induction = add <8 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %tmp = getelementptr inbounds i16, i16* %a, i32 %index
-
-  ; %tmp1 = icmp ule <8 x i32> %induction, %broadcast.splat11
   %tmp1 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %N)
-
   %tmp2 = bitcast i16* %tmp to <8 x i16>*
   %wide.masked.load = tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp2, i32 4, <8 x i1> %tmp1, <8 x i16> undef)
   %tmp3 = getelementptr inbounds i16, i16* %b, i32 %index
@@ -136,23 +119,14 @@ entry:
   br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
 
 vector.ph:                                        ; preds = %entry
-  %trip.count.minus.1 = add i32 %N, -1
-  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
-  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
   %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
   %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
-  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
-  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
-  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
   %tmp = getelementptr inbounds i32, i32* %a, i32 %index
-
-  ; %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
   %tmp1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
-
   %tmp2 = bitcast i32* %tmp to <4 x i32>*
   %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
   %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll
index 1ea183d4a5ff..af5c76fd4477 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll
@@ -183,7 +183,6 @@ for.body:                                         ; preds = %for.end, %for.body.
   br i1 %cmp433, label %vector.ph, label %for.end
 
 vector.ph:                                        ; preds = %for.body
-  %trip.count.minus.1 = add i32 %i8, -1
   %start = call i32 @llvm.start.loop.iterations.i32(i32 %i7)
   br label %vector.body
 
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll
index 6b4113ea4bb5..02f65d240c06 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll
@@ -11,27 +11,25 @@ define dso_local i32 @mul_reduce_add(i32* noalias nocapture readonly %a, i32* no
 ; CHECK-NEXT:  .LBB0_1: @ %vector.ph
 ; CHECK-NEXT:    push {r7, lr}
 ; CHECK-NEXT:    adds r3, r2, #3
-; CHECK-NEXT:    vmov.i32 q0, #0x0
+; CHECK-NEXT:    vmov.i32 q1, #0x0
 ; CHECK-NEXT:    bic r3, r3, #3
 ; CHECK-NEXT:    sub.w r12, r3, #4
 ; CHECK-NEXT:    movs r3, #1
 ; CHECK-NEXT:    add.w lr, r3, r12, lsr #2
-; CHECK-NEXT:    movs r3, #0
 ; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB0_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vctp.32 r2
-; CHECK-NEXT:    vmov q1, q0
+; CHECK-NEXT:    vmov q0, q1
 ; CHECK-NEXT:    vpstt
-; CHECK-NEXT:    vldrwt.u32 q0, [r0], #16
+; CHECK-NEXT:    vldrwt.u32 q1, [r0], #16
 ; CHECK-NEXT:    vldrwt.u32 q2, [r1], #16
-; CHECK-NEXT:    adds r3, #4
-; CHECK-NEXT:    vmul.i32 q0, q2, q0
 ; CHECK-NEXT:    subs r2, #4
-; CHECK-NEXT:    vadd.i32 q0, q0, q1
+; CHECK-NEXT:    vmul.i32 q1, q2, q1
+; CHECK-NEXT:    vadd.i32 q1, q1, q0
 ; CHECK-NEXT:    le lr, .LBB0_2
 ; CHECK-NEXT:  @ %bb.3: @ %middle.block
-; CHECK-NEXT:    vpsel q0, q0, q1
+; CHECK-NEXT:    vpsel q0, q1, q0
 ; CHECK-NEXT:    vaddv.u32 r0, q0
 ; CHECK-NEXT:    pop {r7, pc}
 entry:
@@ -41,22 +39,13 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %N, 3
   %n.vec = and i32 %n.rnd.up, -4
-  %trip.count.minus.1 = add i32 %N, -1
-  %broadcast.splatinsert11 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
-  %broadcast.splat12 = shufflevector <4 x i32> %broadcast.splatinsert11, <4 x i32> undef, <4 x i32> zeroinitializer
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
   %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %6, %vector.body ]
-  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
-  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
-  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
   %0 = getelementptr inbounds i32, i32* %a, i32 %index
-
-  ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat12
   %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
-
   %2 = bitcast i32* %0 to <4 x i32>*
   %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %2, i32 4, <4 x i1> %1, <4 x i32> undef)
   %3 = getelementptr inbounds i32, i32* %b, i32 %index
@@ -93,7 +82,6 @@ define dso_local i32 @mul_reduce_add_const(i32* noalias nocapture readonly %a, i
 ; CHECK-NEXT:    vmov.i32 q0, #0x0
 ; CHECK-NEXT:    subs r1, #4
 ; CHECK-NEXT:    add.w lr, r3, r1, lsr #2
-; CHECK-NEXT:    movs r1, #0
 ; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB1_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
@@ -101,7 +89,6 @@ define dso_local i32 @mul_reduce_add_const(i32* noalias nocapture readonly %a, i
 ; CHECK-NEXT:    vmov q1, q0
 ; CHECK-NEXT:    vpst
 ; CHECK-NEXT:    vldrwt.u32 q0, [r0], #16
-; CHECK-NEXT:    adds r1, #4
 ; CHECK-NEXT:    subs r2, #4
 ; CHECK-NEXT:    vadd.i32 q0, q0, q1
 ; CHECK-NEXT:    le lr, .LBB1_2
@@ -116,22 +103,13 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %N, 3
   %n.vec = and i32 %n.rnd.up, -4
-  %trip.count.minus.1 = add i32 %N, -1
-  %broadcast.splatinsert9 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
-  %broadcast.splat10 = shufflevector <4 x i32> %broadcast.splatinsert9, <4 x i32> undef, <4 x i32> zeroinitializer
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
   %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %3, %vector.body ]
-  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
-  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
-  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
   %0 = getelementptr inbounds i32, i32* %a, i32 %index
-
-  ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat10
   %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
-
   %2 = bitcast i32* %0 to <4 x i32>*
   %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %2, i32 4, <4 x i1> %1, <4 x i32> undef)
   %3 = add nsw <4 x i32> %wide.masked.load, %vec.phi
@@ -164,7 +142,6 @@ define dso_local i32 @add_reduce_add_const(i32* noalias nocapture readonly %a, i
 ; CHECK-NEXT:    vmov.i32 q0, #0x0
 ; CHECK-NEXT:    subs r1, #4
 ; CHECK-NEXT:    add.w lr, r3, r1, lsr #2
-; CHECK-NEXT:    movs r1, #0
 ; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB2_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
@@ -172,7 +149,6 @@ define dso_local i32 @add_reduce_add_const(i32* noalias nocapture readonly %a, i
 ; CHECK-NEXT:    vmov q1, q0
 ; CHECK-NEXT:    vpst
 ; CHECK-NEXT:    vldrwt.u32 q0, [r0], #16
-; CHECK-NEXT:    adds r1, #4
 ; CHECK-NEXT:    subs r2, #4
 ; CHECK-NEXT:    vadd.i32 q0, q0, q1
 ; CHECK-NEXT:    le lr, .LBB2_2
@@ -187,22 +163,13 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %N, 3
   %n.vec = and i32 %n.rnd.up, -4
-  %trip.count.minus.1 = add i32 %N, -1
-  %broadcast.splatinsert9 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
-  %broadcast.splat10 = shufflevector <4 x i32> %broadcast.splatinsert9, <4 x i32> undef, <4 x i32> zeroinitializer
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
   %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %3, %vector.body ]
-  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
-  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
-  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
   %0 = getelementptr inbounds i32, i32* %a, i32 %index
-
-  ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat10
   %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
-
   %2 = bitcast i32* %0 to <4 x i32>*
   %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %2, i32 4, <4 x i1> %1, <4 x i32> undef)
   %3 = add nsw <4 x i32> %wide.masked.load, %vec.phi
@@ -228,11 +195,9 @@ define dso_local void @vector_mul_const(i32* noalias nocapture %a, i32* noalias
 ; CHECK-NEXT:    it eq
 ; CHECK-NEXT:    popeq {r7, pc}
 ; CHECK-NEXT:  .LBB3_1: @ %vector.ph
-; CHECK-NEXT:    mov.w r12, #0
 ; CHECK-NEXT:    dlstp.32 lr, r3
 ; CHECK-NEXT:  .LBB3_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    add.w r12, r12, #4
 ; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
 ; CHECK-NEXT:    vmul.i32 q0, q0, r2
 ; CHECK-NEXT:    vstrw.32 q0, [r0], #16
@@ -246,23 +211,14 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %N, 3
   %n.vec = and i32 %n.rnd.up, -4
-  %trip.count.minus.1 = add i32 %N, -1
-  %broadcast.splatinsert8 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
-  %broadcast.splat9 = shufflevector <4 x i32> %broadcast.splatinsert8, <4 x i32> undef, <4 x i32> zeroinitializer
   %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %c, i32 0
   %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
-  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
-  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
   %0 = getelementptr inbounds i32, i32* %b, i32 %index
-
-  ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat9
   %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
-
   %2 = bitcast i32* %0 to <4 x i32>*
   %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %2, i32 4, <4 x i1> %1, <4 x i32> undef)
   %3 = mul nsw <4 x i32> %wide.masked.load, %broadcast.splat11
@@ -285,11 +241,9 @@ define dso_local void @vector_add_const(i32* noalias nocapture %a, i32* noalias
 ; CHECK-NEXT:    it eq
 ; CHECK-NEXT:    popeq {r7, pc}
 ; CHECK-NEXT:  .LBB4_1: @ %vector.ph
-; CHECK-NEXT:    mov.w r12, #0
 ; CHECK-NEXT:    dlstp.32 lr, r3
 ; CHECK-NEXT:  .LBB4_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    add.w r12, r12, #4
 ; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
 ; CHECK-NEXT:    vadd.i32 q0, q0, r2
 ; CHECK-NEXT:    vstrw.32 q0, [r0], #16
@@ -303,23 +257,14 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %N, 3
   %n.vec = and i32 %n.rnd.up, -4
-  %trip.count.minus.1 = add i32 %N, -1
-  %broadcast.splatinsert8 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
-  %broadcast.splat9 = shufflevector <4 x i32> %broadcast.splatinsert8, <4 x i32> undef, <4 x i32> zeroinitializer
   %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %c, i32 0
   %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
-  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
-  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
   %0 = getelementptr inbounds i32, i32* %b, i32 %index
-
-  ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat9
   %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
-
   %2 = bitcast i32* %0 to <4 x i32>*
   %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %2, i32 4, <4 x i1> %1, <4 x i32> undef)
   %3 = add nsw <4 x i32> %wide.masked.load, %broadcast.splat11
@@ -342,11 +287,9 @@ define dso_local arm_aapcs_vfpcc void @vector_mul_vector_i8(i8* noalias nocaptur
 ; CHECK-NEXT:    it eq
 ; CHECK-NEXT:    popeq {r7, pc}
 ; CHECK-NEXT:  .LBB5_1: @ %vector.ph
-; CHECK-NEXT:    mov.w r12, #0
 ; CHECK-NEXT:    dlstp.8 lr, r3
 ; CHECK-NEXT:  .LBB5_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    add.w r12, r12, #16
 ; CHECK-NEXT:    vldrb.u8 q0, [r1], #16
 ; CHECK-NEXT:    vldrb.u8 q1, [r2], #16
 ; CHECK-NEXT:    vmul.i8 q0, q1, q0
@@ -361,21 +304,12 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %N, 15
   %n.vec = and i32 %n.rnd.up, -16
-  %trip.count.minus.1 = add i32 %N, -1
-  %broadcast.splatinsert12 = insertelement <16 x i32> undef, i32 %trip.count.minus.1, i32 0
-  %broadcast.splat13 = shufflevector <16 x i32> %broadcast.splatinsert12, <16 x i32> undef, <16 x i32> zeroinitializer
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %broadcast.splatinsert = insertelement <16 x i32> undef, i32 %index, i32 0
-  %broadcast.splat = shufflevector <16 x i32> %broadcast.splatinsert, <16 x i32> undef, <16 x i32> zeroinitializer
-  %induction = add <16 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   %0 = getelementptr inbounds i8, i8* %b, i32 %index
-
-  ; %1 = icmp ule <16 x i32> %induction, %broadcast.splat13
   %1 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %N)
-
   %2 = bitcast i8* %0 to <16 x i8>*
   %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %2, i32 1, <16 x i1> %1, <16 x i8> undef)
   %3 = getelementptr inbounds i8, i8* %c, i32 %index
@@ -402,11 +336,9 @@ define dso_local arm_aapcs_vfpcc void @vector_mul_vector_i16(i16* noalias nocapt
 ; CHECK-NEXT:    it eq
 ; CHECK-NEXT:    popeq {r7, pc}
 ; CHECK-NEXT:  .LBB6_1: @ %vector.ph
-; CHECK-NEXT:    mov.w r12, #0
 ; CHECK-NEXT:    dlstp.16 lr, r3
 ; CHECK-NEXT:  .LBB6_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    add.w r12, r12, #8
 ; CHECK-NEXT:    vldrh.u16 q0, [r1], #16
 ; CHECK-NEXT:    vldrh.u16 q1, [r2], #16
 ; CHECK-NEXT:    vmul.i16 q0, q1, q0
@@ -421,21 +353,12 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %N, 7
   %n.vec = and i32 %n.rnd.up, -8
-  %trip.count.minus.1 = add i32 %N, -1
-  %broadcast.splatinsert12 = insertelement <8 x i32> undef, i32 %trip.count.minus.1, i32 0
-  %broadcast.splat13 = shufflevector <8 x i32> %broadcast.splatinsert12, <8 x i32> undef, <8 x i32> zeroinitializer
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %broadcast.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0
-  %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer
-  %induction = add <8 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %0 = getelementptr inbounds i16, i16* %b, i32 %index
-
-  ; %1 = icmp ule <8 x i32> %induction, %broadcast.splat13
   %1 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %N)
-
   %2 = bitcast i16* %0 to <8 x i16>*
   %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %2, i32 2, <8 x i1> %1, <8 x i16> undef)
   %3 = getelementptr inbounds i16, i16* %c, i32 %index
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll
index e8da32611be2..ec6a7554b3e6 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll
@@ -30,9 +30,6 @@ entry:
   br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
 
 vector.ph:                                        ; preds = %entry
-  %trip.count.minus.1 = add i32 %N, -1
-  %broadcast.splatinsert11 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
-  %broadcast.splat12 = shufflevector <4 x i32> %broadcast.splatinsert11, <4 x i32> undef, <4 x i32> zeroinitializer
   %start = call i32 @llvm.start.loop.iterations.i32(i32 %5)
   br label %vector.body
 
@@ -44,13 +41,7 @@ vector.body:                                      ; preds = %vector.body, %vecto
   %6 = phi i32 [ %start, %vector.ph ], [ %10, %vector.body ]
   %lsr.iv24 = bitcast i32* %lsr.iv2 to <4 x i32>*
   %lsr.iv1 = bitcast i32* %lsr.iv to <4 x i32>*
-  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
-  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
-  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
-
-  ; %7 = icmp ule <4 x i32> %induction, %broadcast.splat12
   %7 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
-
   %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv24, i32 4, <4 x i1> %7, <4 x i32> undef)
   %wide.masked.load13 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1, i32 4, <4 x i1> %7, <4 x i32> undef)
   %8 = mul nsw <4 x i32> %wide.masked.load13, %wide.masked.load
diff --git a/llvm/test/CodeGen/Thumb2/mve-fma-loops.ll b/llvm/test/CodeGen/Thumb2/mve-fma-loops.ll
index 7609c16ea84c..a34a278103a0 100644
--- a/llvm/test/CodeGen/Thumb2/mve-fma-loops.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-fma-loops.ll
@@ -11,11 +11,9 @@ define arm_aapcs_vfpcc void @fmas1(float* nocapture readonly %x, float* nocaptur
 ; CHECK-NEXT:    poplt {r4, pc}
 ; CHECK-NEXT:  .LBB0_1: @ %vector.ph
 ; CHECK-NEXT:    vmov r12, s0
-; CHECK-NEXT:    movs r4, #0
 ; CHECK-NEXT:    dlstp.32 lr, r3
 ; CHECK-NEXT:  .LBB0_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    adds r4, #4
 ; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
 ; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
 ; CHECK-NEXT:    vfmas.f32 q1, q0, r12
@@ -30,23 +28,14 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %n, 3
   %n.vec = and i32 %n.rnd.up, -4
-  %trip.count.minus.1 = add i32 %n, -1
-  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
-  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
   %broadcast.splatinsert13 = insertelement <4 x float> undef, float %a, i32 0
   %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
-  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
-  %induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
   %0 = getelementptr inbounds float, float* %x, i32 %index
-
-  ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
   %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
-
   %2 = bitcast float* %0 to <4 x float>*
   %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
   %3 = getelementptr inbounds float, float* %y, i32 %index
@@ -74,11 +63,9 @@ define arm_aapcs_vfpcc void @fmas2(float* nocapture readonly %x, float* nocaptur
 ; CHECK-NEXT:    poplt {r4, pc}
 ; CHECK-NEXT:  .LBB1_1: @ %vector.ph
 ; CHECK-NEXT:    vmov r12, s0
-; CHECK-NEXT:    movs r4, #0
 ; CHECK-NEXT:    dlstp.32 lr, r3
 ; CHECK-NEXT:  .LBB1_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    adds r4, #4
 ; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
 ; CHECK-NEXT:    vldrw.u32 q1, [r1], #16
 ; CHECK-NEXT:    vfmas.f32 q1, q0, r12
@@ -93,23 +80,14 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %n, 3
   %n.vec = and i32 %n.rnd.up, -4
-  %trip.count.minus.1 = add i32 %n, -1
-  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
-  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
   %broadcast.splatinsert13 = insertelement <4 x float> undef, float %a, i32 0
   %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
-  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
-  %induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
   %0 = getelementptr inbounds float, float* %x, i32 %index
-
-  ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
   %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
-
   %2 = bitcast float* %0 to <4 x float>*
   %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
   %3 = getelementptr inbounds float, float* %y, i32 %index
@@ -138,11 +116,9 @@ define arm_aapcs_vfpcc void @fma1(float* nocapture readonly %x, float* nocapture
 ; CHECK-NEXT:    poplt {r4, pc}
 ; CHECK-NEXT:  .LBB2_1: @ %vector.ph
 ; CHECK-NEXT:    vmov r12, s0
-; CHECK-NEXT:    movs r4, #0
 ; CHECK-NEXT:    dlstp.32 lr, r3
 ; CHECK-NEXT:  .LBB2_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    adds r4, #4
 ; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
 ; CHECK-NEXT:    vldrw.u32 q1, [r1], #16
 ; CHECK-NEXT:    vfma.f32 q1, q0, r12
@@ -157,23 +133,14 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %n, 3
   %n.vec = and i32 %n.rnd.up, -4
-  %trip.count.minus.1 = add i32 %n, -1
-  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
-  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
   %broadcast.splatinsert13 = insertelement <4 x float> undef, float %a, i32 0
   %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
-  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
-  %induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
   %0 = getelementptr inbounds float, float* %x, i32 %index
-
-  ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
   %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
-
   %2 = bitcast float* %0 to <4 x float>*
   %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
   %3 = getelementptr inbounds float, float* %y, i32 %index
@@ -201,11 +168,9 @@ define arm_aapcs_vfpcc void @fma2(float* nocapture readonly %x, float* nocapture
 ; CHECK-NEXT:    poplt {r4, pc}
 ; CHECK-NEXT:  .LBB3_1: @ %vector.ph
 ; CHECK-NEXT:    vmov r12, s0
-; CHECK-NEXT:    movs r4, #0
 ; CHECK-NEXT:    dlstp.32 lr, r3
 ; CHECK-NEXT:  .LBB3_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    adds r4, #4
 ; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
 ; CHECK-NEXT:    vldrw.u32 q1, [r1], #16
 ; CHECK-NEXT:    vfma.f32 q1, q0, r12
@@ -220,23 +185,14 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %n, 3
   %n.vec = and i32 %n.rnd.up, -4
-  %trip.count.minus.1 = add i32 %n, -1
-  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
-  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
   %broadcast.splatinsert12 = insertelement <4 x float> undef, float %a, i32 0
   %broadcast.splat13 = shufflevector <4 x float> %broadcast.splatinsert12, <4 x float> undef, <4 x i32> zeroinitializer
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
-  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
-  %induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
   %0 = getelementptr inbounds float, float* %x, i32 %index
-
-  ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
   %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
-
   %2 = bitcast float* %0 to <4 x float>*
   %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
   %3 = fmul fast <4 x float> %wide.masked.load, %broadcast.splat13
@@ -265,12 +221,10 @@ define arm_aapcs_vfpcc void @fmss1(float* nocapture readonly %x, float* nocaptur
 ; CHECK-NEXT:    poplt {r4, pc}
 ; CHECK-NEXT:  .LBB4_1: @ %vector.ph
 ; CHECK-NEXT:    vmov r12, s0
-; CHECK-NEXT:    movs r4, #0
 ; CHECK-NEXT:    eor r12, r12, #-2147483648
 ; CHECK-NEXT:    dlstp.32 lr, r3
 ; CHECK-NEXT:  .LBB4_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    adds r4, #4
 ; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
 ; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
 ; CHECK-NEXT:    vfmas.f32 q1, q0, r12
@@ -286,23 +240,14 @@ vector.ph:                                        ; preds = %entry
   %fneg = fneg fast float %a
   %n.rnd.up = add i32 %n, 3
   %n.vec = and i32 %n.rnd.up, -4
-  %trip.count.minus.1 = add i32 %n, -1
-  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
-  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
   %broadcast.splatinsert13 = insertelement <4 x float> undef, float %fneg, i32 0
   %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
-  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
-  %induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
   %0 = getelementptr inbounds float, float* %x, i32 %index
-
-  ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
   %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
-
   %2 = bitcast float* %0 to <4 x float>*
   %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
   %3 = getelementptr inbounds float, float* %y, i32 %index
@@ -329,14 +274,12 @@ define arm_aapcs_vfpcc void @fmss2(float* nocapture readonly %x, float* nocaptur
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    poplt {r4, pc}
 ; CHECK-NEXT:  .LBB5_1: @ %vector.ph
-; CHECK-NEXT:    vmov lr, s0
-; CHECK-NEXT:    vdup.32 q0, lr
+; CHECK-NEXT:    vmov r12, s0
+; CHECK-NEXT:    vdup.32 q0, r12
 ; CHECK-NEXT:    vneg.f32 q0, q0
-; CHECK-NEXT:    mov.w r12, #0
 ; CHECK-NEXT:    dlstp.32 lr, r3
 ; CHECK-NEXT:  .LBB5_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    add.w r12, r12, #4
 ; CHECK-NEXT:    vmov q3, q0
 ; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
 ; CHECK-NEXT:    vldrw.u32 q2, [r1], #16
@@ -352,23 +295,14 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %n, 3
   %n.vec = and i32 %n.rnd.up, -4
-  %trip.count.minus.1 = add i32 %n, -1
-  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
-  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
   %broadcast.splatinsert13 = insertelement <4 x float> undef, float %a, i32 0
   %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
-  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
-  %induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
   %0 = getelementptr inbounds float, float* %x, i32 %index
-
-  ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
   %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
-
   %2 = bitcast float* %0 to <4 x float>*
   %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
   %3 = getelementptr inbounds float, float* %y, i32 %index
@@ -398,11 +332,9 @@ define arm_aapcs_vfpcc void @fmss3(float* nocapture readonly %x, float* nocaptur
 ; CHECK-NEXT:  .LBB6_1: @ %vector.ph
 ; CHECK-NEXT:    vmov r4, s0
 ; CHECK-NEXT:    vdup.32 q0, r4
-; CHECK-NEXT:    mov.w r12, #0
 ; CHECK-NEXT:    dlstp.32 lr, r3
 ; CHECK-NEXT:  .LBB6_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    add.w r12, r12, #4
 ; CHECK-NEXT:    vmov q3, q0
 ; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
 ; CHECK-NEXT:    vldrw.u32 q2, [r1], #16
@@ -418,23 +350,14 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %n, 3
   %n.vec = and i32 %n.rnd.up, -4
-  %trip.count.minus.1 = add i32 %n, -1
-  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
-  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
   %broadcast.splatinsert13 = insertelement <4 x float> undef, float %a, i32 0
   %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
-  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
-  %induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
   %0 = getelementptr inbounds float, float* %x, i32 %index
-
-  ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
   %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
-
   %2 = bitcast float* %0 to <4 x float>*
   %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
   %3 = getelementptr inbounds float, float* %y, i32 %index
@@ -464,11 +387,9 @@ define arm_aapcs_vfpcc void @fmss4(float* nocapture readonly %x, float* nocaptur
 ; CHECK-NEXT:  .LBB7_1: @ %vector.ph
 ; CHECK-NEXT:    vmov r4, s0
 ; CHECK-NEXT:    vdup.32 q0, r4
-; CHECK-NEXT:    mov.w r12, #0
 ; CHECK-NEXT:    dlstp.32 lr, r3
 ; CHECK-NEXT:  .LBB7_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    add.w r12, r12, #4
 ; CHECK-NEXT:    vmov q3, q0
 ; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
 ; CHECK-NEXT:    vldrw.u32 q2, [r1], #16
@@ -484,23 +405,14 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %n, 3
   %n.vec = and i32 %n.rnd.up, -4
-  %trip.count.minus.1 = add i32 %n, -1
-  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
-  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
   %broadcast.splatinsert13 = insertelement <4 x float> undef, float %a, i32 0
   %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
-  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
-  %induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
   %0 = getelementptr inbounds float, float* %x, i32 %index
-
-  ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
   %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
-
   %2 = bitcast float* %0 to <4 x float>*
   %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
   %3 = getelementptr inbounds float, float* %y, i32 %index
@@ -529,12 +441,10 @@ define arm_aapcs_vfpcc void @fms1(float* nocapture readonly %x, float* nocapture
 ; CHECK-NEXT:    poplt {r4, pc}
 ; CHECK-NEXT:  .LBB8_1: @ %vector.ph
 ; CHECK-NEXT:    vmov r12, s0
-; CHECK-NEXT:    movs r4, #0
 ; CHECK-NEXT:    eor r12, r12, #-2147483648
 ; CHECK-NEXT:    dlstp.32 lr, r3
 ; CHECK-NEXT:  .LBB8_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    adds r4, #4
 ; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
 ; CHECK-NEXT:    vldrw.u32 q1, [r1], #16
 ; CHECK-NEXT:    vfma.f32 q1, q0, r12
@@ -550,23 +460,14 @@ vector.ph:                                        ; preds = %entry
   %fneg = fneg fast float %a
   %n.rnd.up = add i32 %n, 3
   %n.vec = and i32 %n.rnd.up, -4
-  %trip.count.minus.1 = add i32 %n, -1
-  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
-  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
   %broadcast.splatinsert13 = insertelement <4 x float> undef, float %fneg, i32 0
   %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
-  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
-  %induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
   %0 = getelementptr inbounds float, float* %x, i32 %index
-
-  ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
   %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
-
   %2 = bitcast float* %0 to <4 x float>*
   %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
   %3 = getelementptr inbounds float, float* %y, i32 %index
@@ -595,11 +496,9 @@ define arm_aapcs_vfpcc void @fms2(float* nocapture readonly %x, float* nocapture
 ; CHECK-NEXT:  .LBB9_1: @ %vector.ph
 ; CHECK-NEXT:    vmov r4, s0
 ; CHECK-NEXT:    vdup.32 q0, r4
-; CHECK-NEXT:    mov.w r12, #0
 ; CHECK-NEXT:    dlstp.32 lr, r3
 ; CHECK-NEXT:  .LBB9_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    add.w r12, r12, #4
 ; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
 ; CHECK-NEXT:    vldrw.u32 q2, [r1], #16
 ; CHECK-NEXT:    vfms.f32 q2, q1, q0
@@ -614,23 +513,14 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %n, 3
   %n.vec = and i32 %n.rnd.up, -4
-  %trip.count.minus.1 = add i32 %n, -1
-  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
-  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
   %broadcast.splatinsert13 = insertelement <4 x float> undef, float %a, i32 0
   %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
-  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
-  %induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
   %0 = getelementptr inbounds float, float* %x, i32 %index
-
-  ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
   %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
-
   %2 = bitcast float* %0 to <4 x float>*
   %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
   %3 = getelementptr inbounds float, float* %y, i32 %index
@@ -659,16 +549,14 @@ define arm_aapcs_vfpcc void @fms3(float* nocapture readonly %x, float* nocapture
 ; CHECK-NEXT:    poplt {r4, pc}
 ; CHECK-NEXT:  .LBB10_1: @ %vector.ph
 ; CHECK-NEXT:    vmov r12, s0
-; CHECK-NEXT:    movs r4, #0
 ; CHECK-NEXT:    dlstp.32 lr, r3
 ; CHECK-NEXT:  .LBB10_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
-; CHECK-NEXT:    vldrw.u32 q1, [r1], #16
-; CHECK-NEXT:    adds r4, #4
-; CHECK-NEXT:    vneg.f32 q1, q1
-; CHECK-NEXT:    vfma.f32 q1, q0, r12
-; CHECK-NEXT:    vstrw.32 q1, [r2], #16
+; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
+; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
+; CHECK-NEXT:    vneg.f32 q0, q0
+; CHECK-NEXT:    vfma.f32 q0, q1, r12
+; CHECK-NEXT:    vstrw.32 q0, [r2], #16
 ; CHECK-NEXT:    letp lr, .LBB10_2
 ; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
 ; CHECK-NEXT:    pop {r4, pc}
@@ -679,23 +567,14 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %n, 3
   %n.vec = and i32 %n.rnd.up, -4
-  %trip.count.minus.1 = add i32 %n, -1
-  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
-  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
   %broadcast.splatinsert13 = insertelement <4 x float> undef, float %a, i32 0
   %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
-  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
-  %induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
   %0 = getelementptr inbounds float, float* %x, i32 %index
-
-  ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
   %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
-
   %2 = bitcast float* %0 to <4 x float>*
   %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
   %3 = getelementptr inbounds float, float* %y, i32 %index
@@ -724,16 +603,14 @@ define arm_aapcs_vfpcc void @fms4(float* nocapture readonly %x, float* nocapture
 ; CHECK-NEXT:    poplt {r4, pc}
 ; CHECK-NEXT:  .LBB11_1: @ %vector.ph
 ; CHECK-NEXT:    vmov r12, s0
-; CHECK-NEXT:    movs r4, #0
 ; CHECK-NEXT:    dlstp.32 lr, r3
 ; CHECK-NEXT:  .LBB11_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
-; CHECK-NEXT:    vldrw.u32 q1, [r1], #16
-; CHECK-NEXT:    adds r4, #4
-; CHECK-NEXT:    vneg.f32 q1, q1
-; CHECK-NEXT:    vfma.f32 q1, q0, r12
-; CHECK-NEXT:    vstrw.32 q1, [r2], #16
+; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
+; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
+; CHECK-NEXT:    vneg.f32 q0, q0
+; CHECK-NEXT:    vfma.f32 q0, q1, r12
+; CHECK-NEXT:    vstrw.32 q0, [r2], #16
 ; CHECK-NEXT:    letp lr, .LBB11_2
 ; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
 ; CHECK-NEXT:    pop {r4, pc}
@@ -744,23 +621,14 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %n, 3
   %n.vec = and i32 %n.rnd.up, -4
-  %trip.count.minus.1 = add i32 %n, -1
-  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
-  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
   %broadcast.splatinsert12 = insertelement <4 x float> undef, float %a, i32 0
   %broadcast.splat13 = shufflevector <4 x float> %broadcast.splatinsert12, <4 x float> undef, <4 x i32> zeroinitializer
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
-  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
-  %induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
   %0 = getelementptr inbounds float, float* %x, i32 %index
-
-  ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
   %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
-
   %2 = bitcast float* %0 to <4 x float>*
   %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
   %3 = fmul fast <4 x float> %wide.masked.load, %broadcast.splat13
diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll b/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll
index c4f68959ecf4..a2d1f2a9db68 100644
--- a/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll
@@ -244,7 +244,6 @@ define arm_aapcs_vfpcc void @gather_pre_inc(i32* noalias nocapture readonly %dat
 ; CHECK-NEXT:    .long 4294967272 @ 0xffffffe8
 ; CHECK-NEXT:    .long 0 @ 0x0
 vector.ph:                                        ; preds = %for.body.preheader
-  %ind.end = shl i32 %n.vec, 1
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
@@ -288,7 +287,6 @@ define arm_aapcs_vfpcc void @gather_post_inc(i32* noalias nocapture readonly %da
 ; CHECK-NEXT:    .long 4294967248 @ 0xffffffd0
 ; CHECK-NEXT:    .long 4294967272 @ 0xffffffe8
 vector.ph41:                                      ; preds = %for.body6.preheader
-  %ind.end47 = shl i32 %n.vec43, 1
   br label %vector.body39
 
 vector.body39:                                    ; preds = %vector.body39, %vector.ph41
diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-optimisation-deep.ll b/llvm/test/CodeGen/Thumb2/mve-gather-optimisation-deep.ll
index 4e971542bf75..1617ce36d4ca 100644
--- a/llvm/test/CodeGen/Thumb2/mve-gather-optimisation-deep.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-gather-optimisation-deep.ll
@@ -1,18 +1,15 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-
-
-; RUN: opt --arm-mve-gather-scatter-lowering -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp %s -S -o 2>/dev/null - | FileCheck %s
+; RUN: opt --arm-mve-gather-scatter-lowering -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp %s -S -o - | FileCheck %s
 
 define arm_aapcs_vfpcc void @push_out_add_sub_block(i32* noalias nocapture readonly %data, i32* noalias nocapture %dst, i32 %n.vec) {
 ; CHECK-LABEL: @push_out_add_sub_block(
 ; CHECK-NEXT:  vector.ph:
-; CHECK-NEXT:    [[IND_END:%.*]] = shl i32 [[N_VEC:%.*]], 1
 ; CHECK-NEXT:    [[PUSHEDOUTADD:%.*]] = add <4 x i32> <i32 0, i32 2, i32 4, i32 6>, <i32 6, i32 6, i32 6, i32 6>
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY_END:%.*]] ]
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ [[PUSHEDOUTADD]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY_END]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = icmp eq i32 [[INDEX]], 50
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp eq i32 [[INDEX]], 48
 ; CHECK-NEXT:    br i1 [[TMP0]], label [[LOWER_BLOCK:%.*]], label [[END:%.*]]
 ; CHECK:       lower.block:
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0i32.v4i32(i32* [[DATA:%.*]], <4 x i32> [[VEC_IND]], i32 32, i32 2, i32 1)
@@ -23,20 +20,19 @@ define arm_aapcs_vfpcc void @push_out_add_sub_block(i32* noalias nocapture reado
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 8, i32 8, i32 8, i32 8>
 ; CHECK-NEXT:    br label [[VECTOR_BODY_END]]
 ; CHECK:       vector.body.end:
-; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC:%.*]]
 ; CHECK-NEXT:    br i1 [[TMP4]], label [[END]], label [[VECTOR_BODY]]
 ; CHECK:       end:
 ; CHECK-NEXT:    ret void
 ;
 
 vector.ph:
-  %ind.end = shl i32 %n.vec, 1
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body.end ]
   %vec.ind = phi <4 x i32> [ <i32 0, i32 2, i32 4, i32 6>, %vector.ph ], [ %vec.ind.next, %vector.body.end ]
-  %0 = icmp eq i32 %index, 50
+  %0 = icmp eq i32 %index, 48
   br i1 %0, label %lower.block, label %end
 
 lower.block:                             ; preds = %vector.body
@@ -61,7 +57,6 @@ end:
 define arm_aapcs_vfpcc void @push_out_mul_sub_block(i32* noalias nocapture readonly %data, i32* noalias nocapture %dst, i32 %n.vec) {
 ; CHECK-LABEL: @push_out_mul_sub_block(
 ; CHECK-NEXT:  vector.ph:
-; CHECK-NEXT:    [[IND_END:%.*]] = shl i32 [[N_VEC:%.*]], 1
 ; CHECK-NEXT:    [[PUSHEDOUTMUL:%.*]] = mul <4 x i32> <i32 0, i32 2, i32 4, i32 6>, <i32 3, i32 3, i32 3, i32 3>
 ; CHECK-NEXT:    [[PRODUCT:%.*]] = mul <4 x i32> <i32 8, i32 8, i32 8, i32 8>, <i32 3, i32 3, i32 3, i32 3>
 ; CHECK-NEXT:    [[PUSHEDOUTADD:%.*]] = add <4 x i32> [[PUSHEDOUTMUL]], <i32 6, i32 6, i32 6, i32 6>
@@ -69,7 +64,7 @@ define arm_aapcs_vfpcc void @push_out_mul_sub_block(i32* noalias nocapture reado
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY_END:%.*]] ]
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ [[PUSHEDOUTADD]], [[VECTOR_PH]] ], [ [[INCREMENTPUSHEDOUTMUL:%.*]], [[VECTOR_BODY_END]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = icmp eq i32 [[INDEX]], 50
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp eq i32 [[INDEX]], 48
 ; CHECK-NEXT:    br i1 [[TMP0]], label [[LOWER_BLOCK:%.*]], label [[END:%.*]]
 ; CHECK:       lower.block:
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0i32.v4i32(i32* [[DATA:%.*]], <4 x i32> [[VEC_IND]], i32 32, i32 2, i32 1)
@@ -80,20 +75,19 @@ define arm_aapcs_vfpcc void @push_out_mul_sub_block(i32* noalias nocapture reado
 ; CHECK-NEXT:    br label [[VECTOR_BODY_END]]
 ; CHECK:       vector.body.end:
 ; CHECK-NEXT:    [[INCREMENTPUSHEDOUTMUL]] = add <4 x i32> [[VEC_IND]], [[PRODUCT]]
-; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC:%.*]]
 ; CHECK-NEXT:    br i1 [[TMP4]], label [[END]], label [[VECTOR_BODY]]
 ; CHECK:       end:
 ; CHECK-NEXT:    ret void
 ;
 
 vector.ph:
-  %ind.end = shl i32 %n.vec, 1
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body.end ]
   %vec.ind = phi <4 x i32> [ <i32 0, i32 2, i32 4, i32 6>, %vector.ph ], [ %vec.ind.next, %vector.body.end ]
-  %0 = icmp eq i32 %index, 50
+  %0 = icmp eq i32 %index, 48
   br i1 %0, label %lower.block, label %end
 
 lower.block:                             ; preds = %vector.body
@@ -120,7 +114,6 @@ end:
 define arm_aapcs_vfpcc void @push_out_mul_sub_loop(i32* noalias nocapture readonly %data, i32* noalias nocapture %dst, i32 %n.vec) {
 ; CHECK-LABEL: @push_out_mul_sub_loop(
 ; CHECK-NEXT:  vector.ph:
-; CHECK-NEXT:    [[IND_END:%.*]] = shl i32 [[N_VEC:%.*]], 2
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY_END:%.*]] ]
@@ -138,19 +131,18 @@ define arm_aapcs_vfpcc void @push_out_mul_sub_loop(i32* noalias nocapture readon
 ; CHECK-NEXT:    br label [[VECTOR_2_BODY_END:%.*]]
 ; CHECK:       vector.2.body.end:
 ; CHECK-NEXT:    [[INDEX_2_NEXT:%.*]] = add i32 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[INDEX_2_NEXT]], 15
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[INDEX_2_NEXT]], 16
 ; CHECK-NEXT:    br i1 [[TMP5]], label [[VECTOR_BODY_END]], label [[VECTOR_2_BODY]]
 ; CHECK:       vector.body.end:
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 8, i32 8, i32 8, i32 8>
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC:%.*]]
 ; CHECK-NEXT:    br i1 [[TMP6]], label [[END:%.*]], label [[VECTOR_BODY]]
 ; CHECK:       end:
 ; CHECK-NEXT:    ret void
 ;
 
 vector.ph:
-  %ind.end = shl i32 %n.vec, 2
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
@@ -162,7 +154,6 @@ vector.2.ph:
   br label %vector.2.body
 
 vector.2.body:                             ; preds = %vector.body
-  %index.2 = phi i32 [ 0, %vector.2.ph ], [ %index.2.next, %vector.2.body.end ]
   %0 = mul <4 x i32> %vec.ind, <i32 3, i32 3, i32 3, i32 3>
   %1 = add <4 x i32> %0, <i32 6, i32 6, i32 6, i32 6>
   %2 = getelementptr inbounds i32, i32* %data, <4 x i32> %1
@@ -174,7 +165,7 @@ vector.2.body:                             ; preds = %vector.body
 
 vector.2.body.end:                             ; preds = %lower.block
   %index.2.next = add i32 %index, 4
-  %5 = icmp eq i32 %index.2.next, 15
+  %5 = icmp eq i32 %index.2.next, 16
   br i1 %5, label %vector.body.end, label %vector.2.body
 
 vector.body.end:                             ; preds = %lower.block
diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll
index cfed9ccaebae..15c5291ca36d 100644
--- a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll
@@ -39,7 +39,6 @@ define arm_aapcs_vfpcc void @push_out_mul_gather(i32* noalias nocapture readonly
 ; CHECK-NEXT:    .long 4294967272 @ 0xffffffe8
 
 vector.ph:                                        ; preds = %for.body.preheader
-  %ind.end = shl i32 %n.vec, 1
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
@@ -83,7 +82,6 @@ define arm_aapcs_vfpcc void @push_out_add_gather(i32* noalias nocapture readonly
 ; CHECK-NEXT:    .long 16 @ 0x10
 
 vector.ph:                                        ; preds = %for.body.preheader
-  %ind.end = shl i32 %n.vec, 1
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
@@ -127,7 +125,6 @@ define arm_aapcs_vfpcc void @push_out_mul_add_gather(i32* noalias nocapture read
 ; CHECK-NEXT:    .long 0 @ 0x0
 
 vector.ph:                                        ; preds = %for.body.preheader
-  %ind.end = shl i32 %n.vec, 1
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
@@ -173,7 +170,6 @@ define arm_aapcs_vfpcc void @push_out_mul_scatter(i32* noalias nocapture readonl
                                                   <4 x i32> %to.store) {
 
 vector.ph:                                        ; preds = %for.body.preheader
-  %ind.end = shl i32 %n.vec, 1
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
@@ -215,7 +211,6 @@ define arm_aapcs_vfpcc void @push_out_add_scatter(i32* noalias nocapture readonl
                                                   <4 x i32> %to.store) {
 
 vector.ph:                                        ; preds = %for.body.preheader
-  %ind.end = shl i32 %n.vec, 1
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
@@ -259,7 +254,6 @@ define arm_aapcs_vfpcc void @push_out_mul_gather_scatter(i32* noalias nocapture
                                                          i32* noalias nocapture %dst, i32 %n.vec) {
 
 vector.ph:                                        ; preds = %for.body.preheader
-  %ind.end = shl i32 %n.vec, 1
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
@@ -301,7 +295,6 @@ define arm_aapcs_vfpcc void @push_out_add_sub_block(i32* noalias nocapture reado
 ; CHECK-NEXT:    .long 16 @ 0x10
 
 vector.ph:                                        ; preds = %for.body.preheader
-  %ind.end = shl i32 %n.vec, 1
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
@@ -358,7 +351,6 @@ define arm_aapcs_vfpcc void @non_gatscat_use1(i32* noalias nocapture readonly %d
 ; CHECK-NEXT:    .long 6 @ 0x6
 
 vector.ph:                                        ; preds = %for.body.preheader
-  %ind.end = shl i32 %n.vec, 1
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
@@ -411,7 +403,6 @@ define arm_aapcs_vfpcc void @non_gatscat_use2(i32* noalias nocapture readonly %d
 ; CHECK-NEXT:    .long 6 @ 0x6
 
 vector.ph:                                        ; preds = %for.body.preheader
-  %ind.end = shl i32 %n.vec, 1
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
@@ -528,7 +519,6 @@ for.cond8.preheader.us.us.preheader.preheader:    ; preds = %entry
   %2 = add nuw i32 %1, 1
   %min.iters.check = icmp ult i32 %0, 6
   %n.vec = and i32 %2, -4
-  %ind.end = shl i32 %n.vec, 1
   %broadcast.splatinsert86 = insertelement <4 x i32> undef, i32 %m, i32 0
   %broadcast.splat87 = shufflevector <4 x i32> %broadcast.splatinsert86, <4 x i32> undef, <4 x i32> zeroinitializer
   %cmp.n = icmp eq i32 %2, %n.vec
@@ -978,7 +968,6 @@ for.body10.i:                                     ; preds = %for.cond.cleanup20.
   br i1 0, label %for.cond.cleanup20.i, label %for.cond22.preheader.lr.ph.i
 
 for.cond22.preheader.lr.ph.i:                     ; preds = %for.body10.i
-  %ind.end = add nsw i32 0, %n.vec
   %.splatinsert = insertelement <4 x i32> undef, i32 0, i32 0
   %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
   %induction = add <4 x i32> %.splat, <i32 0, i32 1, i32 2, i32 3>
diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll
index 728328ac9cba..8ae094fb66e6 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll
@@ -1711,7 +1711,6 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %n, 3
   %n.vec = and i32 %n.rnd.up, -4
-  %trip.count.minus.1 = add i32 %n, -1
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
@@ -1762,7 +1761,6 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %n, 3
   %n.vec = and i32 %n.rnd.up, -4
-  %trip.count.minus.1 = add i32 %n, -1
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
@@ -1816,7 +1814,6 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %n, 7
   %n.vec = and i32 %n.rnd.up, -8
-  %trip.count.minus.1 = add i32 %n, -1
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
@@ -1868,7 +1865,6 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %n, 7
   %n.vec = and i32 %n.rnd.up, -8
-  %trip.count.minus.1 = add i32 %n, -1
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
@@ -1924,7 +1920,6 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %n, 15
   %n.vec = and i32 %n.rnd.up, -16
-  %trip.count.minus.1 = add i32 %n, -1
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
@@ -1976,7 +1971,6 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %n, 15
   %n.vec = and i32 %n.rnd.up, -16
-  %trip.count.minus.1 = add i32 %n, -1
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
@@ -2032,7 +2026,6 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %n, 7
   %n.vec = and i32 %n.rnd.up, -8
-  %trip.count.minus.1 = add i32 %n, -1
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
@@ -2083,7 +2076,6 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %n, 7
   %n.vec = and i32 %n.rnd.up, -8
-  %trip.count.minus.1 = add i32 %n, -1
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
@@ -2137,7 +2129,6 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %n, 15
   %n.vec = and i32 %n.rnd.up, -16
-  %trip.count.minus.1 = add i32 %n, -1
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
@@ -2189,7 +2180,6 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %n, 15
   %n.vec = and i32 %n.rnd.up, -16
-  %trip.count.minus.1 = add i32 %n, -1
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
@@ -2245,7 +2235,6 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %n, 15
   %n.vec = and i32 %n.rnd.up, -16
-  %trip.count.minus.1 = add i32 %n, -1
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
@@ -2296,7 +2285,6 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %n, 15
   %n.vec = and i32 %n.rnd.up, -16
-  %trip.count.minus.1 = add i32 %n, -1
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
@@ -2352,7 +2340,6 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %n, 3
   %n.vec = and i32 %n.rnd.up, -4
-  %trip.count.minus.1 = add i32 %n, -1
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
@@ -2406,7 +2393,6 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %n, 3
   %n.vec = and i32 %n.rnd.up, -4
-  %trip.count.minus.1 = add i32 %n, -1
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
@@ -2465,7 +2451,6 @@ entry:
 vector.ph:                                        ; preds = %entry
   %n.rnd.up = add i32 %n, 7
   %n.vec = and i32 %n.rnd.up, -8
-  %trip.count.minus.1 = add i32 %n, -1
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph

From f83afe6ae9613512bd8f30090d8c287292b55dcd Mon Sep 17 00:00:00 2001
From: David Green <david.green@arm.com>
Date: Thu, 10 Jun 2021 20:18:12 +0100
Subject: [PATCH 315/318] [ARM] Ensure instructions are simplified prior to
 GatherScatter lowering.

Surprisingly, not all instructions are always simplified after unrolling
and before MVE gather/scatter lowering. Notably dead gather operations
can be left around which cause the gather/scatter lowering pass to crash
if there are multiple gathers, some of which are dead.

This patch ensures they are simplified before we modify anything, which
can change some of the existing tests, including making them no-longer
test what they originally tested. This uses a combination of disabling
the gather/scatter lowering pass and adjusting the test to keep them as
before.

Differential Revision: https://reviews.llvm.org/D103150
---
 .../Target/ARM/MVEGatherScatterLowering.cpp   |  2 +
 .../Thumb2/LowOverheadLoops/remat-vctp.ll     |  2 +-
 llvm/test/CodeGen/Thumb2/lsll0.ll             |  2 +-
 .../Thumb2/mve-gather-scatter-optimisation.ll | 74 +++++++++++--------
 llvm/test/CodeGen/Thumb2/mve-gather-unused.ll | 38 ++++++++++
 llvm/test/CodeGen/Thumb2/mve-phireg.ll        |  2 +-
 llvm/test/CodeGen/Thumb2/mve-pred-xor.ll      |  4 +-
 llvm/test/CodeGen/Thumb2/mve-selectcc.ll      |  2 +-
 llvm/test/CodeGen/Thumb2/mve-vqdmulh.ll       | 22 ++----
 9 files changed, 96 insertions(+), 52 deletions(-)
 create mode 100644 llvm/test/CodeGen/Thumb2/mve-gather-unused.ll

diff --git a/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp b/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp
index 039f6f2053d8..195622cfd586 100644
--- a/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp
+++ b/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp
@@ -1166,6 +1166,8 @@ bool MVEGatherScatterLowering::runOnFunction(Function &F) {
   bool Changed = false;
 
   for (BasicBlock &BB : F) {
+    SimplifyInstructionsInBlock(&BB);
+
     for (Instruction &I : BB) {
       IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I);
       if (II && II->getIntrinsicID() == Intrinsic::masked_gather &&
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/remat-vctp.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/remat-vctp.ll
index f334a5950acb..e809eea1e0c2 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/remat-vctp.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/remat-vctp.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp %s -o - | FileCheck %s
+; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp -enable-arm-maskedgatscat=false %s -o - | FileCheck %s
 
 define void @remat_vctp(i32* %arg, i32* %arg1, i32* %arg2, i32* %arg3, i32* %arg4, i16 zeroext %arg5) {
 ; CHECK-LABEL: remat_vctp:
diff --git a/llvm/test/CodeGen/Thumb2/lsll0.ll b/llvm/test/CodeGen/Thumb2/lsll0.ll
index 30d2edecbd28..4e9735be077a 100644
--- a/llvm/test/CodeGen/Thumb2/lsll0.ll
+++ b/llvm/test/CodeGen/Thumb2/lsll0.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s
+; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve -enable-arm-maskedgatscat=false -verify-machineinstrs %s -o - | FileCheck %s
 
 define void @_Z4loopPxS_iS_i(i64* %d) {
 ; CHECK-LABEL: _Z4loopPxS_iS_i:
diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll
index 15c5291ca36d..672fa16fc4c1 100644
--- a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll
@@ -321,26 +321,29 @@ end:
   ret void;
 }
 
-define arm_aapcs_vfpcc void @non_gatscat_use1(i32* noalias nocapture readonly %data, i32* noalias nocapture %dst, i32 %n.vec) {
+define arm_aapcs_vfpcc void @non_gatscat_use1(i32* noalias nocapture readonly %data, i32* noalias nocapture %dst, i32 %n.vec, <4 x i32>* %x) {
 ; CHECK-LABEL: non_gatscat_use1:
 ; CHECK:       @ %bb.0: @ %vector.ph
-; CHECK-NEXT:    .vsave {d8, d9}
-; CHECK-NEXT:    vpush {d8, d9}
-; CHECK-NEXT:    adr r3, .LCPI7_0
-; CHECK-NEXT:    vmov.i32 q0, #0x8
-; CHECK-NEXT:    vldrw.u32 q2, [r3]
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    adr.w r12, .LCPI7_0
+; CHECK-NEXT:    vmov.i32 q0, #0x9
+; CHECK-NEXT:    vldrw.u32 q3, [r12]
 ; CHECK-NEXT:    vmov.i32 q1, #0xc
+; CHECK-NEXT:    vmov.i32 q2, #0x8
 ; CHECK-NEXT:  .LBB7_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vadd.i32 q3, q2, q0
-; CHECK-NEXT:    vmlas.u32 q2, q1, r0
-; CHECK-NEXT:    vldrw.u32 q4, [q2, #24]
+; CHECK-NEXT:    vadd.i32 q4, q3, q2
+; CHECK-NEXT:    vmul.i32 q5, q3, q0
+; CHECK-NEXT:    vmlas.u32 q3, q1, r0
 ; CHECK-NEXT:    subs r2, #4
-; CHECK-NEXT:    vmov q2, q3
-; CHECK-NEXT:    vstrb.8 q4, [r1], #16
+; CHECK-NEXT:    vldrw.u32 q6, [q3, #24]
+; CHECK-NEXT:    vmov q3, q4
+; CHECK-NEXT:    vstrw.32 q5, [r3]
+; CHECK-NEXT:    vstrb.8 q6, [r1], #16
 ; CHECK-NEXT:    bne .LBB7_1
 ; CHECK-NEXT:  @ %bb.2: @ %end
-; CHECK-NEXT:    vpop {d8, d9}
+; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
 ; CHECK-NEXT:    bx lr
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  @ %bb.3:
@@ -364,6 +367,7 @@ vector.body:                                      ; preds = %vector.body, %vecto
   %4 = bitcast i32* %3 to <4 x i32>*
   store <4 x i32> %wide.masked.gather, <4 x i32>* %4, align 4
   %non_gatscat_use = mul <4 x i32> %0, <i32 3, i32 3, i32 3, i32 3>
+  store <4 x i32> %non_gatscat_use, <4 x i32>* %x, align 4
   %index.next = add i32 %index, 4
   %vec.ind.next = add <4 x i32> %vec.ind, <i32 8, i32 8, i32 8, i32 8>
   %5 = icmp eq i32 %index.next, %n.vec
@@ -373,26 +377,31 @@ end:
   ret void;
 }
 
-define arm_aapcs_vfpcc void @non_gatscat_use2(i32* noalias nocapture readonly %data, i32* noalias nocapture %dst, i32 %n.vec) {
+define arm_aapcs_vfpcc void @non_gatscat_use2(i32* noalias nocapture readonly %data, i32* noalias nocapture %dst, i32 %n.vec, <4 x i32>* %x) {
 ; CHECK-LABEL: non_gatscat_use2:
 ; CHECK:       @ %bb.0: @ %vector.ph
-; CHECK-NEXT:    .vsave {d8, d9}
-; CHECK-NEXT:    vpush {d8, d9}
-; CHECK-NEXT:    adr r3, .LCPI8_0
-; CHECK-NEXT:    vmov.i32 q0, #0x8
-; CHECK-NEXT:    vldrw.u32 q2, [r3]
-; CHECK-NEXT:    vmov.i32 q1, #0xc
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    adr.w r12, .LCPI8_0
+; CHECK-NEXT:    vmov.i32 q0, #0x12
+; CHECK-NEXT:    vldrw.u32 q4, [r12]
+; CHECK-NEXT:    vmov.i32 q1, #0x9
+; CHECK-NEXT:    vmov.i32 q2, #0x8
+; CHECK-NEXT:    vmov.i32 q3, #0xc
 ; CHECK-NEXT:  .LBB8_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vadd.i32 q3, q2, q0
-; CHECK-NEXT:    vmlas.u32 q2, q1, r0
-; CHECK-NEXT:    vldrw.u32 q4, [q2, #24]
+; CHECK-NEXT:    vadd.i32 q5, q4, q2
+; CHECK-NEXT:    vmul.i32 q6, q4, q1
+; CHECK-NEXT:    vmlas.u32 q4, q3, r0
 ; CHECK-NEXT:    subs r2, #4
-; CHECK-NEXT:    vmov q2, q3
-; CHECK-NEXT:    vstrb.8 q4, [r1], #16
+; CHECK-NEXT:    vldrw.u32 q7, [q4, #24]
+; CHECK-NEXT:    vadd.i32 q4, q6, q0
+; CHECK-NEXT:    vstrw.32 q4, [r3]
+; CHECK-NEXT:    vmov q4, q5
+; CHECK-NEXT:    vstrb.8 q7, [r1], #16
 ; CHECK-NEXT:    bne .LBB8_1
 ; CHECK-NEXT:  @ %bb.2: @ %end
-; CHECK-NEXT:    vpop {d8, d9}
+; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    bx lr
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  @ %bb.3:
@@ -416,6 +425,7 @@ vector.body:                                      ; preds = %vector.body, %vecto
   %4 = bitcast i32* %3 to <4 x i32>*
   store <4 x i32> %wide.masked.gather, <4 x i32>* %4, align 4
   %non_gatscat_use = mul <4 x i32> %1, <i32 3, i32 3, i32 3, i32 3>
+  store <4 x i32> %non_gatscat_use, <4 x i32>* %x, align 4
   %index.next = add i32 %index, 4
   %vec.ind.next = add <4 x i32> %vec.ind, <i32 8, i32 8, i32 8, i32 8>
   %5 = icmp eq i32 %index.next, %n.vec
@@ -844,12 +854,12 @@ define hidden arm_aapcs_vfpcc i32 @arm_depthwise_conv_s8(i8* nocapture readonly
 ; CHECK-NEXT:    add.w r8, r7, #10
 ; CHECK-NEXT:    adr r7, .LCPI11_0
 ; CHECK-NEXT:    ldr r1, [sp, #96]
-; CHECK-NEXT:    vdup.32 q1, r2
-; CHECK-NEXT:    vldrw.u32 q0, [r7]
+; CHECK-NEXT:    vdup.32 q0, r2
+; CHECK-NEXT:    vldrw.u32 q1, [r7]
 ; CHECK-NEXT:    mov.w r10, #0
 ; CHECK-NEXT:    mov.w r9, #6
 ; CHECK-NEXT:    movs r6, #11
-; CHECK-NEXT:    vshl.i32 q1, q1, #2
+; CHECK-NEXT:    vshl.i32 q0, q0, #2
 ; CHECK-NEXT:    movs r5, #0
 ; CHECK-NEXT:  .LBB11_1: @ %for.body10.i
 ; CHECK-NEXT:    @ =>This Loop Header: Depth=1
@@ -884,10 +894,10 @@ define hidden arm_aapcs_vfpcc i32 @arm_depthwise_conv_s8(i8* nocapture readonly
 ; CHECK-NEXT:    mul r4, r11, r6
 ; CHECK-NEXT:    vdup.32 q3, r5
 ; CHECK-NEXT:    vdup.32 q2, r7
-; CHECK-NEXT:    vadd.i32 q4, q0, r4
+; CHECK-NEXT:    vadd.i32 q4, q1, r4
 ; CHECK-NEXT:    vmla.u32 q3, q4, r2
 ; CHECK-NEXT:    adds r4, #113
-; CHECK-NEXT:    vadd.i32 q4, q0, r4
+; CHECK-NEXT:    vadd.i32 q4, q1, r4
 ; CHECK-NEXT:    mov r4, r8
 ; CHECK-NEXT:    vmla.u32 q2, q4, r2
 ; CHECK-NEXT:  .LBB11_5: @ %vector.body
@@ -897,8 +907,8 @@ define hidden arm_aapcs_vfpcc i32 @arm_depthwise_conv_s8(i8* nocapture readonly
 ; CHECK-NEXT:    @ Parent Loop BB11_4 Depth=4
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=5
 ; CHECK-NEXT:    vldrb.s32 q6, [r0, q2]
-; CHECK-NEXT:    vadd.i32 q5, q2, q1
-; CHECK-NEXT:    vadd.i32 q4, q3, q1
+; CHECK-NEXT:    vadd.i32 q5, q2, q0
+; CHECK-NEXT:    vadd.i32 q4, q3, q0
 ; CHECK-NEXT:    subs r4, #4
 ; CHECK-NEXT:    vadd.i32 q2, q6, r2
 ; CHECK-NEXT:    vldrb.s32 q6, [r1, q3]
diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-unused.ll b/llvm/test/CodeGen/Thumb2/mve-gather-unused.ll
new file mode 100644
index 000000000000..b8d732b709cb
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/mve-gather-unused.ll
@@ -0,0 +1,38 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -enable-arm-maskedldst %s -o - | FileCheck %s
+
+; This files has some unused gathers, making sure that they do not cause
+; problems as the function gets simplified.
+
+define arm_aapcs_vfpcc void @unused1(<4 x i32*> %offs) {
+; CHECK-LABEL: unused1:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    bx lr
+entry:
+  %gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %offs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @unused2(<4 x i32*> %offs) {
+; CHECK-LABEL: unused2:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    bx lr
+entry:
+  %gather1 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %offs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
+  %gather2 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %offs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @unused2_used(<4 x i32*> %offs) {
+; CHECK-LABEL: unused2_used:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    bx lr
+entry:
+  %gather1 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %offs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
+  %gather2 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %offs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
+  %unused = add <4 x i32> %gather1, %gather2
+  ret void
+}
+
+
+declare <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*>, i32, <4 x i1>, <4 x i32>)
diff --git a/llvm/test/CodeGen/Thumb2/mve-phireg.ll b/llvm/test/CodeGen/Thumb2/mve-phireg.ll
index 252f9c6439ec..d910d1929315 100644
--- a/llvm/test/CodeGen/Thumb2/mve-phireg.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-phireg.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -O3 -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs %s -o - | FileCheck %s
+; RUN: llc -O3 -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -enable-arm-maskedgatscat=false -verify-machineinstrs %s -o - | FileCheck %s
 
 ; verify-machineinstrs previously caught the incorrect use of QPR in the stack reloads.
 
diff --git a/llvm/test/CodeGen/Thumb2/mve-pred-xor.ll b/llvm/test/CodeGen/Thumb2/mve-pred-xor.ll
index 6c13200a2d55..5d947e86d183 100644
--- a/llvm/test/CodeGen/Thumb2/mve-pred-xor.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-pred-xor.ll
@@ -170,8 +170,8 @@ entry:
 define arm_aapcs_vfpcc <4 x i32> @cmpugez_v4i1(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK-LABEL: cmpugez_v4i1:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vcmp.i32 ne, q0, zr
-; CHECK-NEXT:    vpsel q0, q0, q1
+; CHECK-NEXT:    vcmp.i32 eq, q0, zr
+; CHECK-NEXT:    vpsel q0, q1, q0
 ; CHECK-NEXT:    bx lr
 entry:
   %c1 = icmp eq <4 x i32> %a, zeroinitializer
diff --git a/llvm/test/CodeGen/Thumb2/mve-selectcc.ll b/llvm/test/CodeGen/Thumb2/mve-selectcc.ll
index b4f5d8d8fa3f..9e2dc568a086 100644
--- a/llvm/test/CodeGen/Thumb2/mve-selectcc.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-selectcc.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK
+; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -enable-arm-maskedgatscat=false -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK
 
 define arm_aapcs_vfpcc <4 x i32> @test_v4i32(i32 %x, <4 x i32> %s0, <4 x i32> %s1) {
 ; CHECK-LABEL: test_v4i32:
diff --git a/llvm/test/CodeGen/Thumb2/mve-vqdmulh.ll b/llvm/test/CodeGen/Thumb2/mve-vqdmulh.ll
index 6722aa706769..fee5fd9af694 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vqdmulh.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vqdmulh.ll
@@ -70,8 +70,6 @@ entry:
 define arm_aapcs_vfpcc <8 x i16> @vqdmulh_i16_c(<8 x i16> %s0, <8 x i16> %s1) {
 ; CHECK-LABEL: vqdmulh_i16_c:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9}
-; CHECK-NEXT:    vpush {d8, d9}
 ; CHECK-NEXT:    vmov q2, q0
 ; CHECK-NEXT:    vmov.u16 r0, q0[2]
 ; CHECK-NEXT:    vmov.u16 r1, q0[0]
@@ -87,35 +85,32 @@ define arm_aapcs_vfpcc <8 x i16> @vqdmulh_i16_c(<8 x i16> %s0, <8 x i16> %s1) {
 ; CHECK-NEXT:    vmov q3[3], q3[1], r1, r0
 ; CHECK-NEXT:    vmov.u16 r1, q2[4]
 ; CHECK-NEXT:    vmullb.s16 q0, q3, q0
-; CHECK-NEXT:    vmov.i32 q3, #0x7fff
 ; CHECK-NEXT:    vshl.i32 q0, q0, #10
 ; CHECK-NEXT:    vshr.s32 q0, q0, #10
-; CHECK-NEXT:    vshr.s32 q0, q0, #15
-; CHECK-NEXT:    vmin.s32 q4, q0, q3
-; CHECK-NEXT:    vmov r0, s16
+; CHECK-NEXT:    vshr.s32 q3, q0, #15
+; CHECK-NEXT:    vmov r0, s12
 ; CHECK-NEXT:    vmov.16 q0[0], r0
-; CHECK-NEXT:    vmov r0, s17
+; CHECK-NEXT:    vmov r0, s13
 ; CHECK-NEXT:    vmov.16 q0[1], r0
-; CHECK-NEXT:    vmov r0, s18
+; CHECK-NEXT:    vmov r0, s14
 ; CHECK-NEXT:    vmov.16 q0[2], r0
-; CHECK-NEXT:    vmov r0, s19
+; CHECK-NEXT:    vmov r0, s15
 ; CHECK-NEXT:    vmov.16 q0[3], r0
 ; CHECK-NEXT:    vmov.u16 r0, q2[6]
-; CHECK-NEXT:    vmov q4[2], q4[0], r1, r0
+; CHECK-NEXT:    vmov q3[2], q3[0], r1, r0
 ; CHECK-NEXT:    vmov.u16 r0, q2[7]
 ; CHECK-NEXT:    vmov.u16 r1, q2[5]
-; CHECK-NEXT:    vmov q4[3], q4[1], r1, r0
+; CHECK-NEXT:    vmov q3[3], q3[1], r1, r0
 ; CHECK-NEXT:    vmov.u16 r0, q1[6]
 ; CHECK-NEXT:    vmov.u16 r1, q1[4]
 ; CHECK-NEXT:    vmov q2[2], q2[0], r1, r0
 ; CHECK-NEXT:    vmov.u16 r0, q1[7]
 ; CHECK-NEXT:    vmov.u16 r1, q1[5]
 ; CHECK-NEXT:    vmov q2[3], q2[1], r1, r0
-; CHECK-NEXT:    vmullb.s16 q1, q2, q4
+; CHECK-NEXT:    vmullb.s16 q1, q2, q3
 ; CHECK-NEXT:    vshl.i32 q1, q1, #10
 ; CHECK-NEXT:    vshr.s32 q1, q1, #10
 ; CHECK-NEXT:    vshr.s32 q1, q1, #15
-; CHECK-NEXT:    vmin.s32 q1, q1, q3
 ; CHECK-NEXT:    vmov r0, s4
 ; CHECK-NEXT:    vmov.16 q0[4], r0
 ; CHECK-NEXT:    vmov r0, s5
@@ -124,7 +119,6 @@ define arm_aapcs_vfpcc <8 x i16> @vqdmulh_i16_c(<8 x i16> %s0, <8 x i16> %s1) {
 ; CHECK-NEXT:    vmov.16 q0[6], r0
 ; CHECK-NEXT:    vmov r0, s7
 ; CHECK-NEXT:    vmov.16 q0[7], r0
-; CHECK-NEXT:    vpop {d8, d9}
 ; CHECK-NEXT:    bx lr
 entry:
   %l2 = sext <8 x i16> %s0 to <8 x i22>

From d29ae443aa4028ca9cc274cd1496f7d80f34a38a Mon Sep 17 00:00:00 2001
From: David Green <david.green@arm.com>
Date: Thu, 10 Jun 2021 21:53:04 +0100
Subject: [PATCH 316/318] [ARM] Fix Changed status in
 MVEGatherScatterLoweringPass.

Now that we are calling SimplifyInstructionsInBlock, make sure we update
Changed when it reports alterations.
---
 llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp b/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp
index 195622cfd586..56823735e2d9 100644
--- a/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp
+++ b/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp
@@ -1166,7 +1166,7 @@ bool MVEGatherScatterLowering::runOnFunction(Function &F) {
   bool Changed = false;
 
   for (BasicBlock &BB : F) {
-    SimplifyInstructionsInBlock(&BB);
+    Changed |= SimplifyInstructionsInBlock(&BB);
 
     for (Instruction &I : BB) {
       IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I);

From b7c7b42db1d16c4cab595bf53c62a70a6a505e0e Mon Sep 17 00:00:00 2001
From: David Green <david.green@arm.com>
Date: Thu, 29 Apr 2021 07:44:04 +0100
Subject: [PATCH 317/318] [ARM] Use just ARM::t2B in ARMBlockPlacementPass

The ARMConstantIsland pass will convert any t2B to tB if they are within
range after it has added or moved any constant pools. They don't need to
be deliberately converted beforehand, and it doesn't deal with needing
to convert tB to t2B very well.
---
 llvm/lib/Target/ARM/ARMBlockPlacement.cpp    | 7 ++-----
 llvm/test/CodeGen/Thumb2/block-placement.mir | 4 ++--
 2 files changed, 4 insertions(+), 7 deletions(-)

diff --git a/llvm/lib/Target/ARM/ARMBlockPlacement.cpp b/llvm/lib/Target/ARM/ARMBlockPlacement.cpp
index 581b4b9857af..9ba16003a97a 100644
--- a/llvm/lib/Target/ARM/ARMBlockPlacement.cpp
+++ b/llvm/lib/Target/ARM/ARMBlockPlacement.cpp
@@ -145,8 +145,7 @@ bool ARMBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
              It++) {
           MachineBasicBlock *MBB = &*It;
           for (auto &Terminator : MBB->terminators()) {
-            if (Terminator.getOpcode() != ARM::t2LoopEnd &&
-                Terminator.getOpcode() != ARM::t2LoopEndDec)
+            if (Terminator.getOpcode() != ARM::t2LoopEndDec)
               continue;
             MachineBasicBlock *LETarget = Terminator.getOperand(2).getMBB();
             // The LE will become forwards branching if it branches to LoopExit
@@ -204,10 +203,8 @@ void ARMBlockPlacement::moveBasicBlock(MachineBasicBlock *BB,
     if (!Terminator.isUnconditionalBranch()) {
       // The BB doesn't have an unconditional branch so it relied on
       // fall-through. Fix by adding an unconditional branch to the moved BB.
-      unsigned BrOpc =
-          BBUtils->isBBInRange(&Terminator, To, 254) ? ARM::tB : ARM::t2B;
       MachineInstrBuilder MIB =
-          BuildMI(From, Terminator.getDebugLoc(), TII->get(BrOpc));
+          BuildMI(From, Terminator.getDebugLoc(), TII->get(ARM::t2B));
       MIB.addMBB(To);
       MIB.addImm(ARMCC::CondCodes::AL);
       MIB.addReg(ARM::NoRegister);
diff --git a/llvm/test/CodeGen/Thumb2/block-placement.mir b/llvm/test/CodeGen/Thumb2/block-placement.mir
index ed4a0a6b493d..c38223baba76 100644
--- a/llvm/test/CodeGen/Thumb2/block-placement.mir
+++ b/llvm/test/CodeGen/Thumb2/block-placement.mir
@@ -48,7 +48,7 @@ body:             |
   ; CHECK: bb.2:
   ; CHECK:   successors: %bb.3(0x80000000)
   ; CHECK:   t2WhileLoopStart killed renamable $r0, %bb.1, implicit-def dead $cpsr
-  ; CHECK:   tB %bb.3, 14 /* CC::al */, $noreg
+  ; CHECK:   t2B %bb.3, 14 /* CC::al */, $noreg
   ; CHECK: bb.1:
   ; CHECK:   frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc
   ; CHECK: bb.3:
@@ -145,7 +145,7 @@ body:             |
   ; CHECK:   $lr = tMOVr $r0, 14 /* CC::al */, $noreg
   ; CHECK:   renamable $r0 = t2ADDrs killed renamable $r2, killed $r0, 18, 14 /* CC::al */, $noreg, $noreg
   ; CHECK:   t2WhileLoopStart killed renamable $lr, %bb.1, implicit-def dead $cpsr
-  ; CHECK:   tB %bb.3, 14 /* CC::al */, $noreg
+  ; CHECK:   t2B %bb.3, 14 /* CC::al */, $noreg
   ; CHECK: bb.1:
   ; CHECK:   successors: %bb.4(0x80000000)
   ; CHECK:   tCMPi8 renamable $r1, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr

From fed41342a82f5a3a9201819a82bf7a48313e296b Mon Sep 17 00:00:00 2001
From: Tom Stellard <tstellar@redhat.com>
Date: Mon, 28 Jun 2021 09:23:38 -0700
Subject: [PATCH 318/318] Revert "Revert "[Coverage] Fix branch coverage
 merging in FunctionCoverageSummary::get() for instantiation""

This reverts commit 33d312b2d731507327252fd597bac1b738870330.

The original patch was correct, so we need to restore it in the
release branch.
---
 llvm/test/tools/llvm-cov/branch-templates.cpp | 16 +++++++++++++++-
 llvm/tools/llvm-cov/CoverageSummaryInfo.cpp   |  6 +-----
 llvm/tools/llvm-cov/CoverageSummaryInfo.h     |  5 +++++
 3 files changed, 21 insertions(+), 6 deletions(-)

diff --git a/llvm/test/tools/llvm-cov/branch-templates.cpp b/llvm/test/tools/llvm-cov/branch-templates.cpp
index 750dc7bd58f2..4797428f8835 100644
--- a/llvm/test/tools/llvm-cov/branch-templates.cpp
+++ b/llvm/test/tools/llvm-cov/branch-templates.cpp
@@ -1,9 +1,9 @@
 // RUN: llvm-profdata merge %S/Inputs/branch-templates.proftext -o %t.profdata
 // RUN: llvm-cov show --show-expansions --show-branches=count %S/Inputs/branch-templates.o32l -instr-profile %t.profdata -path-equivalence=/tmp,%S %s | FileCheck %s
 // RUN: llvm-cov report --show-branch-summary %S/Inputs/branch-templates.o32l -instr-profile %t.profdata -show-functions -path-equivalence=/tmp,%S %s | FileCheck %s -check-prefix=REPORT
+// RUN: llvm-cov report --show-branch-summary %S/Inputs/branch-templates.o32l -instr-profile %t.profdata -path-equivalence=/tmp,%S %s | FileCheck %s -check-prefix=REPORTFILE
 
 #include <stdio.h>
-
 template<typename T>
 void unused(T x) {
   return;
@@ -45,3 +45,17 @@ int main() {
 // REPORT-NEXT: _Z4funcIfEiT_                     5       2  60.00%         7       3  57.14%         2       1  50.00%
 // REPORT-NEXT: ---
 // REPORT-NEXT: TOTAL                            22       7  68.18%        31      11  64.52%        12       6  50.00%
+
+// Make sure the covered branch tally for the function instantiation group is
+// merged to reflect maximum branch coverage of a single instantiation, just
+// like what is done for lines and regions. Also, the total branch tally
+// summary for an instantiation group should agree with the total number of
+// branches in the definition (In this case, 2 and 6 for func<>() and main(),
+// respectively).  This is returned by: FunctionCoverageSummary::get(const
+// InstantiationGroup &Group, ...)
+
+// REPORTFILE: Filename                      Regions    Missed Regions     Cover   Functions  Missed Functions  Executed       Lines      Missed Lines     Cover    Branches   Missed Branches     Cover
+// REPORTFILE-NEXT: ---
+// REPORTFILE-NEXT: branch-templates.cpp          12                 3    75.00%           2                 0   100.00%          17                 4    76.47%           8                 4    50.00%
+// REPORTFILE-NEXT: ---
+// REPORTFILE-NEXT: TOTAL                              12                 3    75.00%           2                 0   100.00%          17                 4    76.47%           8                 4    50.00%
diff --git a/llvm/tools/llvm-cov/CoverageSummaryInfo.cpp b/llvm/tools/llvm-cov/CoverageSummaryInfo.cpp
index 4a0a86168908..10e059adeb7d 100644
--- a/llvm/tools/llvm-cov/CoverageSummaryInfo.cpp
+++ b/llvm/tools/llvm-cov/CoverageSummaryInfo.cpp
@@ -100,11 +100,7 @@ FunctionCoverageSummary::get(const InstantiationGroup &Group,
   for (const auto &FCS : Summaries.drop_front()) {
     Summary.RegionCoverage.merge(FCS.RegionCoverage);
     Summary.LineCoverage.merge(FCS.LineCoverage);
-
-    // Sum branch coverage across instantiation groups for the summary rather
-    // than "merge" the maximum count. This is a clearer view into whether all
-    // created branches are covered.
-    Summary.BranchCoverage += FCS.BranchCoverage;
+    Summary.BranchCoverage.merge(FCS.BranchCoverage);
   }
   return Summary;
 }
diff --git a/llvm/tools/llvm-cov/CoverageSummaryInfo.h b/llvm/tools/llvm-cov/CoverageSummaryInfo.h
index 4bc1c24a079f..62e7cad1012b 100644
--- a/llvm/tools/llvm-cov/CoverageSummaryInfo.h
+++ b/llvm/tools/llvm-cov/CoverageSummaryInfo.h
@@ -123,6 +123,11 @@ class BranchCoverageInfo {
     return *this;
   }
 
+  void merge(const BranchCoverageInfo &RHS) {
+    Covered = std::max(Covered, RHS.Covered);
+    NumBranches = std::max(NumBranches, RHS.NumBranches);
+  }
+
   size_t getCovered() const { return Covered; }
 
   size_t getNumBranches() const { return NumBranches; }