diff --git a/.github/workflows/pr-code-format.yml b/.github/workflows/pr-code-format.yml index a4d2be092de3c2..3903fd883a12c7 100644 --- a/.github/workflows/pr-code-format.yml +++ b/.github/workflows/pr-code-format.yml @@ -54,7 +54,7 @@ jobs: - name: Install clang-format uses: aminya/setup-cpp@v1 with: - clangformat: 18.1.1 + clangformat: 18.1.7 - name: Setup Python env uses: actions/setup-python@v5 diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 69aea6c21ad393..0155c2d8fb0bef 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -268,6 +268,9 @@ Resolutions to C++ Defect Reports - Clang now requires a template argument list after a template keyword. (`CWG96: Syntactic disambiguation using the template keyword `_). +- Clang now considers ``noexcept(typeid(expr))`` more carefully, instead of always assuming that ``std::bad_typeid`` can be thrown. + (`CWG2191: Incorrect result for noexcept(typeid(v)) `_). + C Language Changes ------------------ @@ -859,6 +862,8 @@ Bug Fixes to C++ Support (#GH88081), (#GH89496), (#GH90669) and (#GH91633). - Fixed handling of brace ellison when building deduction guides. (#GH64625), (#GH83368). - Clang now instantiates local constexpr functions eagerly for constant evaluators. (#GH35052), (#GH94849) +- Fixed a failed assertion when attempting to convert an integer representing the difference + between the addresses of two labels (a GNU extension) to a pointer within a constant expression. (#GH95366). 
Bug Fixes to AST Handling ^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/include/clang/AST/ExprCXX.h b/clang/include/clang/AST/ExprCXX.h index d2e8d936563595..c2feac525c1ea6 100644 --- a/clang/include/clang/AST/ExprCXX.h +++ b/clang/include/clang/AST/ExprCXX.h @@ -919,6 +919,10 @@ class CXXTypeidExpr : public Expr { reinterpret_cast(&const_cast(this)->Operand); return const_child_range(begin, begin + 1); } + + /// Whether this is of a form like "typeid(*ptr)" that can throw a + /// std::bad_typeid if a pointer is a null pointer ([expr.typeid]p2) + bool hasNullCheck() const; }; /// A member reference to an MSPropertyDecl. diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 15f62c5c1a6ab4..49b44893c25f27 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -7036,6 +7036,12 @@ def as_secure_log_file : Separate<["-"], "as-secure-log-file">, } // let Visibility = [CC1Option, CC1AsOption] +let Visibility = [CC1Option, FC1Option] in { +def mlink_builtin_bitcode : Separate<["-"], "mlink-builtin-bitcode">, + HelpText<"Link and internalize needed symbols from the given bitcode file " + "before performing optimizations.">; +} // let Visibility = [CC1Option, FC1Option] + let Visibility = [CC1Option] in { def llvm_verify_each : Flag<["-"], "llvm-verify-each">, @@ -7138,9 +7144,6 @@ defm constructor_aliases : BoolMOption<"constructor-aliases", " emitting complete constructors and destructors as aliases when possible">>; def mlink_bitcode_file : Separate<["-"], "mlink-bitcode-file">, HelpText<"Link the given bitcode file before performing optimizations.">; -def mlink_builtin_bitcode : Separate<["-"], "mlink-builtin-bitcode">, - HelpText<"Link and internalize needed symbols from the given bitcode file " - "before performing optimizations.">; defm link_builtin_bitcode_postopt: BoolMOption<"link-builtin-bitcode-postopt", CodeGenOpts<"LinkBitcodePostopt">, DefaultFalse, 
PosFlag(this)->isPotentiallyEvaluated(); + case CXXTypeidExprClass: { + const auto *TE = cast(this); + if (!TE->isPotentiallyEvaluated()) + return false; + + // If this type id expression can throw because of a null pointer, that is a + // side-effect independent of if the operand has a side-effect + if (IncludePossibleEffects && TE->hasNullCheck()) + return true; + + break; + } case CXXConstructExprClass: case CXXTemporaryObjectExprClass: { diff --git a/clang/lib/AST/ExprCXX.cpp b/clang/lib/AST/ExprCXX.cpp index 2abc0acbfde3bc..8d2a1b5611ccc6 100644 --- a/clang/lib/AST/ExprCXX.cpp +++ b/clang/lib/AST/ExprCXX.cpp @@ -166,6 +166,53 @@ QualType CXXTypeidExpr::getTypeOperand(ASTContext &Context) const { Operand.get()->getType().getNonReferenceType(), Quals); } +static bool isGLValueFromPointerDeref(const Expr *E) { + E = E->IgnoreParens(); + + if (const auto *CE = dyn_cast(E)) { + if (!CE->getSubExpr()->isGLValue()) + return false; + return isGLValueFromPointerDeref(CE->getSubExpr()); + } + + if (const auto *OVE = dyn_cast(E)) + return isGLValueFromPointerDeref(OVE->getSourceExpr()); + + if (const auto *BO = dyn_cast(E)) + if (BO->getOpcode() == BO_Comma) + return isGLValueFromPointerDeref(BO->getRHS()); + + if (const auto *ACO = dyn_cast(E)) + return isGLValueFromPointerDeref(ACO->getTrueExpr()) || + isGLValueFromPointerDeref(ACO->getFalseExpr()); + + // C++11 [expr.sub]p1: + // The expression E1[E2] is identical (by definition) to *((E1)+(E2)) + if (isa(E)) + return true; + + if (const auto *UO = dyn_cast(E)) + if (UO->getOpcode() == UO_Deref) + return true; + + return false; +} + +bool CXXTypeidExpr::hasNullCheck() const { + if (!isPotentiallyEvaluated()) + return false; + + // C++ [expr.typeid]p2: + // If the glvalue expression is obtained by applying the unary * operator to + // a pointer and the pointer is a null pointer value, the typeid expression + // throws the std::bad_typeid exception. + // + // However, this paragraph's intent is not clear. 
We choose a very generous + // interpretation which implores us to consider comma operators, conditional + // operators, parentheses and other such constructs. + return isGLValueFromPointerDeref(getExprOperand()); +} + QualType CXXUuidofExpr::getTypeOperand(ASTContext &Context) const { assert(isTypeOperand() && "Cannot call getTypeOperand for __uuidof(expr)"); Qualifiers Quals; diff --git a/clang/lib/AST/ExprConstShared.h b/clang/lib/AST/ExprConstShared.h index a97eac85abc69e..9decd47e417674 100644 --- a/clang/lib/AST/ExprConstShared.h +++ b/clang/lib/AST/ExprConstShared.h @@ -14,6 +14,9 @@ #ifndef LLVM_CLANG_LIB_AST_EXPRCONSTSHARED_H #define LLVM_CLANG_LIB_AST_EXPRCONSTSHARED_H +namespace llvm { +class APFloat; +} namespace clang { class QualType; class LangOptions; @@ -56,4 +59,8 @@ enum class GCCTypeClass { GCCTypeClass EvaluateBuiltinClassifyType(QualType T, const LangOptions &LangOpts); +void HandleComplexComplexMul(llvm::APFloat A, llvm::APFloat B, llvm::APFloat C, + llvm::APFloat D, llvm::APFloat &ResR, + llvm::APFloat &ResI); + #endif diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index 7178f081d9cf35..3a6c8b4f82cca2 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -9325,6 +9325,13 @@ bool PointerExprEvaluator::VisitCastExpr(const CastExpr *E) { Result.IsNullPtr = false; return true; } else { + // In rare instances, the value isn't an lvalue. + // For example, when the value is the difference between the addresses of + // two labels. We reject that as a constant expression because we can't + // compute a valid offset to convert into a pointer. + if (!Value.isLValue()) + return false; + // Cast is of an lvalue, no need to change value. 
Result.setFrom(Info.Ctx, Value); return true; @@ -15126,6 +15133,62 @@ bool ComplexExprEvaluator::VisitCastExpr(const CastExpr *E) { llvm_unreachable("unknown cast resulting in complex value"); } +void HandleComplexComplexMul(APFloat A, APFloat B, APFloat C, APFloat D, + APFloat &ResR, APFloat &ResI) { + // This is an implementation of complex multiplication according to the + // constraints laid out in C11 Annex G. The implementation uses the + // following naming scheme: + // (a + ib) * (c + id) + + APFloat AC = A * C; + APFloat BD = B * D; + APFloat AD = A * D; + APFloat BC = B * C; + ResR = AC - BD; + ResI = AD + BC; + if (ResR.isNaN() && ResI.isNaN()) { + bool Recalc = false; + if (A.isInfinity() || B.isInfinity()) { + A = APFloat::copySign(APFloat(A.getSemantics(), A.isInfinity() ? 1 : 0), + A); + B = APFloat::copySign(APFloat(B.getSemantics(), B.isInfinity() ? 1 : 0), + B); + if (C.isNaN()) + C = APFloat::copySign(APFloat(C.getSemantics()), C); + if (D.isNaN()) + D = APFloat::copySign(APFloat(D.getSemantics()), D); + Recalc = true; + } + if (C.isInfinity() || D.isInfinity()) { + C = APFloat::copySign(APFloat(C.getSemantics(), C.isInfinity() ? 1 : 0), + C); + D = APFloat::copySign(APFloat(D.getSemantics(), D.isInfinity() ? 
1 : 0), + D); + if (A.isNaN()) + A = APFloat::copySign(APFloat(A.getSemantics()), A); + if (B.isNaN()) + B = APFloat::copySign(APFloat(B.getSemantics()), B); + Recalc = true; + } + if (!Recalc && (AC.isInfinity() || BD.isInfinity() || AD.isInfinity() || + BC.isInfinity())) { + if (A.isNaN()) + A = APFloat::copySign(APFloat(A.getSemantics()), A); + if (B.isNaN()) + B = APFloat::copySign(APFloat(B.getSemantics()), B); + if (C.isNaN()) + C = APFloat::copySign(APFloat(C.getSemantics()), C); + if (D.isNaN()) + D = APFloat::copySign(APFloat(D.getSemantics()), D); + Recalc = true; + } + if (Recalc) { + ResR = APFloat::getInf(A.getSemantics()) * (A * C - B * D); + ResI = APFloat::getInf(A.getSemantics()) * (A * D + B * C); + } + } +} + bool ComplexExprEvaluator::VisitBinaryOperator(const BinaryOperator *E) { if (E->isPtrMemOp() || E->isAssignmentOp() || E->getOpcode() == BO_Comma) return ExprEvaluatorBaseTy::VisitBinaryOperator(E); @@ -15225,55 +15288,7 @@ bool ComplexExprEvaluator::VisitBinaryOperator(const BinaryOperator *E) { !handleFloatFloatBinOp(Info, E, ResI, BO_Mul, B)) return false; } else { - // In the fully general case, we need to handle NaNs and infinities - // robustly. - APFloat AC = A * C; - APFloat BD = B * D; - APFloat AD = A * D; - APFloat BC = B * C; - ResR = AC - BD; - ResI = AD + BC; - if (ResR.isNaN() && ResI.isNaN()) { - bool Recalc = false; - if (A.isInfinity() || B.isInfinity()) { - A = APFloat::copySign( - APFloat(A.getSemantics(), A.isInfinity() ? 1 : 0), A); - B = APFloat::copySign( - APFloat(B.getSemantics(), B.isInfinity() ? 1 : 0), B); - if (C.isNaN()) - C = APFloat::copySign(APFloat(C.getSemantics()), C); - if (D.isNaN()) - D = APFloat::copySign(APFloat(D.getSemantics()), D); - Recalc = true; - } - if (C.isInfinity() || D.isInfinity()) { - C = APFloat::copySign( - APFloat(C.getSemantics(), C.isInfinity() ? 1 : 0), C); - D = APFloat::copySign( - APFloat(D.getSemantics(), D.isInfinity() ? 
1 : 0), D); - if (A.isNaN()) - A = APFloat::copySign(APFloat(A.getSemantics()), A); - if (B.isNaN()) - B = APFloat::copySign(APFloat(B.getSemantics()), B); - Recalc = true; - } - if (!Recalc && (AC.isInfinity() || BD.isInfinity() || - AD.isInfinity() || BC.isInfinity())) { - if (A.isNaN()) - A = APFloat::copySign(APFloat(A.getSemantics()), A); - if (B.isNaN()) - B = APFloat::copySign(APFloat(B.getSemantics()), B); - if (C.isNaN()) - C = APFloat::copySign(APFloat(C.getSemantics()), C); - if (D.isNaN()) - D = APFloat::copySign(APFloat(D.getSemantics()), D); - Recalc = true; - } - if (Recalc) { - ResR = APFloat::getInf(A.getSemantics()) * (A * C - B * D); - ResI = APFloat::getInf(A.getSemantics()) * (A * D + B * C); - } - } + HandleComplexComplexMul(A, B, C, D, ResR, ResI); } } else { ComplexValue LHS = Result; diff --git a/clang/lib/AST/Interp/ByteCodeExprGen.cpp b/clang/lib/AST/Interp/ByteCodeExprGen.cpp index e61c0a70a0d8aa..6ffa91d81d0e13 100644 --- a/clang/lib/AST/Interp/ByteCodeExprGen.cpp +++ b/clang/lib/AST/Interp/ByteCodeExprGen.cpp @@ -876,6 +876,22 @@ bool ByteCodeExprGen::VisitComplexBinOp(const BinaryOperator *E) { if (const auto *AT = RHSType->getAs()) RHSType = AT->getValueType(); + // For ComplexComplex Mul, we have special ops to make their implementation + // easier. + BinaryOperatorKind Op = E->getOpcode(); + if (Op == BO_Mul && LHSType->isAnyComplexType() && + RHSType->isAnyComplexType()) { + assert(classifyPrim(LHSType->getAs()->getElementType()) == + classifyPrim(RHSType->getAs()->getElementType())); + PrimType ElemT = + classifyPrim(LHSType->getAs()->getElementType()); + if (!this->visit(LHS)) + return false; + if (!this->visit(RHS)) + return false; + return this->emitMulc(ElemT, E); + } + // Evaluate LHS and save value to LHSOffset. 
bool LHSIsComplex; unsigned LHSOffset; @@ -919,22 +935,22 @@ bool ByteCodeExprGen::VisitComplexBinOp(const BinaryOperator *E) { // For both LHS and RHS, either load the value from the complex pointer, or // directly from the local variable. For index 1 (i.e. the imaginary part), // just load 0 and do the operation anyway. - auto loadComplexValue = [this](bool IsComplex, unsigned ElemIndex, - unsigned Offset, const Expr *E) -> bool { + auto loadComplexValue = [this](bool IsComplex, bool LoadZero, + unsigned ElemIndex, unsigned Offset, + const Expr *E) -> bool { if (IsComplex) { if (!this->emitGetLocal(PT_Ptr, Offset, E)) return false; return this->emitArrayElemPop(classifyComplexElementType(E->getType()), ElemIndex, E); } - if (ElemIndex == 0) + if (ElemIndex == 0 || !LoadZero) return this->emitGetLocal(classifyPrim(E->getType()), Offset, E); return this->visitZeroInitializer(classifyPrim(E->getType()), E->getType(), E); }; // Now we can get pointers to the LHS and RHS from the offsets above. - BinaryOperatorKind Op = E->getOpcode(); for (unsigned ElemIndex = 0; ElemIndex != 2; ++ElemIndex) { // Result pointer for the store later. if (!this->DiscardResult) { @@ -942,15 +958,14 @@ bool ByteCodeExprGen::VisitComplexBinOp(const BinaryOperator *E) { return false; } - if (!loadComplexValue(LHSIsComplex, ElemIndex, LHSOffset, LHS)) - return false; - - if (!loadComplexValue(RHSIsComplex, ElemIndex, RHSOffset, RHS)) - return false; - // The actual operation. 
switch (Op) { case BO_Add: + if (!loadComplexValue(LHSIsComplex, true, ElemIndex, LHSOffset, LHS)) + return false; + + if (!loadComplexValue(RHSIsComplex, true, ElemIndex, RHSOffset, RHS)) + return false; if (ResultElemT == PT_Float) { if (!this->emitAddf(getRoundingMode(E), E)) return false; @@ -960,6 +975,11 @@ bool ByteCodeExprGen::VisitComplexBinOp(const BinaryOperator *E) { } break; case BO_Sub: + if (!loadComplexValue(LHSIsComplex, true, ElemIndex, LHSOffset, LHS)) + return false; + + if (!loadComplexValue(RHSIsComplex, true, ElemIndex, RHSOffset, RHS)) + return false; if (ResultElemT == PT_Float) { if (!this->emitSubf(getRoundingMode(E), E)) return false; @@ -968,6 +988,21 @@ bool ByteCodeExprGen::VisitComplexBinOp(const BinaryOperator *E) { return false; } break; + case BO_Mul: + if (!loadComplexValue(LHSIsComplex, false, ElemIndex, LHSOffset, LHS)) + return false; + + if (!loadComplexValue(RHSIsComplex, false, ElemIndex, RHSOffset, RHS)) + return false; + + if (ResultElemT == PT_Float) { + if (!this->emitMulf(getRoundingMode(E), E)) + return false; + } else { + if (!this->emitMul(ResultElemT, E)) + return false; + } + break; default: return false; diff --git a/clang/lib/AST/Interp/Interp.h b/clang/lib/AST/Interp/Interp.h index 1ce92798150c5d..44ff50b26133aa 100644 --- a/clang/lib/AST/Interp/Interp.h +++ b/clang/lib/AST/Interp/Interp.h @@ -13,6 +13,7 @@ #ifndef LLVM_CLANG_AST_INTERP_INTERP_H #define LLVM_CLANG_AST_INTERP_INTERP_H +#include "../ExprConstShared.h" #include "Boolean.h" #include "Floating.h" #include "Function.h" @@ -368,6 +369,62 @@ inline bool Mulf(InterpState &S, CodePtr OpPC, llvm::RoundingMode RM) { S.Stk.push(Result); return CheckFloatResult(S, OpPC, Result, Status); } + +template ::T> +inline bool Mulc(InterpState &S, CodePtr OpPC) { + const Pointer &RHS = S.Stk.pop(); + const Pointer &LHS = S.Stk.pop(); + const Pointer &Result = S.Stk.peek(); + + if constexpr (std::is_same_v) { + APFloat A = LHS.atIndex(0).deref().getAPFloat(); + 
APFloat B = LHS.atIndex(1).deref().getAPFloat(); + APFloat C = RHS.atIndex(0).deref().getAPFloat(); + APFloat D = RHS.atIndex(1).deref().getAPFloat(); + + APFloat ResR(A.getSemantics()); + APFloat ResI(A.getSemantics()); + HandleComplexComplexMul(A, B, C, D, ResR, ResI); + + // Copy into the result. + Result.atIndex(0).deref() = Floating(ResR); + Result.atIndex(0).initialize(); + Result.atIndex(1).deref() = Floating(ResI); + Result.atIndex(1).initialize(); + Result.initialize(); + } else { + // Integer element type. + const T &LHSR = LHS.atIndex(0).deref(); + const T &LHSI = LHS.atIndex(1).deref(); + const T &RHSR = RHS.atIndex(0).deref(); + const T &RHSI = RHS.atIndex(1).deref(); + unsigned Bits = LHSR.bitWidth(); + + // real(Result) = (real(LHS) * real(RHS)) - (imag(LHS) * imag(RHS)) + T A; + if (T::mul(LHSR, RHSR, Bits, &A)) + return false; + T B; + if (T::mul(LHSI, RHSI, Bits, &B)) + return false; + if (T::sub(A, B, Bits, &Result.atIndex(0).deref())) + return false; + Result.atIndex(0).initialize(); + + // imag(Result) = (real(LHS) * imag(RHS)) + (imag(LHS) * real(RHS)) + if (T::mul(LHSR, RHSI, Bits, &A)) + return false; + if (T::mul(LHSI, RHSR, Bits, &B)) + return false; + if (T::add(A, B, Bits, &Result.atIndex(1).deref())) + return false; + Result.atIndex(1).initialize(); + Result.initialize(); + } + + return true; +} + /// 1) Pops the RHS from the stack. /// 2) Pops the LHS from the stack. 
/// 3) Pushes 'LHS & RHS' on the stack diff --git a/clang/lib/AST/Interp/Opcodes.td b/clang/lib/AST/Interp/Opcodes.td index df362efd8b58b2..aa07858fdb3c23 100644 --- a/clang/lib/AST/Interp/Opcodes.td +++ b/clang/lib/AST/Interp/Opcodes.td @@ -526,6 +526,10 @@ def Sub : AluOpcode; def Subf : FloatOpcode; def Mul : AluOpcode; def Mulf : FloatOpcode; +def Mulc : Opcode { + let Types = [NumberTypeClass]; + let HasGroup = 1; +} def Rem : IntegerOpcode; def Div : IntegerOpcode; def Divf : FloatOpcode; diff --git a/clang/lib/CodeGen/CGCXXABI.h b/clang/lib/CodeGen/CGCXXABI.h index c7eccbd0095a94..104a20db8efaf2 100644 --- a/clang/lib/CodeGen/CGCXXABI.h +++ b/clang/lib/CodeGen/CGCXXABI.h @@ -274,8 +274,7 @@ class CGCXXABI { getAddrOfCXXCatchHandlerType(QualType Ty, QualType CatchHandlerType) = 0; virtual CatchTypeInfo getCatchAllTypeInfo(); - virtual bool shouldTypeidBeNullChecked(bool IsDeref, - QualType SrcRecordTy) = 0; + virtual bool shouldTypeidBeNullChecked(QualType SrcRecordTy) = 0; virtual void EmitBadTypeidCall(CodeGenFunction &CGF) = 0; virtual llvm::Value *EmitTypeid(CodeGenFunction &CGF, QualType SrcRecordTy, Address ThisPtr, diff --git a/clang/lib/CodeGen/CGDebugInfo.cpp b/clang/lib/CodeGen/CGDebugInfo.cpp index 88e075bc6d750a..572ba84d22ef58 100644 --- a/clang/lib/CodeGen/CGDebugInfo.cpp +++ b/clang/lib/CodeGen/CGDebugInfo.cpp @@ -4874,40 +4874,6 @@ llvm::DILocalVariable *CGDebugInfo::EmitDeclare(const VarDecl *VD, return D; } -llvm::DIType *CGDebugInfo::CreateBindingDeclType(const BindingDecl *BD) { - llvm::DIFile *Unit = getOrCreateFile(BD->getLocation()); - - // If the declaration is bound to a bitfield struct field, its type may have a - // size that is different from its deduced declaration type's. 
- if (const MemberExpr *ME = dyn_cast(BD->getBinding())) { - if (const FieldDecl *FD = dyn_cast(ME->getMemberDecl())) { - if (FD->isBitField()) { - ASTContext &Context = CGM.getContext(); - const CGRecordLayout &RL = - CGM.getTypes().getCGRecordLayout(FD->getParent()); - const CGBitFieldInfo &Info = RL.getBitFieldInfo(FD); - - // Find an integer type with the same bitwidth as the bitfield size. If - // no suitable type is present in the target, give up on producing debug - // information as it would be wrong. It is certainly possible to produce - // correct debug info, but the logic isn't currently implemented. - uint64_t BitfieldSizeInBits = Info.Size; - QualType IntTy = - Context.getIntTypeForBitwidth(BitfieldSizeInBits, Info.IsSigned); - if (IntTy.isNull()) - return nullptr; - Qualifiers Quals = BD->getType().getQualifiers(); - QualType FinalTy = Context.getQualifiedType(IntTy, Quals); - llvm::DIType *Ty = getOrCreateType(FinalTy, Unit); - assert(Ty); - return Ty; - } - } - } - - return getOrCreateType(BD->getType(), Unit); -} - llvm::DILocalVariable *CGDebugInfo::EmitDeclare(const BindingDecl *BD, llvm::Value *Storage, std::optional ArgNo, @@ -4922,7 +4888,8 @@ llvm::DILocalVariable *CGDebugInfo::EmitDeclare(const BindingDecl *BD, if (isa(BD->getBinding())) return nullptr; - llvm::DIType *Ty = CreateBindingDeclType(BD); + llvm::DIFile *Unit = getOrCreateFile(BD->getLocation()); + llvm::DIType *Ty = getOrCreateType(BD->getType(), Unit); // If there is no debug info for this type then do not emit debug info // for this variable. @@ -4948,7 +4915,6 @@ llvm::DILocalVariable *CGDebugInfo::EmitDeclare(const BindingDecl *BD, unsigned Column = getColumnNumber(BD->getLocation()); StringRef Name = BD->getName(); auto *Scope = cast(LexicalBlockStack.back()); - llvm::DIFile *Unit = getOrCreateFile(BD->getLocation()); // Create the descriptor for the variable. 
llvm::DILocalVariable *D = DBuilder.createAutoVariable( Scope, Name, Unit, Line, Ty, CGM.getLangOpts().Optimize, @@ -4962,13 +4928,29 @@ llvm::DILocalVariable *CGDebugInfo::EmitDeclare(const BindingDecl *BD, const ASTRecordLayout &layout = CGM.getContext().getASTRecordLayout(parent); const uint64_t fieldOffset = layout.getFieldOffset(fieldIndex); - - if (fieldOffset != 0) { - // Currently if the field offset is not a multiple of byte, the produced - // location would not be accurate. Therefore give up. - if (fieldOffset % CGM.getContext().getCharWidth() != 0) - return nullptr; - + if (FD->isBitField()) { + const CGRecordLayout &RL = + CGM.getTypes().getCGRecordLayout(FD->getParent()); + const CGBitFieldInfo &Info = RL.getBitFieldInfo(FD); + // Use DW_OP_plus_uconst to adjust to the start of the bitfield + // storage. + if (!Info.StorageOffset.isZero()) { + Expr.push_back(llvm::dwarf::DW_OP_plus_uconst); + Expr.push_back(Info.StorageOffset.getQuantity()); + } + // Use LLVM_extract_bits to extract the appropriate bits from this + // bitfield. + Expr.push_back(Info.IsSigned + ? llvm::dwarf::DW_OP_LLVM_extract_bits_sext + : llvm::dwarf::DW_OP_LLVM_extract_bits_zext); + Expr.push_back(Info.Offset); + // If we have an oversized bitfield then the value won't be more than + // the size of the type. 
+ const uint64_t TypeSize = CGM.getContext().getTypeSize(BD->getType()); + Expr.push_back(std::min((uint64_t)Info.Size, TypeSize)); + } else if (fieldOffset != 0) { + assert(fieldOffset % CGM.getContext().getCharWidth() == 0 && + "Unexpected non-bitfield with non-byte-aligned offset"); Expr.push_back(llvm::dwarf::DW_OP_plus_uconst); Expr.push_back( CGM.getContext().toCharUnitsFromBits(fieldOffset).getQuantity()); diff --git a/clang/lib/CodeGen/CGDebugInfo.h b/clang/lib/CodeGen/CGDebugInfo.h index da466837aa3c34..272c8d6e75965f 100644 --- a/clang/lib/CodeGen/CGDebugInfo.h +++ b/clang/lib/CodeGen/CGDebugInfo.h @@ -338,9 +338,6 @@ class CGDebugInfo { llvm::DIScope *RecordTy, const RecordDecl *RD); - /// Create type for binding declarations. - llvm::DIType *CreateBindingDeclType(const BindingDecl *BD); - /// Create an anonnymous zero-size separator for bit-field-decl if needed on /// the target. llvm::DIDerivedType *createBitFieldSeparatorIfNeeded( diff --git a/clang/lib/CodeGen/CGExprCXX.cpp b/clang/lib/CodeGen/CGExprCXX.cpp index 3c4f59fc765fed..8eb6ab7381acbc 100644 --- a/clang/lib/CodeGen/CGExprCXX.cpp +++ b/clang/lib/CodeGen/CGExprCXX.cpp @@ -2143,40 +2143,9 @@ void CodeGenFunction::EmitCXXDeleteExpr(const CXXDeleteExpr *E) { } } -static bool isGLValueFromPointerDeref(const Expr *E) { - E = E->IgnoreParens(); - - if (const auto *CE = dyn_cast(E)) { - if (!CE->getSubExpr()->isGLValue()) - return false; - return isGLValueFromPointerDeref(CE->getSubExpr()); - } - - if (const auto *OVE = dyn_cast(E)) - return isGLValueFromPointerDeref(OVE->getSourceExpr()); - - if (const auto *BO = dyn_cast(E)) - if (BO->getOpcode() == BO_Comma) - return isGLValueFromPointerDeref(BO->getRHS()); - - if (const auto *ACO = dyn_cast(E)) - return isGLValueFromPointerDeref(ACO->getTrueExpr()) || - isGLValueFromPointerDeref(ACO->getFalseExpr()); - - // C++11 [expr.sub]p1: - // The expression E1[E2] is identical (by definition) to *((E1)+(E2)) - if (isa(E)) - return true; - - if (const auto 
*UO = dyn_cast(E)) - if (UO->getOpcode() == UO_Deref) - return true; - - return false; -} - static llvm::Value *EmitTypeidFromVTable(CodeGenFunction &CGF, const Expr *E, - llvm::Type *StdTypeInfoPtrTy) { + llvm::Type *StdTypeInfoPtrTy, + bool HasNullCheck) { // Get the vtable pointer. Address ThisPtr = CGF.EmitLValue(E).getAddress(); @@ -2189,16 +2158,11 @@ static llvm::Value *EmitTypeidFromVTable(CodeGenFunction &CGF, const Expr *E, CGF.EmitTypeCheck(CodeGenFunction::TCK_DynamicOperation, E->getExprLoc(), ThisPtr, SrcRecordTy); - // C++ [expr.typeid]p2: - // If the glvalue expression is obtained by applying the unary * operator to - // a pointer and the pointer is a null pointer value, the typeid expression - // throws the std::bad_typeid exception. - // - // However, this paragraph's intent is not clear. We choose a very generous - // interpretation which implores us to consider comma operators, conditional - // operators, parentheses and other such constructs. - if (CGF.CGM.getCXXABI().shouldTypeidBeNullChecked( - isGLValueFromPointerDeref(E), SrcRecordTy)) { + // Whether we need an explicit null pointer check. For example, with the + // Microsoft ABI, if this is a call to __RTtypeid, the null pointer check and + // exception throw is inside the __RTtypeid(nullptr) call + if (HasNullCheck && + CGF.CGM.getCXXABI().shouldTypeidBeNullChecked(SrcRecordTy)) { llvm::BasicBlock *BadTypeidBlock = CGF.createBasicBlock("typeid.bad_typeid"); llvm::BasicBlock *EndBlock = CGF.createBasicBlock("typeid.end"); @@ -2244,7 +2208,8 @@ llvm::Value *CodeGenFunction::EmitCXXTypeidExpr(const CXXTypeidExpr *E) { // type) to which the glvalue refers. // If the operand is already most derived object, no need to look up vtable. 
if (E->isPotentiallyEvaluated() && !E->isMostDerived(getContext())) - return EmitTypeidFromVTable(*this, E->getExprOperand(), PtrTy); + return EmitTypeidFromVTable(*this, E->getExprOperand(), PtrTy, + E->hasNullCheck()); QualType OperandTy = E->getExprOperand()->getType(); return MaybeASCast(CGM.GetAddrOfRTTIDescriptor(OperandTy)); diff --git a/clang/lib/CodeGen/ItaniumCXXABI.cpp b/clang/lib/CodeGen/ItaniumCXXABI.cpp index c15176463866a7..5a3e83de625c9a 100644 --- a/clang/lib/CodeGen/ItaniumCXXABI.cpp +++ b/clang/lib/CodeGen/ItaniumCXXABI.cpp @@ -178,7 +178,7 @@ class ItaniumCXXABI : public CodeGen::CGCXXABI { return CatchTypeInfo{getAddrOfRTTIDescriptor(Ty), 0}; } - bool shouldTypeidBeNullChecked(bool IsDeref, QualType SrcRecordTy) override; + bool shouldTypeidBeNullChecked(QualType SrcRecordTy) override; void EmitBadTypeidCall(CodeGenFunction &CGF) override; llvm::Value *EmitTypeid(CodeGenFunction &CGF, QualType SrcRecordTy, Address ThisPtr, @@ -1419,9 +1419,8 @@ static llvm::FunctionCallee getBadTypeidFn(CodeGenFunction &CGF) { return CGF.CGM.CreateRuntimeFunction(FTy, "__cxa_bad_typeid"); } -bool ItaniumCXXABI::shouldTypeidBeNullChecked(bool IsDeref, - QualType SrcRecordTy) { - return IsDeref; +bool ItaniumCXXABI::shouldTypeidBeNullChecked(QualType SrcRecordTy) { + return true; } void ItaniumCXXABI::EmitBadTypeidCall(CodeGenFunction &CGF) { diff --git a/clang/lib/CodeGen/MicrosoftCXXABI.cpp b/clang/lib/CodeGen/MicrosoftCXXABI.cpp index e4f798f6a97d97..9ab634fa6ce2e0 100644 --- a/clang/lib/CodeGen/MicrosoftCXXABI.cpp +++ b/clang/lib/CodeGen/MicrosoftCXXABI.cpp @@ -144,7 +144,7 @@ class MicrosoftCXXABI : public CGCXXABI { return CatchTypeInfo{nullptr, 0x40}; } - bool shouldTypeidBeNullChecked(bool IsDeref, QualType SrcRecordTy) override; + bool shouldTypeidBeNullChecked(QualType SrcRecordTy) override; void EmitBadTypeidCall(CodeGenFunction &CGF) override; llvm::Value *EmitTypeid(CodeGenFunction &CGF, QualType SrcRecordTy, Address ThisPtr, @@ -977,11 +977,9 @@ 
MicrosoftCXXABI::performBaseAdjustment(CodeGenFunction &CGF, Address Value, PolymorphicBase); } -bool MicrosoftCXXABI::shouldTypeidBeNullChecked(bool IsDeref, - QualType SrcRecordTy) { +bool MicrosoftCXXABI::shouldTypeidBeNullChecked(QualType SrcRecordTy) { const CXXRecordDecl *SrcDecl = SrcRecordTy->getAsCXXRecordDecl(); - return IsDeref && - !getContext().getASTRecordLayout(SrcDecl).hasExtendableVFPtr(); + return !getContext().getASTRecordLayout(SrcDecl).hasExtendableVFPtr(); } static llvm::CallBase *emitRTtypeidCall(CodeGenFunction &CGF, diff --git a/clang/lib/Sema/SemaExceptionSpec.cpp b/clang/lib/Sema/SemaExceptionSpec.cpp index 17acfca6b01126..67e0c7c63909ec 100644 --- a/clang/lib/Sema/SemaExceptionSpec.cpp +++ b/clang/lib/Sema/SemaExceptionSpec.cpp @@ -1114,21 +1114,10 @@ static CanThrowResult canTypeidThrow(Sema &S, const CXXTypeidExpr *DC) { if (DC->isTypeOperand()) return CT_Cannot; - Expr *Op = DC->getExprOperand(); - if (Op->isTypeDependent()) + if (DC->isValueDependent()) return CT_Dependent; - const RecordType *RT = Op->getType()->getAs(); - if (!RT) - return CT_Cannot; - - if (!cast(RT->getDecl())->isPolymorphic()) - return CT_Cannot; - - if (Op->Classify(S.Context).isPRValue()) - return CT_Cannot; - - return CT_Can; + return DC->hasNullCheck() ? 
CT_Can : CT_Cannot; } CanThrowResult Sema::canThrow(const Stmt *S) { @@ -1157,8 +1146,9 @@ CanThrowResult Sema::canThrow(const Stmt *S) { } case Expr::CXXTypeidExprClass: - // - a potentially evaluated typeid expression applied to a glvalue - // expression whose type is a polymorphic class type + // - a potentially evaluated typeid expression applied to a (possibly + // parenthesized) built-in unary * operator applied to a pointer to a + // polymorphic class type return canTypeidThrow(*this, cast(S)); // - a potentially evaluated call to a function, member function, function diff --git a/clang/lib/StaticAnalyzer/Core/RangeConstraintManager.cpp b/clang/lib/StaticAnalyzer/Core/RangeConstraintManager.cpp index c6f87b45ab887a..d8c257dbd731e9 100644 --- a/clang/lib/StaticAnalyzer/Core/RangeConstraintManager.cpp +++ b/clang/lib/StaticAnalyzer/Core/RangeConstraintManager.cpp @@ -2333,7 +2333,8 @@ inline ProgramStateRef EquivalenceClass::merge(RangeSet::Factory &F, // // The moment we introduce symbolic casts, this restriction can be // lifted. 
- if (getType() != Other.getType()) + if (getType()->getCanonicalTypeUnqualified() != + Other.getType()->getCanonicalTypeUnqualified()) return State; SymbolSet Members = getClassMembers(State); diff --git a/clang/test/AST/Interp/complex.cpp b/clang/test/AST/Interp/complex.cpp index 003f33e092d25f..7c72087585b27f 100644 --- a/clang/test/AST/Interp/complex.cpp +++ b/clang/test/AST/Interp/complex.cpp @@ -9,6 +9,37 @@ static_assert(&__imag z1 == &__real z1 + 1, ""); static_assert((*(&__imag z1)) == __imag z1, ""); static_assert((*(&__real z1)) == __real z1, ""); + +static_assert(((1.25 + 0.5j) * (0.25 - 0.75j)) == (0.6875 - 0.8125j), ""); +static_assert(((1.25 + 0.5j) * 0.25) == (0.3125 + 0.125j), ""); +static_assert((1.25 * (0.25 - 0.75j)) == (0.3125 - 0.9375j), ""); +constexpr _Complex float InfC = {1.0, __builtin_inf()}; +constexpr _Complex float InfInf = __builtin_inf() + InfC; +static_assert(__real__(InfInf) == __builtin_inf()); +static_assert(__imag__(InfInf) == __builtin_inf()); +static_assert(__builtin_isnan(__real__(InfInf * InfInf))); +static_assert(__builtin_isinf_sign(__imag__(InfInf * InfInf)) == 1); + +static_assert(__builtin_isinf_sign(__real__((__builtin_inf() + 1.0j) * 1.0)) == 1); +static_assert(__builtin_isinf_sign(__imag__((1.0 + InfC) * 1.0)) == 1); +static_assert(__builtin_isinf_sign(__real__(1.0 * (__builtin_inf() + 1.0j))) == 1); +static_assert(__builtin_isinf_sign(__imag__(1.0 * (1.0 + InfC))) == 1); +static_assert(__builtin_isinf_sign(__real__((__builtin_inf() + 1.0j) * (1.0 + 1.0j))) == 1); +static_assert(__builtin_isinf_sign(__real__((1.0 + 1.0j) * (__builtin_inf() + 1.0j))) == 1); +static_assert(__builtin_isinf_sign(__real__((__builtin_inf() + 1.0j) * (__builtin_inf() + 1.0j))) == 1); +static_assert(__builtin_isinf_sign(__real__((1.0 + InfC) * (1.0 + 1.0j))) == -1); +static_assert(__builtin_isinf_sign(__imag__((1.0 + InfC) * (1.0 + 1.0j))) == 1); +static_assert(__builtin_isinf_sign(__real__((1.0 + 1.0j) * (1.0 + InfC))) == -1); 
+static_assert(__builtin_isinf_sign(__imag__((1.0 + 1.0j) * (1.0 + InfC))) == 1); +static_assert(__builtin_isinf_sign(__real__((1.0 + InfC) * (1.0 + InfC))) == -1); +static_assert(__builtin_isinf_sign(__real__(InfInf * InfInf)) == 0); + +constexpr _Complex int IIMA = {1,2}; +constexpr _Complex int IIMB = {10,20}; +constexpr _Complex int IIMC = IIMA * IIMB; +static_assert(__real(IIMC) == -30, ""); +static_assert(__imag(IIMC) == 40, ""); + constexpr _Complex int Comma1 = {1, 2}; constexpr _Complex int Comma2 = (0, Comma1); static_assert(Comma1 == Comma1, ""); diff --git a/clang/test/Analysis/equality_tracking.c b/clang/test/Analysis/equality_tracking.c index e33b95700d0024..877486c9133f46 100644 --- a/clang/test/Analysis/equality_tracking.c +++ b/clang/test/Analysis/equality_tracking.c @@ -28,6 +28,22 @@ void zeroImpliesEquality(int a, int b) { clang_analyzer_eval(b != a); // expected-warning{{TRUE}} } +typedef int I32_A; +typedef int I32_B; +void zeroImpliesEqualityWithTypedef(I32_A a, I32_B b) { + clang_analyzer_eval((a - b) == 0); // expected-warning{{UNKNOWN}} + if ((a - b) == 0) { + clang_analyzer_eval(b != a); // expected-warning{{FALSE}} + clang_analyzer_eval(b == a); // expected-warning{{TRUE}} + clang_analyzer_eval(!(a != b)); // expected-warning{{TRUE}} + clang_analyzer_eval(!(b == a)); // expected-warning{{FALSE}} + return; + } + clang_analyzer_eval((a - b) == 0); // expected-warning{{FALSE}} + clang_analyzer_eval(b == a); // expected-warning{{FALSE}} + clang_analyzer_eval(b != a); // expected-warning{{TRUE}} +} + void zeroImpliesReversedEqual(int a, int b) { clang_analyzer_eval((b - a) == 0); // expected-warning{{UNKNOWN}} if ((b - a) == 0) { diff --git a/clang/test/Analysis/errno-stdlibraryfunctions.c b/clang/test/Analysis/errno-stdlibraryfunctions.c index a28efb764edfd4..657aa37a42670a 100644 --- a/clang/test/Analysis/errno-stdlibraryfunctions.c +++ b/clang/test/Analysis/errno-stdlibraryfunctions.c @@ -75,6 +75,30 @@ void errno_mkdtemp(char *template) { 
} } +typedef char* CHAR_PTR; +void errno_mkdtemp2(CHAR_PTR template) { + CHAR_PTR Dir = mkdtemp(template); + if (Dir == NULL) { + clang_analyzer_eval(errno != 0); // expected-warning{{TRUE}} + if (errno) {} // no warning + } else { + clang_analyzer_eval(Dir == template); // expected-warning{{TRUE}} + if (errno) {} // expected-warning{{An undefined value may be read from 'errno'}} + } +} + +typedef char const* CONST_CHAR_PTR; +void errno_mkdtemp3(CHAR_PTR template) { + CONST_CHAR_PTR Dir = mkdtemp(template); + if (Dir == NULL) { + clang_analyzer_eval(errno != 0); // expected-warning{{TRUE}} + if (errno) {} // no warning + } else { + clang_analyzer_eval(Dir == template); // expected-warning{{TRUE}} + if (errno) {} // expected-warning{{An undefined value may be read from 'errno'}} + } +} + void errno_getcwd(char *Buf, size_t Sz) { char *Path = getcwd(Buf, Sz); if (Sz == 0) { diff --git a/clang/test/CXX/dcl.dcl/dcl.attr/dcl.attr.nodiscard/p2.cpp b/clang/test/CXX/dcl.dcl/dcl.attr/dcl.attr.nodiscard/p2.cpp index e2397c12e2e995..693ca29370cf3f 100644 --- a/clang/test/CXX/dcl.dcl/dcl.attr/dcl.attr.nodiscard/p2.cpp +++ b/clang/test/CXX/dcl.dcl/dcl.attr/dcl.attr.nodiscard/p2.cpp @@ -1,16 +1,20 @@ -// RUN: %clang_cc1 -fsyntax-only -std=c++20 -verify -Wc++20-extensions %s -// RUN: %clang_cc1 -fsyntax-only -std=c++17 -verify -Wc++17-extensions %s -// RUN: %clang_cc1 -fsyntax-only -std=c++11 -verify -DEXT -Wc++17-extensions -Wc++20-extensions %s +// RUN: %clang_cc1 -fsyntax-only -std=c++11 -verify=expected,cxx11,cxx11-17 -pedantic %s +// RUN: %clang_cc1 -fsyntax-only -std=c++17 -verify=expected,cxx11-17,since-cxx17 -pedantic %s +// RUN: %clang_cc1 -fsyntax-only -std=c++20 -verify=expected,since-cxx17 -pedantic %s struct [[nodiscard]] S {}; +// cxx11-warning@-1 {{use of the 'nodiscard' attribute is a C++17 extension}} S get_s(); S& get_s_ref(); enum [[nodiscard]] E {}; +// cxx11-warning@-1 {{use of the 'nodiscard' attribute is a C++17 extension}} E get_e(); [[nodiscard]] int 
get_i(); +// cxx11-warning@-1 {{use of the 'nodiscard' attribute is a C++17 extension}} [[nodiscard]] volatile int &get_vi(); +// cxx11-warning@-1 {{use of the 'nodiscard' attribute is a C++17 extension}} void f() { get_s(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} @@ -27,6 +31,7 @@ void f() { } [[nodiscard]] volatile char &(*fp)(); // expected-warning {{'nodiscard' attribute only applies to functions, classes, or enumerations}} +// cxx11-warning@-1 {{use of the 'nodiscard' attribute is a C++17 extension}} void g() { fp(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} @@ -63,15 +68,20 @@ void f() { } // namespace PR31526 struct [[nodiscard("reason")]] ReasonStruct {}; +// cxx11-17-warning@-1 {{use of the 'nodiscard' attribute is a C++20 extension}} struct LaterReason; struct [[nodiscard("later reason")]] LaterReason {}; +// cxx11-17-warning@-1 {{use of the 'nodiscard' attribute is a C++20 extension}} ReasonStruct get_reason(); LaterReason get_later_reason(); [[nodiscard("another reason")]] int another_reason(); +// cxx11-17-warning@-1 {{use of the 'nodiscard' attribute is a C++20 extension}} [[nodiscard("conflicting reason")]] int conflicting_reason(); +// cxx11-17-warning@-1 {{use of the 'nodiscard' attribute is a C++20 extension}} [[nodiscard("special reason")]] int conflicting_reason(); +// cxx11-17-warning@-1 {{use of the 'nodiscard' attribute is a C++20 extension}} void cxx20_use() { get_reason(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute: reason}} @@ -82,17 +92,23 @@ void cxx20_use() { namespace p1771 { struct[[nodiscard("Don't throw me away!")]] ConvertTo{}; +// cxx11-17-warning@-1 {{use of the 'nodiscard' attribute is a C++20 extension}} struct S { [[nodiscard]] S(); + // cxx11-warning@-1 {{use of the 'nodiscard' attribute is a C++17 extension}} [[nodiscard("Don't let that S-Char go!")]] S(char); + // 
cxx11-17-warning@-1 {{use of the 'nodiscard' attribute is a C++20 extension}} S(int); [[gnu::warn_unused_result]] S(double); operator ConvertTo(); [[nodiscard]] operator int(); + // cxx11-warning@-1 {{use of the 'nodiscard' attribute is a C++17 extension}} [[nodiscard("Don't throw away as a double")]] operator double(); + // cxx11-17-warning@-1 {{use of the 'nodiscard' attribute is a C++20 extension}} }; struct[[nodiscard("Don't throw me away either!")]] Y{}; +// cxx11-17-warning@-1 {{use of the 'nodiscard' attribute is a C++20 extension}} void usage() { S(); // expected-warning {{ignoring temporary created by a constructor declared with 'nodiscard' attribute}} @@ -103,42 +119,18 @@ void usage() { S s; ConvertTo{}; // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute: Don't throw me away!}} -// AST is different in C++20 mode, pre-2017 a move ctor for ConvertTo is there -// as well, hense the constructor warning. -#if __cplusplus >= 201703L -// expected-warning@+4 {{ignoring return value of function declared with 'nodiscard' attribute: Don't throw me away!}} -#else -// expected-warning@+2 {{ignoring temporary created by a constructor declared with 'nodiscard' attribute: Don't throw me away!}} -#endif + // AST is different in C++17 mode. Before, a move ctor for ConvertTo is there + // as well, hence the constructor warning. 
+ + // since-cxx17-warning@+2 {{ignoring return value of function declared with 'nodiscard' attribute: Don't throw me away!}} + // cxx11-warning@+1 {{ignoring temporary created by a constructor declared with 'nodiscard' attribute: Don't throw me away!}} (ConvertTo) s; (int)s; // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} (S)'c'; // expected-warning {{ignoring temporary created by a constructor declared with 'nodiscard' attribute: Don't let that S-Char go!}} -#if __cplusplus >= 201703L -// expected-warning@+4 {{ignoring return value of function declared with 'nodiscard' attribute: Don't throw me away!}} -#else -// expected-warning@+2 {{ignoring temporary created by a constructor declared with 'nodiscard' attribute: Don't throw me away!}} -#endif + // since-cxx17-warning@+2 {{ignoring return value of function declared with 'nodiscard' attribute: Don't throw me away!}} + // cxx11-warning@+1 {{ignoring temporary created by a constructor declared with 'nodiscard' attribute: Don't throw me away!}} static_cast(s); static_cast(s); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} static_cast(s); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute: Don't throw away as a double}} } -}; // namespace p1771 - -#ifdef EXT -// expected-warning@5 {{use of the 'nodiscard' attribute is a C++17 extension}} -// expected-warning@9 {{use of the 'nodiscard' attribute is a C++17 extension}} -// expected-warning@12 {{use of the 'nodiscard' attribute is a C++17 extension}} -// expected-warning@13 {{use of the 'nodiscard' attribute is a C++17 extension}} -// expected-warning@29 {{use of the 'nodiscard' attribute is a C++17 extension}} -// expected-warning@65 {{use of the 'nodiscard' attribute is a C++20 extension}} -// expected-warning@67 {{use of the 'nodiscard' attribute is a C++20 extension}} -// expected-warning@71 {{use of the 'nodiscard' attribute is a 
C++20 extension}} -// expected-warning@73 {{use of the 'nodiscard' attribute is a C++20 extension}} -// expected-warning@74 {{use of the 'nodiscard' attribute is a C++20 extension}} -// expected-warning@84 {{use of the 'nodiscard' attribute is a C++20 extension}} -// expected-warning@86 {{use of the 'nodiscard' attribute is a C++17 extension}} -// expected-warning@87 {{use of the 'nodiscard' attribute is a C++20 extension}} -// expected-warning@91 {{use of the 'nodiscard' attribute is a C++17 extension}} -// expected-warning@92 {{use of the 'nodiscard' attribute is a C++20 extension}} -// expected-warning@95 {{use of the 'nodiscard' attribute is a C++20 extension}} -#endif +} // namespace p1771 diff --git a/clang/test/CXX/drs/cwg21xx.cpp b/clang/test/CXX/drs/cwg21xx.cpp index 082deb42e4fa09..d7bc52dd9d4464 100644 --- a/clang/test/CXX/drs/cwg21xx.cpp +++ b/clang/test/CXX/drs/cwg21xx.cpp @@ -11,6 +11,10 @@ // cxx98-error@-1 {{variadic macros are a C99 feature}} #endif +namespace std { +struct type_info; +} + namespace cwg2100 { // cwg2100: 12 template struct X {}; template struct A { @@ -231,6 +235,15 @@ static_assert(!__is_trivially_assignable(NonConstCopy &&, NonConstCopy &&), ""); #endif } // namespace cwg2171 +namespace cwg2191 { // cwg2191: 19 +#if __cplusplus >= 201103L +struct B { virtual void f() { } }; +struct D : B { } d; +static_assert(noexcept(typeid(d)), ""); +static_assert(!noexcept(typeid(*static_cast(nullptr))), ""); +#endif +} // namespace cwg2191 + namespace cwg2180 { // cwg2180: yes class A { A &operator=(const A &); // #cwg2180-A-copy diff --git a/clang/test/CodeGenCXX/debug-info-structured-binding-bitfield.cpp b/clang/test/CodeGenCXX/debug-info-structured-binding-bitfield.cpp index 5d9041e0475d24..e475f032f5ce34 100644 --- a/clang/test/CodeGenCXX/debug-info-structured-binding-bitfield.cpp +++ b/clang/test/CodeGenCXX/debug-info-structured-binding-bitfield.cpp @@ -8,8 +8,8 @@ struct S0 { // CHECK-LABEL: define dso_local void @_Z3fS0v // CHECK: 
alloca %struct.S0, align 4 // CHECK-NEXT: [[TMP0:%.*]] = alloca %struct.S0, align 4 -// CHECK: #dbg_declare(ptr [[TMP0]], [[S0_A:![0-9]+]], !DIExpression(), -// CHECK-NEXT: #dbg_declare(ptr [[TMP0]], [[S0_B:![0-9]+]], !DIExpression(DW_OP_plus_uconst, 2), +// CHECK: #dbg_declare(ptr [[TMP0]], [[S0_A:![0-9]+]], !DIExpression(DW_OP_LLVM_extract_bits_zext, 0, 16), +// CHECK-NEXT: #dbg_declare(ptr [[TMP0]], [[S0_B:![0-9]+]], !DIExpression(DW_OP_LLVM_extract_bits_zext, 16, 16), // void fS0() { S0 s0; @@ -24,8 +24,8 @@ struct S1 { // CHECK-LABEL: define dso_local void @_Z3fS1v // CHECK: alloca %struct.S1, align 4 // CHECK-NEXT: [[TMP0:%.*]] = alloca %struct.S1, align 4 -// CHECK: #dbg_declare(ptr [[TMP0]], [[S1_A:![0-9]+]], !DIExpression(), -// CHECK-NEXT: #dbg_declare(ptr [[TMP0]], [[S1_B:![0-9]+]], !DIExpression(DW_OP_plus_uconst, 2), +// CHECK: #dbg_declare(ptr [[TMP0]], [[S1_A:![0-9]+]], !DIExpression(DW_OP_LLVM_extract_bits_zext, 0, 16), +// CHECK-NEXT: #dbg_declare(ptr [[TMP0]], [[S1_B:![0-9]+]], !DIExpression(DW_OP_LLVM_extract_bits_zext, 16, 16), // void fS1() { S1 s1; @@ -40,8 +40,8 @@ struct S2 { // CHECK-LABEL: define dso_local void @_Z3fS2v // CHECK: alloca %struct.S2, align 4 // CHECK-NEXT: [[TMP0:%.*]] = alloca %struct.S2, align 4 -// CHECK: #dbg_declare(ptr [[TMP0]], [[S2_A:![0-9]+]], !DIExpression(), -// CHECK-NEXT: #dbg_declare(ptr [[TMP0]], [[S2_B:![0-9]+]], !DIExpression(DW_OP_plus_uconst, 1), +// CHECK: #dbg_declare(ptr [[TMP0]], [[S2_A:![0-9]+]], !DIExpression(DW_OP_LLVM_extract_bits_zext, 0, 8), +// CHECK-NEXT: #dbg_declare(ptr [[TMP0]], [[S2_B:![0-9]+]], !DIExpression(DW_OP_LLVM_extract_bits_zext, 8, 8), // void fS2() { S2 s2; @@ -56,8 +56,8 @@ struct S3 { // CHECK-LABEL: define dso_local void @_Z3fS3v // CHECK: alloca %struct.S3, align 4 // CHECK-NEXT: [[TMP0:%.*]] = alloca %struct.S3, align 4 -// CHECK: #dbg_declare(ptr [[TMP0]], [[S3_A:![0-9]+]], !DIExpression(), -// CHECK-NEXT: #dbg_declare(ptr [[TMP0]], [[S3_B:![0-9]+]], 
!DIExpression(DW_OP_plus_uconst, 1), +// CHECK: #dbg_declare(ptr [[TMP0]], [[S3_A:![0-9]+]], !DIExpression(DW_OP_LLVM_extract_bits_zext, 0, 8), +// CHECK-NEXT: #dbg_declare(ptr [[TMP0]], [[S3_B:![0-9]+]], !DIExpression(DW_OP_LLVM_extract_bits_zext, 8, 8), // void fS3() { S3 s3; @@ -72,8 +72,8 @@ struct S4 { // CHECK-LABEL: define dso_local void @_Z3fS4v // CHECK: alloca %struct.S4, align 4 // CHECK-NEXT: [[TMP0:%.*]] = alloca %struct.S4, align 4 -// CHECK: #dbg_declare(ptr [[TMP0]], [[S4_A:![0-9]+]], !DIExpression(), -// CHECK-NEXT: #dbg_declare(ptr [[TMP0]], [[S4_B:![0-9]+]], !DIExpression(DW_OP_plus_uconst, 1), +// CHECK: #dbg_declare(ptr [[TMP0]], [[S4_A:![0-9]+]], !DIExpression(DW_OP_LLVM_extract_bits_zext, 0, 8), +// CHECK-NEXT: #dbg_declare(ptr [[TMP0]], [[S4_B:![0-9]+]], !DIExpression(DW_OP_LLVM_extract_bits_zext, 8, 16), // void fS4() { S4 s4; @@ -88,8 +88,8 @@ struct S5 { // CHECK-LABEL: define dso_local void @_Z3fS5v // CHECK: alloca %struct.S5, align 4 // CHECK-NEXT: [[TMP0:%.*]] = alloca %struct.S5, align 4 -// CHECK: #dbg_declare(ptr [[TMP0]], [[S5_A:![0-9]+]], !DIExpression(), -// CHECK-NEXT: #dbg_declare(ptr [[TMP0]], [[S5_B:![0-9]+]], !DIExpression(DW_OP_plus_uconst, 1), +// CHECK: #dbg_declare(ptr [[TMP0]], [[S5_A:![0-9]+]], !DIExpression(DW_OP_LLVM_extract_bits_zext, 0, 8), +// CHECK-NEXT: #dbg_declare(ptr [[TMP0]], [[S5_B:![0-9]+]], !DIExpression(DW_OP_LLVM_extract_bits_zext, 8, 16), // void fS5() { S5 s5; @@ -104,8 +104,8 @@ struct S6 { // CHECK-LABEL: define dso_local void @_Z3fS6v // CHECK: alloca %struct.S6, align 4 // CHECK-NEXT: [[TMP0:%.*]] = alloca %struct.S6, align 4 -// CHECK: #dbg_declare(ptr [[TMP0]], [[S6_A:![0-9]+]], !DIExpression(), -// CHECK-NEXT: #dbg_declare(ptr [[TMP0]], [[S6_B:![0-9]+]], !DIExpression(DW_OP_plus_uconst, 2), +// CHECK: #dbg_declare(ptr [[TMP0]], [[S6_A:![0-9]+]], !DIExpression(DW_OP_LLVM_extract_bits_zext, 0, 16), +// CHECK-NEXT: #dbg_declare(ptr [[TMP0]], [[S6_B:![0-9]+]], 
!DIExpression(DW_OP_LLVM_extract_bits_zext, 16, 8), // void fS6() { S6 s6; @@ -120,8 +120,8 @@ struct S7 { // CHECK-LABEL: define dso_local void @_Z3fS7v // CHECK: alloca %struct.S7, align 4 // CHECK-NEXT: [[TMP0:%.*]] = alloca %struct.S7, align 4 -// CHECK: #dbg_declare(ptr [[TMP0]], [[S7_A:![0-9]+]], !DIExpression(), -// CHECK-NEXT: #dbg_declare(ptr [[TMP0]], [[S7_B:![0-9]+]], !DIExpression(DW_OP_plus_uconst, 2), +// CHECK: #dbg_declare(ptr [[TMP0]], [[S7_A:![0-9]+]], !DIExpression(DW_OP_LLVM_extract_bits_zext, 0, 16), +// CHECK-NEXT: #dbg_declare(ptr [[TMP0]], [[S7_B:![0-9]+]], !DIExpression(DW_OP_LLVM_extract_bits_zext, 16, 8), // void fS7() { S7 s7; @@ -136,8 +136,8 @@ struct S8 { // CHECK-LABEL: define dso_local void @_Z3fS8v // CHECK: alloca %struct.S8, align 4 // CHECK-NEXT: [[TMP0:%.*]] = alloca %struct.S8, align 4 -// CHECK: #dbg_declare(ptr [[TMP0]], [[S8_A:![0-9]+]], !DIExpression(), -// CHECK-NEXT: #dbg_declare(ptr [[TMP0]], [[S8_B:![0-9]+]], !DIExpression(DW_OP_plus_uconst, 2), +// CHECK: #dbg_declare(ptr [[TMP0]], [[S8_A:![0-9]+]], !DIExpression(DW_OP_LLVM_extract_bits_zext, 0, 16), +// CHECK-NEXT: #dbg_declare(ptr [[TMP0]], [[S8_B:![0-9]+]], !DIExpression(DW_OP_LLVM_extract_bits_zext, 16, 16), // void fS8() { S8 s8; @@ -152,8 +152,8 @@ struct S9 { // CHECK-LABEL: define dso_local void @_Z3fS9v // CHECK: alloca %struct.S9, align 4 // CHECK-NEXT: [[TMP0:%.*]] = alloca %struct.S9, align 4 -// CHECK: #dbg_declare(ptr [[TMP0]], [[S9_A:![0-9]+]], !DIExpression(), -// CHECK-NEXT: #dbg_declare(ptr [[TMP0]], [[S9_B:![0-9]+]], !DIExpression(DW_OP_plus_uconst, 4), +// CHECK: #dbg_declare(ptr [[TMP0]], [[S9_A:![0-9]+]], !DIExpression(DW_OP_LLVM_extract_bits_zext, 0, 16), +// CHECK-NEXT: #dbg_declare(ptr [[TMP0]], [[S9_B:![0-9]+]], !DIExpression(DW_OP_LLVM_extract_bits_zext, 32, 32), // void fS9() { S9 s9; @@ -167,8 +167,8 @@ struct S10 { // CHECK-LABEL: define dso_local void @_Z4fS10v // CHECK: alloca %struct.S10, align 4 // CHECK-NEXT: [[TMP0:%.*]] = alloca 
%struct.S10, align 4 -// CHECK: #dbg_declare(ptr [[TMP0]], [[S10_A:![0-9]+]], !DIExpression(), -// CHECK-NEXT: #dbg_declare(ptr [[TMP0]], [[S10_B:![0-9]+]], !DIExpression(DW_OP_plus_uconst, 1), +// CHECK: #dbg_declare(ptr [[TMP0]], [[S10_A:![0-9]+]], !DIExpression(DW_OP_LLVM_extract_bits_zext, 0, 8), +// CHECK-NEXT: #dbg_declare(ptr [[TMP0]], [[S10_B:![0-9]+]], !DIExpression(DW_OP_LLVM_extract_bits_zext, 8, 8), // S10() : x(0), y(0) {} }; @@ -178,9 +178,6 @@ void fS10() { auto [a, b] = s10; } -// It's currently not possible to produce complete debug information for the following cases. -// Confirm that no wrong debug info is output. -// Once this is implemented, these tests should be amended. struct S11 { unsigned int x : 15; unsigned int y : 16; @@ -189,7 +186,8 @@ struct S11 { // CHECK-LABEL: define dso_local void @_Z4fS11v // CHECK: alloca %struct.S11, align 4 // CHECK-NEXT: [[TMP0:%.*]] = alloca %struct.S11, align 4 -// CHECK-NOT: #dbg_declare(ptr [[TMP0]] +// CHECK: #dbg_declare(ptr [[TMP0]], [[S11_A:![0-9]+]], !DIExpression(DW_OP_LLVM_extract_bits_zext, 0, 15), +// CHECK-NEXT: #dbg_declare(ptr [[TMP0]], [[S11_B:![0-9]+]], !DIExpression(DW_OP_LLVM_extract_bits_zext, 15, 16), // void fS11() { S11 s11; @@ -204,8 +202,8 @@ struct S12 { // CHECK-LABEL: define dso_local void @_Z4fS12v // CHECK: alloca %struct.S12, align 4 // CHECK-NEXT: [[TMP0:%.*]] = alloca %struct.S12, align 4 -// CHECK: #dbg_declare(ptr [[TMP0]], [[S12_A:![0-9]+]], !DIExpression(), -// CHECK-NOT: #dbg_declare(ptr [[TMP0]] +// CHECK: #dbg_declare(ptr [[TMP0]], [[S12_A:![0-9]+]], !DIExpression(DW_OP_LLVM_extract_bits_zext, 0, 16), +// CHECK-NEXT: #dbg_declare(ptr [[TMP0]], [[S12_B:![0-9]+]], !DIExpression(DW_OP_LLVM_extract_bits_zext, 32, 17), // void fS12() { S12 s12; @@ -220,63 +218,100 @@ struct __attribute__((packed)) S13 { // CHECK-LABEL: define dso_local void @_Z4fS13v // CHECK: alloca %struct.S13, align 1 // CHECK-NEXT: [[TMP0:%.*]] = alloca %struct.S13, align 1 -// CHECK-NOT: 
#dbg_declare(ptr [[TMP0]] +// CHECK: #dbg_declare(ptr [[TMP0]], [[S13_A:![0-9]+]], !DIExpression(DW_OP_LLVM_extract_bits_zext, 0, 15), +// CHECK-NEXT: #dbg_declare(ptr [[TMP0]], [[S13_B:![0-9]+]], !DIExpression(DW_OP_LLVM_extract_bits_zext, 15, 16), // void fS13() { S13 s13; auto [a, b] = s13; } +struct S14 { + signed int x; + signed int y : 7; +}; + +// CHECK-LABEL: define dso_local void @_Z4fS14v +// CHECK: alloca %struct.S14, align 4 +// CHECK-NEXT: [[TMP0:%.*]] = alloca %struct.S14, align 4 +// CHECK: #dbg_declare(ptr [[TMP0]], [[S14_A:![0-9]+]], !DIExpression(), +// CHECK-NEXT: #dbg_declare(ptr [[TMP0]], [[S14_B:![0-9]+]], !DIExpression(DW_OP_plus_uconst, 4, DW_OP_LLVM_extract_bits_sext, 0, 7), +// +void fS14() { + S14 s14; + auto [a, b] = s14; +} + +struct S15 { + signed int x : 123; + unsigned int y : 987; +}; + +// CHECK-LABEL: define dso_local void @_Z4fS15v +// CHECK: alloca %struct.S15, align 8 +// CHECK-NEXT: [[TMP0:%.*]] = alloca %struct.S15, align 8 +// CHECK: #dbg_declare(ptr [[TMP0]], [[S15_A:![0-9]+]], !DIExpression(DW_OP_LLVM_extract_bits_sext, 0, 32), +// CHECK-NEXT: #dbg_declare(ptr [[TMP0]], [[S15_B:![0-9]+]], !DIExpression(DW_OP_plus_uconst, 16, DW_OP_LLVM_extract_bits_zext, 0, 32), +// +void fS15() { + S15 s15; + auto [a, b] = s15; +} + // CHECK: [[UINT_TY:![0-9]+]] = !DIBasicType(name: "unsigned int", size: 32, encoding: DW_ATE_unsigned) -// CHECK: [[S0_A]] = !DILocalVariable(name: "a", scope: {{.*}}, file: {{.*}}, line: {{.*}}, type: [[USHORT_TY:![0-9]+]]) -// CHECK: [[USHORT_TY]] = !DIBasicType(name: "unsigned short", size: 16, encoding: DW_ATE_unsigned) -// CHECK: [[S0_B]] = !DILocalVariable(name: "b", scope: {{.*}}, file: {{.*}}, line: {{.*}}, type: [[USHORT_TY]]) +// CHECK: [[S0_A]] = !DILocalVariable(name: "a", scope: {{.*}}, file: {{.*}}, line: {{.*}}, type: [[UINT_TY]]) +// CHECK: [[S0_B]] = !DILocalVariable(name: "b", scope: {{.*}}, file: {{.*}}, line: {{.*}}, type: [[UINT_TY]]) // CHECK: [[VOLATILE_UINT_TY:![0-9]+]] = 
!DIDerivedType(tag: DW_TAG_volatile_type, baseType: [[UINT_TY]]) -// CHECK: [[S1_A]] = !DILocalVariable(name: "a", scope: {{.*}}, file: {{.*}}, line: {{.*}}, type: [[VOLATILE_USHORT_TY:![0-9]+]]) -// CHECK: [[VOLATILE_USHORT_TY]] = !DIDerivedType(tag: DW_TAG_volatile_type, baseType: [[USHORT_TY]]) -// CHECK: [[S1_B]] = !DILocalVariable(name: "b", scope: {{.*}}, file: {{.*}}, line: {{.*}}, type: [[VOLATILE_USHORT_TY]]) +// CHECK: [[S1_A]] = !DILocalVariable(name: "a", scope: {{.*}}, file: {{.*}}, line: {{.*}}, type: [[VOLATILE_UINT_TY:![0-9]+]]) +// CHECK: [[S1_B]] = !DILocalVariable(name: "b", scope: {{.*}}, file: {{.*}}, line: {{.*}}, type: [[VOLATILE_UINT_TY]]) -// CHECK: [[S2_A]] = !DILocalVariable(name: "a", scope: {{.*}}, file: {{.*}}, line: {{.*}}, type: [[UCHAR_TY:![0-9]+]]) -// CHECK: [[UCHAR_TY]] = !DIBasicType(name: "unsigned char", size: 8, encoding: DW_ATE_unsigned_char) -// CHECK: [[S2_B]] = !DILocalVariable(name: "b", scope: {{.*}}, file: {{.*}}, line: {{.*}}, type: [[UCHAR_TY]]) +// CHECK: [[S2_A]] = !DILocalVariable(name: "a", scope: {{.*}}, file: {{.*}}, line: {{.*}}, type: [[UINT_TY]]) +// CHECK: [[S2_B]] = !DILocalVariable(name: "b", scope: {{.*}}, file: {{.*}}, line: {{.*}}, type: [[UINT_TY]]) -// CHECK: [[S3_A]] = !DILocalVariable(name: "a", scope: {{.*}}, file: {{.*}}, line: {{.*}}, type: [[VOLATILE_UCHAR_TY:![0-9]+]]) -// CHECK: [[VOLATILE_UCHAR_TY]] = !DIDerivedType(tag: DW_TAG_volatile_type, baseType: [[UCHAR_TY]]) -// CHECK: [[S3_B]] = !DILocalVariable(name: "b", scope: {{.*}}, file: {{.*}}, line: {{.*}}, type: [[VOLATILE_UCHAR_TY]]) +// CHECK: [[S3_A]] = !DILocalVariable(name: "a", scope: {{.*}}, file: {{.*}}, line: {{.*}}, type: [[VOLATILE_UINT_TY]]) +// CHECK: [[S3_B]] = !DILocalVariable(name: "b", scope: {{.*}}, file: {{.*}}, line: {{.*}}, type: [[VOLATILE_UINT_TY]]) -// CHECK: [[S4_A]] = !DILocalVariable(name: "a", scope: {{.*}}, file: {{.*}}, line: {{.*}}, type: [[UCHAR_TY]]) -// CHECK: [[S4_B]] = !DILocalVariable(name: "b", scope: 
{{.*}}, file: {{.*}}, line: {{.*}}, type: [[USHORT_TY]]) +// CHECK: [[S4_A]] = !DILocalVariable(name: "a", scope: {{.*}}, file: {{.*}}, line: {{.*}}, type: [[UINT_TY]]) +// CHECK: [[S4_B]] = !DILocalVariable(name: "b", scope: {{.*}}, file: {{.*}}, line: {{.*}}, type: [[UINT_TY]]) -// CHECK: [[S5_A]] = !DILocalVariable(name: "a", scope: {{.*}}, file: {{.*}}, line: {{.*}}, type: [[VOLATILE_UCHAR_TY]]) -// CHECK: [[S5_B]] = !DILocalVariable(name: "b", scope: {{.*}}, file: {{.*}}, line: {{.*}}, type: [[VOLATILE_USHORT_TY]]) +// CHECK: [[S5_A]] = !DILocalVariable(name: "a", scope: {{.*}}, file: {{.*}}, line: {{.*}}, type: [[VOLATILE_UINT_TY]]) +// CHECK: [[S5_B]] = !DILocalVariable(name: "b", scope: {{.*}}, file: {{.*}}, line: {{.*}}, type: [[VOLATILE_UINT_TY]]) -// CHECK: [[S6_A]] = !DILocalVariable(name: "a", scope: {{.*}}, file: {{.*}}, line: {{.*}}, type: [[USHORT_TY]]) -// CHECK: [[S6_B]] = !DILocalVariable(name: "b", scope: {{.*}}, file: {{.*}}, line: {{.*}}, type: [[UCHAR_TY]]) +// CHECK: [[S6_A]] = !DILocalVariable(name: "a", scope: {{.*}}, file: {{.*}}, line: {{.*}}, type: [[UINT_TY]]) +// CHECK: [[S6_B]] = !DILocalVariable(name: "b", scope: {{.*}}, file: {{.*}}, line: {{.*}}, type: [[UINT_TY]]) -// CHECK: [[S7_A]] = !DILocalVariable(name: "a", scope: {{.*}}, file: {{.*}}, line: {{.*}}, type: [[VOLATILE_USHORT_TY]]) -// CHECK: [[S7_B]] = !DILocalVariable(name: "b", scope: {{.*}}, file: {{.*}}, line: {{.*}}, type: [[VOLATILE_UCHAR_TY]]) +// CHECK: [[S7_A]] = !DILocalVariable(name: "a", scope: {{.*}}, file: {{.*}}, line: {{.*}}, type: [[VOLATILE_UINT_TY]]) +// CHECK: [[S7_B]] = !DILocalVariable(name: "b", scope: {{.*}}, file: {{.*}}, line: {{.*}}, type: [[VOLATILE_UINT_TY]]) -// CHECK: [[S8_A]] = !DILocalVariable(name: "a", scope: {{.*}}, file: {{.*}}, line: {{.*}}, type: [[USHORT_TY]]) -// CHECK: [[S8_B]] = !DILocalVariable(name: "b", scope: {{.*}}, file: {{.*}}, line: {{.*}}, type: [[VOLATILE_USHORT_TY]]) +// CHECK: [[S8_A]] = !DILocalVariable(name: "a", scope: 
{{.*}}, file: {{.*}}, line: {{.*}}, type: [[UINT_TY]]) +// CHECK: [[S8_B]] = !DILocalVariable(name: "b", scope: {{.*}}, file: {{.*}}, line: {{.*}}, type: [[VOLATILE_UINT_TY]]) -// CHECK: [[S9_A]] = !DILocalVariable(name: "a", scope: {{.*}}, file: {{.*}}, line: {{.*}}, type: [[USHORT_TY]]) +// CHECK: [[S9_A]] = !DILocalVariable(name: "a", scope: {{.*}}, file: {{.*}}, line: {{.*}}, type: [[UINT_TY]]) // CHECK: [[S9_B]] = !DILocalVariable(name: "b", scope: {{.*}}, file: {{.*}}, line: {{.*}}, type: [[UINT_TY]]) // CHECK: [[CONST_UINT_TY:![0-9]+]] = !DIDerivedType(tag: DW_TAG_const_type, baseType: [[UINT_TY]]) -// CHECK: [[S10_A]] = !DILocalVariable(name: "a", scope: {{.*}}, file: {{.*}}, line: {{.*}}, type: [[CONST_UCHAR_TY:![0-9]+]]) -// CHECK: [[CONST_UCHAR_TY]] = !DIDerivedType(tag: DW_TAG_const_type, baseType: [[UCHAR_TY]]) -// CHECK: [[S10_B]] = !DILocalVariable(name: "b", scope: {{.*}}, file: {{.*}}, line: {{.*}}, type: [[CONST_VOLATILE_UCHAR_TY:![0-9]+]]) -// CHECK: [[CONST_VOLATILE_UCHAR_TY]] = !DIDerivedType(tag: DW_TAG_const_type, baseType: [[VOLATILE_UCHAR_TY]]) +// CHECK: [[CONST_VOLATILE_UINT_TY:![0-9]+]] = !DIDerivedType(tag: DW_TAG_const_type, baseType: [[VOLATILE_UINT_TY]]) +// CHECK: [[S10_A]] = !DILocalVariable(name: "a", scope: {{.*}}, file: {{.*}}, line: {{.*}}, type: [[CONST_UINT_TY]]) +// CHECK: [[S10_B]] = !DILocalVariable(name: "b", scope: {{.*}}, file: {{.*}}, line: {{.*}}, type: [[CONST_VOLATILE_UINT_TY]]) // S11 -// CHECK-NOT: !DILocalVariable(name: "a" -// CHECK-NOT: !DILocalVariable(name: "b" +// CHECK: [[S11_A]] = !DILocalVariable(name: "a", scope: {{.*}}, file: {{.*}}, line: {{.*}}, type: [[UINT_TY]]) +// CHECK: [[S11_B]] = !DILocalVariable(name: "b", scope: {{.*}}, file: {{.*}}, line: {{.*}}, type: [[UINT_TY]]) // S12 -// CHECK: [[S12_A]] = !DILocalVariable(name: "a", scope: {{.*}}, file: {{.*}}, line: {{.*}}, type: [[USHORT_TY]]) -// CHECK-NOT: !DILocalVariable(name: "b" +// CHECK: [[S12_A]] = !DILocalVariable(name: "a", scope: {{.*}}, 
file: {{.*}}, line: {{.*}}, type: [[UINT_TY]]) +// CHECK: [[S12_B]] = !DILocalVariable(name: "b", scope: {{.*}}, file: {{.*}}, line: {{.*}}, type: [[UINT_TY]]) // S13 -// CHECK-NOT: !DILocalVariable(name: "a" -// CHECK-NOT: !DILocalVariable(name: "b" \ No newline at end of file +// CHECK: [[S13_A]] = !DILocalVariable(name: "a", scope: {{.*}}, file: {{.*}}, line: {{.*}}, type: [[UINT_TY]]) +// CHECK: [[S13_B]] = !DILocalVariable(name: "b", scope: {{.*}}, file: {{.*}}, line: {{.*}}, type: [[UINT_TY]]) + +// S14 +// CHECK: [[SINT_TY:![0-9]+]] = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +// CHECK: [[S14_A]] = !DILocalVariable(name: "a", scope: {{.*}}, file: {{.*}}, line: {{.*}}, type: [[SINT_TY]]) +// CHECK: [[S14_B]] = !DILocalVariable(name: "b", scope: {{.*}}, file: {{.*}}, line: {{.*}}, type: [[SINT_TY]]) + +// S15 +// CHECK: [[S15_A]] = !DILocalVariable(name: "a", scope: {{.*}}, file: {{.*}}, line: {{.*}}, type: [[SINT_TY]]) +// CHECK: [[S15_B]] = !DILocalVariable(name: "b", scope: {{.*}}, file: {{.*}}, line: {{.*}}, type: [[UINT_TY]]) diff --git a/clang/test/Sema/integral-to-ptr.c b/clang/test/Sema/integral-to-ptr.c new file mode 100644 index 00000000000000..b8ab4cb79820d0 --- /dev/null +++ b/clang/test/Sema/integral-to-ptr.c @@ -0,0 +1,3 @@ +// RUN: %clang_cc1 %s -verify -fsyntax-only -std=c11 + +int x(void) { e: b: ; return &&e - &&b < x; } // expected-warning {{ordered comparison between pointer and integer}} diff --git a/clang/test/SemaCXX/warn-unused-value.cpp b/clang/test/SemaCXX/warn-unused-value.cpp index d964684069155e..2a07a0324f3f06 100644 --- a/clang/test/SemaCXX/warn-unused-value.cpp +++ b/clang/test/SemaCXX/warn-unused-value.cpp @@ -102,6 +102,16 @@ void f() { Bad b; (void)typeid(b.f()); // expected-warning {{expression with side effects will be evaluated despite being used as an operand to 'typeid'}} + extern Bad * pb; + // This typeid can throw but that is not a side-effect that we care about + // warning for since this is 
idiomatic code + (void)typeid(*pb); + (void)sizeof(typeid(*pb)); + (void)typeid(*++pb); // expected-warning {{expression with side effects will be evaluated despite being used as an operand to 'typeid'}} + (void)sizeof(typeid(*++pb)); // expected-warning {{expression with side effects has no effect in an unevaluated context}} + // FIXME: we should not warn about this in an unevaluated context + // expected-warning@-2 {{expression with side effects will be evaluated despite being used as an operand to 'typeid'}} + // A dereference of a volatile pointer is a side effecting operation, however // since it is idiomatic code, and the alternatives induce higher maintenance // costs, it is allowed. diff --git a/clang/www/cxx_dr_status.html b/clang/www/cxx_dr_status.html index c00d022b864464..dac38cedfcb751 100755 --- a/clang/www/cxx_dr_status.html +++ b/clang/www/cxx_dr_status.html @@ -12954,7 +12954,7 @@

C++ defect report implementation status

2191 C++17 Incorrect result for noexcept(typeid(v)) - Unknown + Clang 19 2192 diff --git a/flang/include/flang/Frontend/CodeGenOptions.h b/flang/include/flang/Frontend/CodeGenOptions.h index 918192abae7243..3bc5d93c4c43e7 100644 --- a/flang/include/flang/Frontend/CodeGenOptions.h +++ b/flang/include/flang/Frontend/CodeGenOptions.h @@ -56,6 +56,10 @@ class CodeGenOptions : public CodeGenOptionsBase { /// are offloading binaries containing device images and metadata. std::vector OffloadObjects; + /// List of filenames passed in using -mlink-builtin-bitcode. These + /// are bc libraries that should be linked in and internalized. + std::vector BuiltinBCLibs; + /// The directory where temp files are stored if specified by -save-temps std::optional SaveTempsDir; diff --git a/flang/include/flang/Frontend/FrontendActions.h b/flang/include/flang/Frontend/FrontendActions.h index 7823565eb815f8..374fd76c8ae17d 100644 --- a/flang/include/flang/Frontend/FrontendActions.h +++ b/flang/include/flang/Frontend/FrontendActions.h @@ -223,9 +223,12 @@ class CodeGenAction : public FrontendAction { std::unique_ptr llvmCtx; std::unique_ptr llvmModule; - /// Embeds offload objects given with specified with -fembed-offload-object + /// Embeds offload objects specified with -fembed-offload-object void embedOffloadObjects(); + /// Links in BC libraries specified with -mlink-builtin-bitcode + void linkBuiltinBCLibs(); + /// Runs pass pipeline to lower HLFIR into FIR void lowerHLFIRToFIR(); diff --git a/flang/lib/Frontend/CompilerInvocation.cpp b/flang/lib/Frontend/CompilerInvocation.cpp index f64a939b785ef0..f96d72f1ad691b 100644 --- a/flang/lib/Frontend/CompilerInvocation.cpp +++ b/flang/lib/Frontend/CompilerInvocation.cpp @@ -347,6 +347,11 @@ static void parseCodeGenArgs(Fortran::frontend::CodeGenOptions &opts, if (auto *a = args.getLastArg(clang::driver::options::OPT_save_temps_EQ)) opts.SaveTempsDir = a->getValue(); + // -mlink-builtin-bitcode + for (auto *a : +
args.filtered(clang::driver::options::OPT_mlink_builtin_bitcode)) + opts.BuiltinBCLibs.push_back(a->getValue()); + // -mrelocation-model option. if (const llvm::opt::Arg *a = args.getLastArg(clang::driver::options::OPT_mrelocation_model)) { diff --git a/flang/lib/Frontend/FrontendActions.cpp b/flang/lib/Frontend/FrontendActions.cpp index f8ca0cfd82e767..24db4df31279a3 100644 --- a/flang/lib/Frontend/FrontendActions.cpp +++ b/flang/lib/Frontend/FrontendActions.cpp @@ -43,6 +43,8 @@ #include "mlir/Target/LLVMIR/ModuleTranslation.h" #include "clang/Basic/Diagnostic.h" #include "clang/Basic/DiagnosticFrontend.h" +#include "clang/Basic/FileManager.h" +#include "clang/Basic/FileSystemOptions.h" #include "clang/Driver/DriverDiagnostic.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/StringRef.h" @@ -55,6 +57,7 @@ #include "llvm/IR/Verifier.h" #include "llvm/IRPrinter/IRPrintingPasses.h" #include "llvm/IRReader/IRReader.h" +#include "llvm/Linker/Linker.h" #include "llvm/Object/OffloadBinary.h" #include "llvm/Passes/PassBuilder.h" #include "llvm/Passes/PassPlugin.h" @@ -69,6 +72,7 @@ #include "llvm/Target/TargetMachine.h" #include "llvm/TargetParser/RISCVISAInfo.h" #include "llvm/TargetParser/RISCVTargetParser.h" +#include "llvm/Transforms/IPO/Internalize.h" #include "llvm/Transforms/Utils/ModuleUtils.h" #include #include @@ -1149,6 +1153,59 @@ void CodeGenAction::embedOffloadObjects() { } } +void CodeGenAction::linkBuiltinBCLibs() { + auto options = clang::FileSystemOptions(); + clang::FileManager fileManager(options); + CompilerInstance &ci = this->getInstance(); + const auto &cgOpts = ci.getInvocation().getCodeGenOpts(); + + std::vector> modules; + + // Load LLVM modules + for (llvm::StringRef bcLib : cgOpts.BuiltinBCLibs) { + auto BCBuf = fileManager.getBufferForFile(bcLib); + if (!BCBuf) { + auto diagID = ci.getDiagnostics().getCustomDiagID( + clang::DiagnosticsEngine::Error, "could not open '%0' for linking"); + ci.getDiagnostics().Report(diagID) << bcLib; + 
return; + } + + llvm::Expected> ModuleOrErr = + getOwningLazyBitcodeModule(std::move(*BCBuf), *llvmCtx); + if (!ModuleOrErr) { + // An llvm::Expected holding an error must have that error taken before + // it is destroyed; otherwise builds with ABI-breaking checks abort. + llvm::consumeError(ModuleOrErr.takeError()); + auto diagID = ci.getDiagnostics().getCustomDiagID( + clang::DiagnosticsEngine::Error, "error loading '%0' for linking"); + ci.getDiagnostics().Report(diagID) << bcLib; + return; + } + modules.push_back(std::move(ModuleOrErr.get())); + } + + // Link modules and internalize functions + for (auto &module : modules) { + // linkModules() takes ownership of the module, so save its name for the + // diagnostic before the unique_ptr is moved from. + const std::string moduleName = module->getSourceFileName(); + const bool Err = llvm::Linker::linkModules( + *llvmModule, std::move(module), llvm::Linker::Flags::LinkOnlyNeeded, + [](llvm::Module &M, const llvm::StringSet<> &GVS) { + llvm::internalizeModule(M, [&GVS](const llvm::GlobalValue &GV) { + return !GV.hasName() || (GVS.count(GV.getName()) == 0); + }); + }); + if (Err) { + auto diagID = ci.getDiagnostics().getCustomDiagID( + clang::DiagnosticsEngine::Error, "link error when linking '%0'"); + ci.getDiagnostics().Report(diagID) << moduleName; + return; + } + } +} + static void reportOptRecordError(llvm::Error e, clang::DiagnosticsEngine &diags, const CodeGenOptions &codeGenOpts) { handleAllErrors( @@ -1240,6 +1297,10 @@ void CodeGenAction::executeAction() { llvmModule->setTargetTriple(theTriple); llvmModule->setDataLayout(targetMachine.createDataLayout()); + // Link in builtin bitcode libraries + if (!codeGenOpts.BuiltinBCLibs.empty()) + linkBuiltinBCLibs(); + // Embed offload objects specified with -fembed-offload-object if (!codeGenOpts.OffloadObjects.empty()) embedOffloadObjects(); diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp index c73d43210a260a..24a57812ba104e 100644 --- 
a/flang/lib/Lower/Bridge.cpp +++ b/flang/lib/Lower/Bridge.cpp @@ -1801,14 +1801,16 @@ class FirConverter : public Fortran::lower::AbstractConverter { std::get_if(&x.u)) for (const Fortran::parser::Name &x : localInitList->v) info.localInitSymList.push_back(x.symbol); - if (const auto *reduceList = - std::get_if(&x.u)) { - fir::ReduceOperationEnum reduce_operation = 
getReduceOperationEnum( - std::get(reduceList->t)); - for (const Fortran::parser::Name &x : - std::get>(reduceList->t)) { - info.reduceSymList.push_back( - std::make_pair(reduce_operation, x.symbol)); + for (IncrementLoopInfo &info : incrementLoopNestInfo) { + if (const auto *reduceList = + std::get_if(&x.u)) { + fir::ReduceOperationEnum reduce_operation = getReduceOperationEnum( + std::get(reduceList->t)); + for (const Fortran::parser::Name &x : + std::get>(reduceList->t)) { + info.reduceSymList.push_back( + std::make_pair(reduce_operation, x.symbol)); + } } } if (const auto *sharedList = diff --git a/flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp b/flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp index 80efa527382abc..70ff8b386ac3db 100644 --- a/flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp +++ b/flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp @@ -60,8 +60,10 @@ DebugTypeGenerator::DebugTypeGenerator(mlir::ModuleOp m) // The debug information requires the offset of certain fields in the // descriptors like lower_bound and extent for each dimension. mlir::Type llvmDimsType = getDescFieldTypeModel()(context); + mlir::Type llvmPtrType = getDescFieldTypeModel()(context); dimsOffset = getComponentOffset(*dl, context, llvmDimsType); dimsSize = dl->getTypeSize(llvmDimsType); + ptrSize = dl->getTypeSize(llvmPtrType); } static mlir::LLVM::DITypeAttr genBasicType(mlir::MLIRContext *context, @@ -104,8 +106,8 @@ mlir::LLVM::DITypeAttr DebugTypeGenerator::convertBoxedSequenceType( // allocated = associated = (*base_addr != 0) mlir::LLVM::DIExpressionAttr valid = mlir::LLVM::DIExpressionAttr::get(context, ops); - mlir::LLVM::DIExpressionAttr associated = genAllocated ? valid : nullptr; - mlir::LLVM::DIExpressionAttr allocated = genAssociated ? valid : nullptr; + mlir::LLVM::DIExpressionAttr allocated = genAllocated ? valid : nullptr; + mlir::LLVM::DIExpressionAttr associated = genAssociated ? 
valid : nullptr; ops.clear(); llvm::SmallVector elements; @@ -217,6 +219,28 @@ mlir::LLVM::DITypeAttr DebugTypeGenerator::convertCharacterType( /*stringLengthExp=*/nullptr, /*stringLocationExp=*/nullptr, encoding); } +mlir::LLVM::DITypeAttr DebugTypeGenerator::convertPointerLikeType( + mlir::Type elTy, mlir::LLVM::DIFileAttr fileAttr, + mlir::LLVM::DIScopeAttr scope, mlir::Location loc, bool genAllocated, + bool genAssociated) { + mlir::MLIRContext *context = module.getContext(); + + // Arrays and characters need different treatment because DWARF has special + // constructs for them to get the location from the descriptor. The rest of + // the types are handled like a pointer to the underlying type. + if (auto seqTy = mlir::dyn_cast_or_null(elTy)) + return convertBoxedSequenceType(seqTy, fileAttr, scope, loc, genAllocated, + genAssociated); + + mlir::LLVM::DITypeAttr elTyAttr = convertType(elTy, fileAttr, scope, loc); + + return mlir::LLVM::DIDerivedTypeAttr::get( + context, llvm::dwarf::DW_TAG_pointer_type, + mlir::StringAttr::get(context, ""), elTyAttr, ptrSize, + /*alignInBits=*/0, /*offset=*/0, + /*optional
=*/std::nullopt, /*extra data=*/nullptr); +} + mlir::LLVM::DITypeAttr DebugTypeGenerator::convertType(mlir::Type Ty, mlir::LLVM::DIFileAttr fileAttr, mlir::LLVM::DIScopeAttr scope, @@ -258,6 +282,10 @@ DebugTypeGenerator::convertType(mlir::Type Ty, mlir::LLVM::DIFileAttr fileAttr, if (auto seqTy = mlir::dyn_cast_or_null(elTy)) return convertBoxedSequenceType(seqTy, fileAttr, scope, loc, false, false); + if (auto heapTy = mlir::dyn_cast_or_null(elTy)) + return convertPointerLikeType(heapTy.getElementType(), fileAttr, scope, + loc, /*genAllocated=*/true, + /*genAssociated=*/false); return genPlaceholderType(context); } else { // FIXME: These types are currently unhandled. We are generating a diff --git a/flang/lib/Optimizer/Transforms/DebugTypeGenerator.h b/flang/lib/Optimizer/Transforms/DebugTypeGenerator.h index aa26694ab5114e..7816363e988214 100644 --- a/flang/lib/Optimizer/Transforms/DebugTypeGenerator.h +++ b/flang/lib/Optimizer/Transforms/DebugTypeGenerator.h @@ -49,10 +49,17 @@ class DebugTypeGenerator { mlir::LLVM::DIFileAttr fileAttr, mlir::LLVM::DIScopeAttr scope, mlir::Location loc); + + mlir::LLVM::DITypeAttr + convertPointerLikeType(mlir::Type elTy, mlir::LLVM::DIFileAttr fileAttr, + mlir::LLVM::DIScopeAttr scope, mlir::Location loc, + bool genAllocated, bool genAssociated); + mlir::ModuleOp module; KindMapping kindMapping; std::uint64_t dimsSize; std::uint64_t dimsOffset; + std::uint64_t ptrSize; }; } // namespace fir diff --git a/flang/test/Driver/Inputs/libfun.f90 b/flang/test/Driver/Inputs/libfun.f90 new file mode 100644 index 00000000000000..1df7b61e8bc85b --- /dev/null +++ b/flang/test/Driver/Inputs/libfun.f90 @@ -0,0 +1,4 @@ +subroutine libfun(x) + integer :: x +end subroutine + diff --git a/flang/test/Driver/mlink-builtin-bc.f90 b/flang/test/Driver/mlink-builtin-bc.f90 new file mode 100644 index 00000000000000..e245c1493bbcc4 --- /dev/null +++ b/flang/test/Driver/mlink-builtin-bc.f90 @@ -0,0 +1,15 @@ +! Test -mlink-builtin-bitcode flag +! 
RUN: %flang -emit-llvm -c -o %t.bc %S/Inputs/libfun.f90 +! RUN: %flang_fc1 -emit-llvm -o - -mlink-builtin-bitcode %t.bc %s 2>&1 | FileCheck %s + +! CHECK: define internal void @libfun_ + +! RUN: not %flang_fc1 -emit-llvm -o - -mlink-builtin-bitcode %no-%t.bc %s 2>&1 | FileCheck %s --check-prefix=ERROR + +! ERROR: error: could not open {{.*}}.bc + +external libfun +parameter(i=1) +integer :: j +call libfun(j) +end program diff --git a/flang/test/Integration/debug-allocatable-1.f90 b/flang/test/Integration/debug-allocatable-1.f90 new file mode 100644 index 00000000000000..471c8cdb7d54eb --- /dev/null +++ b/flang/test/Integration/debug-allocatable-1.f90 @@ -0,0 +1,24 @@ +! RUN: %flang_fc1 -emit-llvm -debug-info-kind=standalone %s -o - | FileCheck %s + +subroutine ff(n, m) + integer n, m, i, j + integer, allocatable :: ar1(:, :) + real, allocatable :: sc + + allocate(ar1(n, m)) + allocate(sc) + sc = 3.14 + + print *, sc + print *, ar1 +end subroutine ff + + +! CHECK-DAG: !DILocalVariable(name: "ar1"{{.*}}type: ![[TY1:[0-9]+]]) +! CHECK-DAG: ![[TY1]] = !DICompositeType(tag: DW_TAG_array_type{{.*}}elements: ![[ELEMS2:[0-9]+]]{{.*}}dataLocation{{.*}}allocated: !DIExpression(DW_OP_push_object_address, DW_OP_deref, DW_OP_lit0, DW_OP_ne)) +! CHECK-DAG: ![[ELEMS2]] = !{![[ELEM1:[0-9]+]], ![[ELEM2:[0-9]+]]} +! CHECK-DAG: ![[ELEM1]] = !DISubrange +! CHECK-DAG: ![[ELEM2]] = !DISubrange +! CHECK-DAG: !DILocalVariable(name: "sc"{{.*}}type: ![[TY2:[0-9]+]]) +! CHECK-DAG: ![[TY2]] = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: ![[TY3:[0-9]+]]{{.*}}) +! CHECK-DAG: ![[TY3]] = !DIBasicType(name: "real"{{.*}}) diff --git a/flang/test/Lower/loops3.f90 b/flang/test/Lower/loops3.f90 index 2e62ee480ec8a6..78f39e1013082a 100644 --- a/flang/test/Lower/loops3.f90 +++ b/flang/test/Lower/loops3.f90 @@ -12,8 +12,8 @@ subroutine loop_test ! CHECK: %[[VAL_0:.*]] = fir.alloca f32 {bindc_name = "m", uniq_name = "_QFloop_testEm"} ! 
CHECK: %[[VAL_1:.*]] = fir.address_of(@_QFloop_testEsum) : !fir.ref - ! CHECK: fir.do_loop %{{.*}} = %{{.*}} to %{{.*}} step %{{.*}} unordered { - ! CHECK: fir.do_loop %{{.*}} = %{{.*}} to %{{.*}} step %{{.*}} unordered { + ! CHECK: fir.do_loop %{{.*}} = %{{.*}} to %{{.*}} step %{{.*}} unordered reduce(#fir.reduce_attr -> %[[VAL_1:.*]] : !fir.ref, #fir.reduce_attr -> %[[VAL_0:.*]] : !fir.ref) { + ! CHECK: fir.do_loop %{{.*}} = %{{.*}} to %{{.*}} step %{{.*}} unordered reduce(#fir.reduce_attr -> %[[VAL_1:.*]] : !fir.ref, #fir.reduce_attr -> %[[VAL_0:.*]] : !fir.ref) { ! CHECK: fir.do_loop %{{.*}} = %{{.*}} to %{{.*}} step %{{.*}} unordered reduce(#fir.reduce_attr -> %[[VAL_1:.*]] : !fir.ref, #fir.reduce_attr -> %[[VAL_0:.*]] : !fir.ref) { do concurrent (i=1:5, j=1:5, k=1:5) local(tmp) reduce(+:sum) reduce(max:m) tmp = i + j + k diff --git a/flang/test/Transforms/debug-allocatable-1.fir b/flang/test/Transforms/debug-allocatable-1.fir new file mode 100644 index 00000000000000..fd0beaddcdb707 --- /dev/null +++ b/flang/test/Transforms/debug-allocatable-1.fir @@ -0,0 +1,26 @@ +// RUN: fir-opt --add-debug-info --mlir-print-debuginfo %s | FileCheck %s + + +module attributes {dlti.dl_spec = #dlti.dl_spec<>} { + func.func private @_QFPff() { + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %0 = fir.undefined !fir.dscope + %1 = fir.alloca !fir.box>> {bindc_name = "ar2", uniq_name = "_QFFffEar2"} + %4 = fircg.ext_declare %1 {uniq_name = "_QFFffEar2"} : (!fir.ref>>>) -> !fir.ref>>> loc(#loc1) + %15 = fir.alloca !fir.box> {bindc_name = "sc", uniq_name = "_QFFffEsc"} + %18 = fircg.ext_declare %15 {uniq_name = "_QFFffEsc"} : (!fir.ref>>) -> !fir.ref>> loc(#loc2) + return + } loc(#loc3) +} + +#loc1 = loc("test.f90":3:3) +#loc2 = loc("test.f90":4:3) +#loc3 = loc("test.f90":1:3) + +// CHECK-DAG: #[[TY1:.*]] = #llvm.di_basic_type +// CHECK-DAG: #[[TY2:.*]] = #llvm.di_composite_type> +// CHECK-DAG: #[[TY3:.*]] = #llvm.di_derived_type + +// CHECK-DAG: 
#llvm.di_local_variable<{{.*}}name = "ar2"{{.*}}type = #[[TY2]]> +// CHECK-DAG: #llvm.di_local_variable<{{.*}}name = "sc"{{.*}}type = #[[TY3]]> diff --git a/libcxx/include/vector b/libcxx/include/vector index fb03f77dbe7671..ecefc6f6f5bcce 100644 --- a/libcxx/include/vector +++ b/libcxx/include/vector @@ -2983,7 +2983,9 @@ _LIBCPP_POP_MACROS # include # include # include +#if !defined(_LIBCPP_HAS_NO_LOCALIZATION) # include +#endif # include # include # include diff --git a/lld/COFF/InputFiles.cpp b/lld/COFF/InputFiles.cpp index 037fae45242c6f..c3340745e69335 100644 --- a/lld/COFF/InputFiles.cpp +++ b/lld/COFF/InputFiles.cpp @@ -818,19 +818,6 @@ void ObjFile::initializeDependencies() { debugTypesObj = makeTpiSource(ctx, this); } -// Make a PDB path assuming the PDB is in the same folder as the OBJ -static std::string getPdbBaseName(ObjFile *file, StringRef tSPath) { - StringRef localPath = - !file->parentName.empty() ? file->parentName : file->getName(); - SmallString<128> path = sys::path::parent_path(localPath); - - // Currently, type server PDBs are only created by MSVC cl, which only runs - // on Windows, so we can assume type server paths are Windows style. - sys::path::append(path, - sys::path::filename(tSPath, sys::path::Style::windows)); - return std::string(path); -} - // The casing of the PDB path stamped in the OBJ can differ from the actual path // on disk. With this, we ensure to always use lowercase as a key for the // pdbInputFileInstances map, at least on Windows. @@ -843,17 +830,35 @@ static std::string normalizePdbPath(StringRef path) { } // If existing, return the actual PDB path on disk. -static std::optional findPdbPath(StringRef pdbPath, - ObjFile *dependentFile) { +static std::optional +findPdbPath(StringRef pdbPath, ObjFile *dependentFile, StringRef outputPath) { // Ensure the file exists before anything else. 
In some cases, if the path // points to a removable device, Driver::enqueuePath() would fail with an // error (EAGAIN, "resource unavailable try again") which we want to skip // silently. if (llvm::sys::fs::exists(pdbPath)) return normalizePdbPath(pdbPath); - std::string ret = getPdbBaseName(dependentFile, pdbPath); - if (llvm::sys::fs::exists(ret)) - return normalizePdbPath(ret); + + StringRef objPath = !dependentFile->parentName.empty() + ? dependentFile->parentName + : dependentFile->getName(); + + // Currently, type server PDBs are only created by MSVC cl, which only runs + // on Windows, so we can assume type server paths are Windows style. + StringRef pdbName = sys::path::filename(pdbPath, sys::path::Style::windows); + + // Check if the PDB is in the same folder as the OBJ. + SmallString<128> path; + sys::path::append(path, sys::path::parent_path(objPath), pdbName); + if (llvm::sys::fs::exists(path)) + return normalizePdbPath(path); + + // Check if the PDB is in the output folder. 
+ path.clear(); + sys::path::append(path, sys::path::parent_path(outputPath), pdbName); + if (llvm::sys::fs::exists(path)) + return normalizePdbPath(path); + return std::nullopt; } @@ -865,7 +870,7 @@ PDBInputFile::~PDBInputFile() = default; PDBInputFile *PDBInputFile::findFromRecordPath(const COFFLinkerContext &ctx, StringRef path, ObjFile *fromFile) { - auto p = findPdbPath(path.str(), fromFile); + auto p = findPdbPath(path.str(), fromFile, ctx.config.outputFile); if (!p) return nullptr; auto it = ctx.pdbInputFileInstances.find(*p); @@ -931,7 +936,7 @@ std::optional ObjFile::getDILineInfo(uint32_t offset, } void ObjFile::enqueuePdbFile(StringRef path, ObjFile *fromFile) { - auto p = findPdbPath(path.str(), fromFile); + auto p = findPdbPath(path.str(), fromFile, ctx.config.outputFile); if (!p) return; auto it = ctx.pdbInputFileInstances.emplace(*p, nullptr); diff --git a/lld/test/COFF/pdb-type-server-simple.test b/lld/test/COFF/pdb-type-server-simple.test index e9757d187e2f10..93d66cde4f7129 100644 --- a/lld/test/COFF/pdb-type-server-simple.test +++ b/lld/test/COFF/pdb-type-server-simple.test @@ -27,6 +27,10 @@ Re-run with /DEBUG:GHASH RUN: lld-link a.obj b.obj -entry:main -debug:ghash -out:t.exe -pdb:t.pdb -nodefaultlib -summary -verbose RUN: llvm-pdbutil dump -symbols -types -ids -globals %t/t.pdb | FileCheck %s +Re-run with pdb from outputpath +RUN: mkdir -p libs +RUN: cp a.obj libs/a.obj && cp b.obj libs/b.obj +RUN: lld-link libs/a.obj libs/b.obj -entry:main -debug:ghash -out:t.exe -pdb:t.pdb -nodefaultlib -summary 2>&1 | FileCheck %s -check-prefix FAILURE-MISSING-PDBFILE CHECK-LABEL: Types (TPI Stream) CHECK: ============================================================ @@ -125,3 +129,6 @@ SUMMARY-NEXT: index total bytes count size SUMMARY-NEXT: 0x1006: 256 = 1 * 256 SUMMARY: Run llvm-pdbutil to print details about a particular record: SUMMARY-NEXT: llvm-pdbutil dump -ids -id-index 0x1006 t.pdb + +FAILURE-MISSING-PDBFILE-NOT: Cannot use debug info for 
'{{.*}}.obj' +FAILURE-MISSING-PDBFILE-NOT: failed to load reference '{{.*}}.pdb': no such file or directory diff --git a/lldb/include/lldb/Utility/SupportFile.h b/lldb/include/lldb/Utility/SupportFile.h index d65156cea768f4..21b986dcaba281 100644 --- a/lldb/include/lldb/Utility/SupportFile.h +++ b/lldb/include/lldb/Utility/SupportFile.h @@ -49,9 +49,6 @@ class SupportFile { /// Materialize the file to disk and return the path to that temporary file. virtual const FileSpec &Materialize() { return m_file_spec; } - /// Change the file name. - void Update(const FileSpec &file_spec) { m_file_spec = file_spec; } - protected: FileSpec m_file_spec; Checksum m_checksum; diff --git a/lldb/source/Symbol/LineEntry.cpp b/lldb/source/Symbol/LineEntry.cpp index 461399e0326e91..19e9bb561375b8 100644 --- a/lldb/source/Symbol/LineEntry.cpp +++ b/lldb/source/Symbol/LineEntry.cpp @@ -244,7 +244,9 @@ void LineEntry::ApplyFileMappings(lldb::TargetSP target_sp) { if (target_sp) { // Apply any file remappings to our file. if (auto new_file_spec = target_sp->GetSourcePathMap().FindFile( - original_file_sp->GetSpecOnly())) - file_sp->Update(*new_file_spec); + original_file_sp->GetSpecOnly())) { + file_sp = std::make_shared(*new_file_spec, + original_file_sp->GetChecksum()); + } } } diff --git a/lldb/test/API/lit.cfg.py b/lldb/test/API/lit.cfg.py index 1e99c8cb95d163..48c5f49e78e016 100644 --- a/lldb/test/API/lit.cfg.py +++ b/lldb/test/API/lit.cfg.py @@ -7,6 +7,7 @@ import shlex import shutil import subprocess +import sys import lit.formats @@ -64,7 +65,7 @@ def find_python_interpreter(): # If we're running in a virtual environment, we already have a copy of the # Python executable. - if "VIRTUAL_ENV" in config.environment: + if sys.prefix != sys.base_prefix: return None # Avoid doing any work if we already copied the binary. 
diff --git a/lldb/test/requirements.txt b/lldb/test/requirements.txt index 83fbe6a5e1b97b..708f2327a6a049 100644 --- a/lldb/test/requirements.txt +++ b/lldb/test/requirements.txt @@ -4,3 +4,4 @@ psutil>=5.9.4 # Pexpect tests are known not to work on Windows, so are disabled. # See llvm.org/pr22274. pexpect>=4.9.0; sys_platform != 'win32' +packaging diff --git a/llvm/docs/TableGen/ProgRef.rst b/llvm/docs/TableGen/ProgRef.rst index 45fda317ef6432..3919c2a9b1b621 100644 --- a/llvm/docs/TableGen/ProgRef.rst +++ b/llvm/docs/TableGen/ProgRef.rst @@ -1868,7 +1868,7 @@ and non-0 as true. result. A logical OR can be performed if all the arguments are either 0 or 1. -``!range([``\ *start*\ ``,]`` *end*\ ``[, ``\ *step*\ ``])`` +``!range([``\ *start*\ ``,]`` *end*\ ``[,``\ *step*\ ``])`` This operator produces half-open range sequence ``[start : end : step)`` as ``list``. *start* is ``0`` and *step* is ``1`` by default. *step* can be negative and cannot be 0. If *start* ``<`` *end* and *step* is negative, diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp index bb2c76d3d97604..aee9935d7a9c29 100644 --- a/llvm/lib/CodeGen/CodeGenPrepare.cpp +++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp @@ -8331,7 +8331,8 @@ bool CodeGenPrepare::optimizeInst(Instruction *I, ModifyDT &ModifiedDT) { if (OptimizeNoopCopyExpression(CI, *TLI, *DL)) return true; - if ((isa(I) || isa(I) || isa(I)) && + if ((isa(I) || isa(I) || isa(I) || + isa(I)) && TLI->optimizeExtendOrTruncateConversion( I, LI->getLoopFor(I->getParent()), *TTI)) return true; diff --git a/llvm/lib/CodeGen/MachineLICM.cpp b/llvm/lib/CodeGen/MachineLICM.cpp index 1c76d72ed51525..d81fe54fe844cd 100644 --- a/llvm/lib/CodeGen/MachineLICM.cpp +++ b/llvm/lib/CodeGen/MachineLICM.cpp @@ -440,8 +440,7 @@ static void applyBitsNotInRegMaskToRegUnitsMask(const TargetRegisterInfo &TRI, break; // Check if we have a valid PhysReg that is set in the mask. - // FIXME: We shouldn't have to check for PhysReg. 
- if (PhysReg && ((Word >> Bit) & 1)) { + if ((Word >> Bit) & 1) { for (MCRegUnitIterator RUI(PhysReg, &TRI); RUI.isValid(); ++RUI) ClobberedRUs.reset(*RUI); } diff --git a/llvm/lib/CodeGen/RegUsageInfoCollector.cpp b/llvm/lib/CodeGen/RegUsageInfoCollector.cpp index 6657cf3c1ef4ab..ca5e0b428c4772 100644 --- a/llvm/lib/CodeGen/RegUsageInfoCollector.cpp +++ b/llvm/lib/CodeGen/RegUsageInfoCollector.cpp @@ -142,6 +142,9 @@ bool RegUsageInfoCollector::runOnMachineFunction(MachineFunction &MF) { RegMask[Reg / 32] &= ~(1u << Reg % 32); }; + // Don't include $noreg in any regmasks. + SetRegAsDefined(MCRegister::NoRegister); + // Some targets can clobber registers "inside" a call, typically in // linker-generated code. for (const MCPhysReg Reg : TRI->getIntraCallClobberedRegs(&MF)) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 3dcd724e0408df..254d63abdf8053 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -5213,6 +5213,7 @@ SDValue DAGCombiner::visitAVG(SDNode *N) { SDValue N1 = N->getOperand(1); EVT VT = N->getValueType(0); SDLoc DL(N); + bool IsSigned = Opcode == ISD::AVGCEILS || Opcode == ISD::AVGFLOORS; // fold (avg c1, c2) if (SDValue C = DAG.FoldConstantArithmetic(Opcode, DL, VT, {N0, N1})) @@ -5248,33 +5249,19 @@ SDValue DAGCombiner::visitAVG(SDNode *N) { // fold avgu(zext(x), zext(y)) -> zext(avgu(x, y)) // fold avgs(sext(x), sext(y)) -> sext(avgs(x, y)) - if (sd_match( - N, m_BinOp(ISD::AVGFLOORU, m_ZExt(m_Value(X)), m_ZExt(m_Value(Y)))) && + if (!IsSigned && + sd_match(N, m_BinOp(Opcode, m_ZExt(m_Value(X)), m_ZExt(m_Value(Y)))) && X.getValueType() == Y.getValueType() && - hasOperation(ISD::AVGFLOORU, X.getValueType())) { - SDValue AvgFloorU = DAG.getNode(ISD::AVGFLOORU, DL, X.getValueType(), X, Y); - return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, AvgFloorU); + hasOperation(Opcode, X.getValueType())) { + SDValue AvgU = 
DAG.getNode(Opcode, DL, X.getValueType(), X, Y); + return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, AvgU); } - if (sd_match( - N, m_BinOp(ISD::AVGCEILU, m_ZExt(m_Value(X)), m_ZExt(m_Value(Y)))) && + if (IsSigned && + sd_match(N, m_BinOp(Opcode, m_SExt(m_Value(X)), m_SExt(m_Value(Y)))) && X.getValueType() == Y.getValueType() && - hasOperation(ISD::AVGCEILU, X.getValueType())) { - SDValue AvgCeilU = DAG.getNode(ISD::AVGCEILU, DL, X.getValueType(), X, Y); - return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, AvgCeilU); - } - if (sd_match( - N, m_BinOp(ISD::AVGFLOORS, m_SExt(m_Value(X)), m_SExt(m_Value(Y)))) && - X.getValueType() == Y.getValueType() && - hasOperation(ISD::AVGFLOORS, X.getValueType())) { - SDValue AvgFloorS = DAG.getNode(ISD::AVGFLOORS, DL, X.getValueType(), X, Y); - return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, AvgFloorS); - } - if (sd_match( - N, m_BinOp(ISD::AVGCEILS, m_SExt(m_Value(X)), m_SExt(m_Value(Y)))) && - X.getValueType() == Y.getValueType() && - hasOperation(ISD::AVGCEILS, X.getValueType())) { - SDValue AvgCeilS = DAG.getNode(ISD::AVGCEILS, DL, X.getValueType(), X, Y); - return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, AvgCeilS); + hasOperation(Opcode, X.getValueType())) { + SDValue AvgS = DAG.getNode(Opcode, DL, X.getValueType(), X, Y); + return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, AvgS); } // Fold avgflooru(x,y) -> avgceilu(x,y-1) iff y != 0 diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 394b741f1c1d0a..35871cc5ade7f8 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -15824,48 +15824,67 @@ bool AArch64TargetLowering::shouldSinkOperands( return false; } -static bool createTblShuffleForZExt(ZExtInst *ZExt, FixedVectorType *DstTy, - bool IsLittleEndian) { - Value *Op = ZExt->getOperand(0); - auto *SrcTy = cast(Op->getType()); - auto SrcWidth = cast(SrcTy->getElementType())->getBitWidth(); - auto DstWidth = 
cast(DstTy->getElementType())->getBitWidth(); +static bool createTblShuffleMask(unsigned SrcWidth, unsigned DstWidth, + unsigned NumElts, bool IsLittleEndian, + SmallVectorImpl &Mask) { if (DstWidth % 8 != 0 || DstWidth <= 16 || DstWidth >= 64) return false; assert(DstWidth % SrcWidth == 0 && - "TBL lowering is not supported for a ZExt instruction with this " - "source & destination element type."); - unsigned ZExtFactor = DstWidth / SrcWidth; + "TBL lowering is not supported for a conversion instruction with this " + "source and destination element type."); + + unsigned Factor = DstWidth / SrcWidth; + unsigned MaskLen = NumElts * Factor; + + Mask.clear(); + Mask.resize(MaskLen, NumElts); + + unsigned SrcIndex = 0; + for (unsigned I = IsLittleEndian ? 0 : Factor - 1; I < MaskLen; I += Factor) + Mask[I] = SrcIndex++; + + return true; +} + +static Value *createTblShuffleForZExt(IRBuilderBase &Builder, Value *Op, + FixedVectorType *ZExtTy, + FixedVectorType *DstTy, + bool IsLittleEndian) { + auto *SrcTy = cast(Op->getType()); unsigned NumElts = SrcTy->getNumElements(); - IRBuilder<> Builder(ZExt); + auto SrcWidth = cast(SrcTy->getElementType())->getBitWidth(); + auto DstWidth = cast(DstTy->getElementType())->getBitWidth(); + SmallVector Mask; - // Create a mask that selects <0,...,Op[i]> for each lane of the destination - // vector to replace the original ZExt. This can later be lowered to a set of - // tbl instructions. 
- for (unsigned i = 0; i < NumElts * ZExtFactor; i++) { - if (IsLittleEndian) { - if (i % ZExtFactor == 0) - Mask.push_back(i / ZExtFactor); - else - Mask.push_back(NumElts); - } else { - if ((i + 1) % ZExtFactor == 0) - Mask.push_back((i - ZExtFactor + 1) / ZExtFactor); - else - Mask.push_back(NumElts); - } - } + if (!createTblShuffleMask(SrcWidth, DstWidth, NumElts, IsLittleEndian, Mask)) + return nullptr; auto *FirstEltZero = Builder.CreateInsertElement( PoisonValue::get(SrcTy), Builder.getInt8(0), uint64_t(0)); Value *Result = Builder.CreateShuffleVector(Op, FirstEltZero, Mask); Result = Builder.CreateBitCast(Result, DstTy); - if (DstTy != ZExt->getType()) - Result = Builder.CreateZExt(Result, ZExt->getType()); - ZExt->replaceAllUsesWith(Result); - ZExt->eraseFromParent(); - return true; + if (DstTy != ZExtTy) + Result = Builder.CreateZExt(Result, ZExtTy); + return Result; +} + +static Value *createTblShuffleForSExt(IRBuilderBase &Builder, Value *Op, + FixedVectorType *DstTy, + bool IsLittleEndian) { + auto *SrcTy = cast(Op->getType()); + auto SrcWidth = cast(SrcTy->getElementType())->getBitWidth(); + auto DstWidth = cast(DstTy->getElementType())->getBitWidth(); + + SmallVector Mask; + if (!createTblShuffleMask(SrcWidth, DstWidth, SrcTy->getNumElements(), + !IsLittleEndian, Mask)) + return nullptr; + + auto *FirstEltZero = Builder.CreateInsertElement( + PoisonValue::get(SrcTy), Builder.getInt8(0), uint64_t(0)); + + return Builder.CreateShuffleVector(Op, FirstEltZero, Mask); } static void createTblForTrunc(TruncInst *TI, bool IsLittleEndian) { @@ -16030,21 +16049,45 @@ bool AArch64TargetLowering::optimizeExtendOrTruncateConversion( DstTy = TruncDstType; } - - return createTblShuffleForZExt(ZExt, DstTy, Subtarget->isLittleEndian()); + IRBuilder<> Builder(ZExt); + Value *Result = createTblShuffleForZExt( + Builder, ZExt->getOperand(0), cast(ZExt->getType()), + DstTy, Subtarget->isLittleEndian()); + if (!Result) + return false; + ZExt->replaceAllUsesWith(Result); + 
ZExt->eraseFromParent(); + return true; } auto *UIToFP = dyn_cast(I); if (UIToFP && SrcTy->getElementType()->isIntegerTy(8) && DstTy->getElementType()->isFloatTy()) { IRBuilder<> Builder(I); - auto *ZExt = cast( - Builder.CreateZExt(I->getOperand(0), VectorType::getInteger(DstTy))); + Value *ZExt = createTblShuffleForZExt( + Builder, I->getOperand(0), FixedVectorType::getInteger(DstTy), + FixedVectorType::getInteger(DstTy), Subtarget->isLittleEndian()); + assert(ZExt && "Cannot fail for the i8 to float conversion"); auto *UI = Builder.CreateUIToFP(ZExt, DstTy); I->replaceAllUsesWith(UI); I->eraseFromParent(); - return createTblShuffleForZExt(ZExt, cast(ZExt->getType()), - Subtarget->isLittleEndian()); + return true; + } + + auto *SIToFP = dyn_cast(I); + if (SIToFP && SrcTy->getElementType()->isIntegerTy(8) && + DstTy->getElementType()->isFloatTy()) { + IRBuilder<> Builder(I); + auto *Shuffle = createTblShuffleForSExt(Builder, I->getOperand(0), + FixedVectorType::getInteger(DstTy), + Subtarget->isLittleEndian()); + assert(Shuffle && "Cannot fail for the i8 to float conversion"); + auto *Cast = Builder.CreateBitCast(Shuffle, VectorType::getInteger(DstTy)); + auto *AShr = Builder.CreateAShr(Cast, 24, "", true); + auto *SI = Builder.CreateSIToFP(AShr, DstTy); + I->replaceAllUsesWith(SI); + I->eraseFromParent(); + return true; } // Convert 'fptoui <(8|16) x float> to <(8|16) x i8>' to a wide fptoui diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index d0d7a9dc172474..65c4abef2bf8aa 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -1168,8 +1168,7 @@ def FeatureGFX12 : GCNSubtargetFeatureGeneration<"GFX12", FeatureNoDataDepHazard, FeaturePkFmacF16Inst, FeatureA16, FeatureFastDenormalF32, FeatureG16, FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess, - FeatureTrue16BitInsts, FeatureDefaultComponentBroadcast, - FeatureMaxHardClauseLength32 + FeatureTrue16BitInsts, FeatureDefaultComponentBroadcast ] 
>; @@ -1572,7 +1571,8 @@ def FeatureISAVersion12 : FeatureSet< FeatureHasRestrictedSOffset, FeatureVGPRSingleUseHintInsts, FeatureScalarDwordx3Loads, - FeatureDPPSrc1SGPR]>; + FeatureDPPSrc1SGPR, + FeatureMaxHardClauseLength32]>; def FeatureISAVersion12_Generic: FeatureSet< !listconcat(FeatureISAVersion12.Features, diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index 77fd967364d021..973b6b8cce1772 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -1659,6 +1659,13 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, }); } + if (ST.hasAtomicBufferGlobalPkAddF16Insts()) + Atomic.legalFor({{V2F16, GlobalPtr}}); + if (ST.hasAtomicGlobalPkAddBF16Inst()) + Atomic.legalFor({{V2BF16, GlobalPtr}}); + if (ST.hasAtomicFlatPkAdd16Insts()) + Atomic.legalFor({{V2F16, FlatPtr}, {V2BF16, FlatPtr}}); + // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output // demarshalling getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG) @@ -6013,6 +6020,8 @@ static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD; case Intrinsic::amdgcn_raw_buffer_atomic_fadd_v2bf16: case Intrinsic::amdgcn_struct_buffer_atomic_fadd_v2bf16: + case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd_v2bf16: + case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd_v2bf16: return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD_BF16; case Intrinsic::amdgcn_raw_buffer_atomic_fmin: case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin: diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td index 818cbde5924324..99e3a8084f067e 100644 --- a/llvm/lib/Target/AMDGPU/FLATInstructions.td +++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -1645,6 +1645,7 @@ defm : GlobalFLATAtomicPatsRtnWithAddrSpace <"GLOBAL_ATOMIC_ADD_F32", "int_amdgc let OtherPredicates = [HasAtomicBufferGlobalPkAddF16Insts] in 
{ defm : GlobalFLATAtomicPatsRtnWithAddrSpace <"GLOBAL_ATOMIC_PK_ADD_F16", "int_amdgcn_flat_atomic_fadd", "global_addrspace", v2f16>; defm : GlobalFLATAtomicPatsRtnWithAddrSpace <"GLOBAL_ATOMIC_PK_ADD_F16", "int_amdgcn_global_atomic_fadd", "global_addrspace", v2f16>; +defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_PK_ADD_F16", "atomic_load_fadd_global", v2f16>; } let OtherPredicates = [HasBufferFlatGlobalAtomicsF64] in { @@ -1669,13 +1670,16 @@ defm : FlatSignedAtomicPatWithAddrSpace <"FLAT_ATOMIC_ADD_F32", "int_amdgcn_flat } let OtherPredicates = [HasAtomicFlatPkAdd16Insts] in { +// FIXME: These do not have signed offsets defm : FlatSignedAtomicPatWithAddrSpace <"FLAT_ATOMIC_PK_ADD_F16", "int_amdgcn_flat_atomic_fadd", "flat_addrspace", v2f16>; defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_PK_ADD_BF16", "int_amdgcn_flat_atomic_fadd_v2bf16", v2i16>; +defm : FlatSignedAtomicPat <"FLAT_ATOMIC_PK_ADD_F16", "atomic_load_fadd_flat", v2f16>; +defm : FlatSignedAtomicPat <"FLAT_ATOMIC_PK_ADD_BF16", "atomic_load_fadd_flat", v2bf16>; } let OtherPredicates = [HasAtomicGlobalPkAddBF16Inst] in defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_PK_ADD_BF16", "int_amdgcn_global_atomic_fadd_v2bf16", v2i16>; - +defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_PK_ADD_BF16", "atomic_load_fadd_global", v2bf16>; } // End OtherPredicates = [HasFlatGlobalInsts], AddedComplexity = 10 let OtherPredicates = [HasFlatScratchInsts, EnableFlatScratch] in { diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 0a93428a41b18f..d9a163ded6bab7 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -8841,6 +8841,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd: return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD); case Intrinsic::amdgcn_struct_buffer_atomic_fadd_v2bf16: + case 
Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd_v2bf16: return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD_BF16); case Intrinsic::amdgcn_raw_buffer_atomic_fmin: @@ -15943,6 +15944,16 @@ static bool isHalf2OrBFloat2(Type *Ty) { return false; } +static bool isHalf2(Type *Ty) { + FixedVectorType *VT = dyn_cast(Ty); + return VT && VT->getNumElements() == 2 && VT->getElementType()->isHalfTy(); +} + +static bool isBFloat2(Type *Ty) { + FixedVectorType *VT = dyn_cast(Ty); + return VT && VT->getNumElements() == 2 && VT->getElementType()->isBFloatTy(); +} + TargetLowering::AtomicExpansionKind SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const { unsigned AS = RMW->getPointerAddressSpace(); @@ -16011,10 +16022,29 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const { AS != AMDGPUAS::BUFFER_FAT_POINTER) return AtomicExpansionKind::CmpXChg; - // TODO: gfx940 supports v2f16 and v2bf16 if (Subtarget->hasGFX940Insts() && (Ty->isFloatTy() || Ty->isDoubleTy())) return AtomicExpansionKind::None; + if (AS == AMDGPUAS::FLAT_ADDRESS) { + // gfx940, gfx12 + // FIXME: Needs to account for no fine-grained memory + if (Subtarget->hasAtomicFlatPkAdd16Insts() && isHalf2OrBFloat2(Ty)) + return AtomicExpansionKind::None; + } else if (AMDGPU::isExtendedGlobalAddrSpace(AS)) { + // gfx90a, gfx940, gfx12 + // FIXME: Needs to account for no fine-grained memory + if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isHalf2(Ty)) + return AtomicExpansionKind::None; + + // gfx940, gfx12 + // FIXME: Needs to account for no fine-grained memory + if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isBFloat2(Ty)) + return AtomicExpansionKind::None; + } + + // TODO: Handle buffer case. gfx90a and gfx940 supports <2 x half>. gfx12 + // supports <2 x half> and <2 x bfloat>. 
+ if (unsafeFPAtomicsDisabled(RMW->getFunction())) return AtomicExpansionKind::CmpXChg; diff --git a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp index e7c13f180f64e6..151a7821f835da 100644 --- a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp +++ b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp @@ -404,7 +404,9 @@ DemandedFields getDemanded(const MachineInstr &MI, const RISCVSubtarget *ST) { if (RISCVII::hasSEWOp(TSFlags)) { Res.demandVTYPE(); if (RISCVII::hasVLOp(TSFlags)) - Res.demandVL(); + if (const MachineOperand &VLOp = MI.getOperand(getVLOpNum(MI)); + !VLOp.isReg() || !VLOp.isUndef()) + Res.demandVL(); // Behavior is independent of mask policy. if (!RISCVII::usesMaskPolicy(TSFlags)) @@ -524,7 +526,6 @@ class VSETVLIInfo { AVLIsReg, AVLIsImm, AVLIsVLMAX, - AVLIsIgnored, Unknown, } State = Uninitialized; @@ -564,12 +565,9 @@ class VSETVLIInfo { void setAVLVLMAX() { State = AVLIsVLMAX; } - void setAVLIgnored() { State = AVLIsIgnored; } - bool hasAVLImm() const { return State == AVLIsImm; } bool hasAVLReg() const { return State == AVLIsReg; } bool hasAVLVLMAX() const { return State == AVLIsVLMAX; } - bool hasAVLIgnored() const { return State == AVLIsIgnored; } Register getAVLReg() const { assert(hasAVLReg() && AVLRegDef.DefReg.isVirtual()); return AVLRegDef.DefReg; @@ -600,8 +598,6 @@ class VSETVLIInfo { setAVLRegDef(Info.getAVLVNInfo(), Info.getAVLReg()); else if (Info.hasAVLVLMAX()) setAVLVLMAX(); - else if (Info.hasAVLIgnored()) - setAVLIgnored(); else { assert(Info.hasAVLImm()); setAVLImm(Info.getAVLImm()); @@ -622,8 +618,6 @@ class VSETVLIInfo { } if (hasAVLVLMAX()) return true; - if (hasAVLIgnored()) - return false; return false; } @@ -645,9 +639,6 @@ class VSETVLIInfo { if (hasAVLVLMAX()) return Other.hasAVLVLMAX() && hasSameVLMAX(Other); - if (hasAVLIgnored()) - return Other.hasAVLIgnored(); - return false; } @@ -821,8 +812,6 @@ class VSETVLIInfo { OS << "AVLImm=" << (unsigned)AVLImm; if (hasAVLVLMAX()) OS << 
"AVLVLMAX"; - if (hasAVLIgnored()) - OS << "AVLIgnored"; OS << ", " << "VLMul=" << (unsigned)VLMul << ", " << "SEW=" << (unsigned)SEW << ", " @@ -938,7 +927,8 @@ RISCVInsertVSETVLI::getInfoForVSETVLI(const MachineInstr &MI) const { if (AVLReg == RISCV::X0) NewInfo.setAVLVLMAX(); else if (MI.getOperand(1).isUndef()) - NewInfo.setAVLIgnored(); + // Otherwise use an AVL of 1 to avoid depending on previous vl. + NewInfo.setAVLImm(1); else { VNInfo *VNI = getVNInfoFromReg(AVLReg, MI, LIS); NewInfo.setAVLRegDef(VNI, AVLReg); @@ -1014,17 +1004,17 @@ RISCVInsertVSETVLI::computeInfoForInstr(const MachineInstr &MI) const { else InstrInfo.setAVLImm(Imm); } else if (VLOp.isUndef()) { - InstrInfo.setAVLIgnored(); + // Otherwise use an AVL of 1 to avoid depending on previous vl. + InstrInfo.setAVLImm(1); } else { VNInfo *VNI = getVNInfoFromReg(VLOp.getReg(), MI, LIS); InstrInfo.setAVLRegDef(VNI, VLOp.getReg()); } } else { assert(isScalarExtractInstr(MI)); - // TODO: If we are more clever about x0,x0 insertion then we should be able - // to deduce that the VL is ignored based off of DemandedFields, and remove - // the AVLIsIgnored state. Then we can just use an arbitrary immediate AVL. - InstrInfo.setAVLIgnored(); + // Pick a random value for state tracking purposes, will be ignored via + // the demanded fields mechanism + InstrInfo.setAVLImm(1); } #ifndef NDEBUG if (std::optional EEW = getEEWForLoadStore(MI)) { @@ -1104,28 +1094,6 @@ void RISCVInsertVSETVLI::insertVSETVLI(MachineBasicBlock &MBB, return; } - if (Info.hasAVLIgnored()) { - // We can only use x0, x0 if there's no chance of the vtype change causing - // the previous vl to become invalid. 
- if (PrevInfo.isValid() && !PrevInfo.isUnknown() && - Info.hasSameVLMAX(PrevInfo)) { - auto MI = BuildMI(MBB, InsertPt, DL, TII->get(RISCV::PseudoVSETVLIX0)) - .addReg(RISCV::X0, RegState::Define | RegState::Dead) - .addReg(RISCV::X0, RegState::Kill) - .addImm(Info.encodeVTYPE()) - .addReg(RISCV::VL, RegState::Implicit); - LIS->InsertMachineInstrInMaps(*MI); - return; - } - // Otherwise use an AVL of 1 to avoid depending on previous vl. - auto MI = BuildMI(MBB, InsertPt, DL, TII->get(RISCV::PseudoVSETIVLI)) - .addReg(RISCV::X0, RegState::Define | RegState::Dead) - .addImm(1) - .addImm(Info.encodeVTYPE()); - LIS->InsertMachineInstrInMaps(*MI); - return; - } - if (Info.hasAVLVLMAX()) { Register DestReg = MRI->createVirtualRegister(&RISCV::GPRRegClass); auto MI = BuildMI(MBB, InsertPt, DL, TII->get(RISCV::PseudoVSETVLIX0)) @@ -1534,11 +1502,6 @@ void RISCVInsertVSETVLI::doPRE(MachineBasicBlock &MBB) { return; } - // If the AVL isn't used in its predecessors then bail, since we have no AVL - // to insert a vsetvli with. - if (AvailableInfo.hasAVLIgnored()) - return; - // Model the effect of changing the input state of the block MBB to // AvailableInfo. We're looking for two issues here; one legality, // one profitability. diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp index 64cacd74153fe1..de0144331dba3c 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -4262,6 +4262,37 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, } if (ISD != ISD::DELETED_NODE) { + auto adjustTableCost = [&](int ISD, unsigned Cost, + std::pair LT, + FastMathFlags FMF) -> InstructionCost { + InstructionCost LegalizationCost = LT.first; + MVT MTy = LT.second; + + // If there are no NANs to deal with, then these are reduced to a + // single MIN** or MAX** instruction instead of the MIN/CMP/SELECT that we + // assume is used in the non-fast case. 
+ if (ISD == ISD::FMAXNUM || ISD == ISD::FMINNUM) { + if (FMF.noNaNs()) + return LegalizationCost * 1; + } + + // For cases where some ops can be folded into a load/store, assume free. + if (MTy.isScalarInteger()) { + if (ISD == ISD::BSWAP && ST->hasMOVBE() && ST->hasFastMOVBE()) { + if (const Instruction *II = ICA.getInst()) { + if (II->hasOneUse() && isa(II->user_back())) + return TTI::TCC_Free; + if (auto *LI = dyn_cast(II->getOperand(0))) { + if (LI->hasOneUse()) + return TTI::TCC_Free; + } + } + } + } + + return LegalizationCost * (int)Cost; + }; + // Legalize the type. std::pair LT = getTypeLegalizationCost(OpTy); MVT MTy = LT.second; @@ -4280,180 +4311,132 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, if (ISD == ISD::FSQRT && CostKind == TTI::TCK_CodeSize) return LT.first; - auto adjustTableCost = [](int ISD, unsigned Cost, - InstructionCost LegalizationCost, - FastMathFlags FMF) { - // If there are no NANs to deal with, then these are reduced to a - // single MIN** or MAX** instruction instead of the MIN/CMP/SELECT that we - // assume is used in the non-fast case. 
- if (ISD == ISD::FMAXNUM || ISD == ISD::FMINNUM) { - if (FMF.noNaNs()) - return LegalizationCost * 1; - } - return LegalizationCost * (int)Cost; - }; - if (ST->useGLMDivSqrtCosts()) if (const auto *Entry = CostTableLookup(GLMCostTbl, ISD, MTy)) if (auto KindCost = Entry->Cost[CostKind]) - return adjustTableCost(Entry->ISD, *KindCost, LT.first, - ICA.getFlags()); + return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); if (ST->useSLMArithCosts()) if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy)) if (auto KindCost = Entry->Cost[CostKind]) - return adjustTableCost(Entry->ISD, *KindCost, LT.first, - ICA.getFlags()); + return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); if (ST->hasVBMI2()) if (const auto *Entry = CostTableLookup(AVX512VBMI2CostTbl, ISD, MTy)) if (auto KindCost = Entry->Cost[CostKind]) - return adjustTableCost(Entry->ISD, *KindCost, LT.first, - ICA.getFlags()); + return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); if (ST->hasBITALG()) if (const auto *Entry = CostTableLookup(AVX512BITALGCostTbl, ISD, MTy)) if (auto KindCost = Entry->Cost[CostKind]) - return adjustTableCost(Entry->ISD, *KindCost, LT.first, - ICA.getFlags()); + return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); if (ST->hasVPOPCNTDQ()) if (const auto *Entry = CostTableLookup(AVX512VPOPCNTDQCostTbl, ISD, MTy)) if (auto KindCost = Entry->Cost[CostKind]) - return adjustTableCost(Entry->ISD, *KindCost, LT.first, - ICA.getFlags()); + return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); if (ST->hasGFNI()) if (const auto *Entry = CostTableLookup(GFNICostTbl, ISD, MTy)) if (auto KindCost = Entry->Cost[CostKind]) - return adjustTableCost(Entry->ISD, *KindCost, LT.first, - ICA.getFlags()); + return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); if (ST->hasCDI()) if (const auto *Entry = CostTableLookup(AVX512CDCostTbl, ISD, MTy)) if (auto KindCost = Entry->Cost[CostKind]) - return 
adjustTableCost(Entry->ISD, *KindCost, LT.first, - ICA.getFlags()); + return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); if (ST->hasBWI()) if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy)) if (auto KindCost = Entry->Cost[CostKind]) - return adjustTableCost(Entry->ISD, *KindCost, LT.first, - ICA.getFlags()); + return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); if (ST->hasAVX512()) if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy)) if (auto KindCost = Entry->Cost[CostKind]) - return adjustTableCost(Entry->ISD, *KindCost, LT.first, - ICA.getFlags()); + return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); if (ST->hasXOP()) if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy)) if (auto KindCost = Entry->Cost[CostKind]) - return adjustTableCost(Entry->ISD, *KindCost, LT.first, - ICA.getFlags()); + return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); if (ST->hasAVX2()) if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy)) if (auto KindCost = Entry->Cost[CostKind]) - return adjustTableCost(Entry->ISD, *KindCost, LT.first, - ICA.getFlags()); + return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); if (ST->hasAVX()) if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy)) if (auto KindCost = Entry->Cost[CostKind]) - return adjustTableCost(Entry->ISD, *KindCost, LT.first, - ICA.getFlags()); + return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); if (ST->hasSSE42()) if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy)) if (auto KindCost = Entry->Cost[CostKind]) - return adjustTableCost(Entry->ISD, *KindCost, LT.first, - ICA.getFlags()); + return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); if (ST->hasSSE41()) if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy)) if (auto KindCost = Entry->Cost[CostKind]) - return adjustTableCost(Entry->ISD, *KindCost, LT.first, - ICA.getFlags()); + return 
adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); if (ST->hasSSSE3()) if (const auto *Entry = CostTableLookup(SSSE3CostTbl, ISD, MTy)) if (auto KindCost = Entry->Cost[CostKind]) - return adjustTableCost(Entry->ISD, *KindCost, LT.first, - ICA.getFlags()); + return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); if (ST->hasSSE2()) if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy)) if (auto KindCost = Entry->Cost[CostKind]) - return adjustTableCost(Entry->ISD, *KindCost, LT.first, - ICA.getFlags()); + return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); if (ST->hasSSE1()) if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy)) if (auto KindCost = Entry->Cost[CostKind]) - return adjustTableCost(Entry->ISD, *KindCost, LT.first, - ICA.getFlags()); + return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); if (ST->hasBMI()) { if (ST->is64Bit()) if (const auto *Entry = CostTableLookup(BMI64CostTbl, ISD, MTy)) if (auto KindCost = Entry->Cost[CostKind]) - return adjustTableCost(Entry->ISD, *KindCost, LT.first, - ICA.getFlags()); + return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); if (const auto *Entry = CostTableLookup(BMI32CostTbl, ISD, MTy)) if (auto KindCost = Entry->Cost[CostKind]) - return adjustTableCost(Entry->ISD, *KindCost, LT.first, - ICA.getFlags()); + return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); } if (ST->hasLZCNT()) { if (ST->is64Bit()) if (const auto *Entry = CostTableLookup(LZCNT64CostTbl, ISD, MTy)) if (auto KindCost = Entry->Cost[CostKind]) - return adjustTableCost(Entry->ISD, *KindCost, LT.first, - ICA.getFlags()); + return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); if (const auto *Entry = CostTableLookup(LZCNT32CostTbl, ISD, MTy)) if (auto KindCost = Entry->Cost[CostKind]) - return adjustTableCost(Entry->ISD, *KindCost, LT.first, - ICA.getFlags()); + return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); } if 
(ST->hasPOPCNT()) { if (ST->is64Bit()) if (const auto *Entry = CostTableLookup(POPCNT64CostTbl, ISD, MTy)) if (auto KindCost = Entry->Cost[CostKind]) - return adjustTableCost(Entry->ISD, *KindCost, LT.first, - ICA.getFlags()); + return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); if (const auto *Entry = CostTableLookup(POPCNT32CostTbl, ISD, MTy)) if (auto KindCost = Entry->Cost[CostKind]) - return adjustTableCost(Entry->ISD, *KindCost, LT.first, - ICA.getFlags()); - } - - if (ISD == ISD::BSWAP && ST->hasMOVBE() && ST->hasFastMOVBE()) { - if (const Instruction *II = ICA.getInst()) { - if (II->hasOneUse() && isa(II->user_back())) - return TTI::TCC_Free; - if (auto *LI = dyn_cast(II->getOperand(0))) { - if (LI->hasOneUse()) - return TTI::TCC_Free; - } - } + return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); } if (ST->is64Bit()) if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy)) if (auto KindCost = Entry->Cost[CostKind]) - return adjustTableCost(Entry->ISD, *KindCost, LT.first, - ICA.getFlags()); + return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy)) if (auto KindCost = Entry->Cost[CostKind]) - return adjustTableCost(Entry->ISD, *KindCost, LT.first, ICA.getFlags()); + return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); } return BaseT::getIntrinsicInstrCost(ICA, CostKind); diff --git a/llvm/lib/Transforms/IPO/SampleProfileMatcher.cpp b/llvm/lib/Transforms/IPO/SampleProfileMatcher.cpp index d7613bce4c52ea..11368e3375bddd 100644 --- a/llvm/lib/Transforms/IPO/SampleProfileMatcher.cpp +++ b/llvm/lib/Transforms/IPO/SampleProfileMatcher.cpp @@ -14,6 +14,7 @@ #include "llvm/Transforms/IPO/SampleProfileMatcher.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/MDBuilder.h" +#include "llvm/Support/CommandLine.h" using namespace llvm; using namespace sampleprof; @@ -24,6 +25,11 @@ extern cl::opt SalvageStaleProfile; extern cl::opt 
PersistProfileStaleness; extern cl::opt ReportProfileStaleness; +static cl::opt SalvageStaleProfileMaxCallsites( + "salvage-stale-profile-max-callsites", cl::Hidden, cl::init(UINT_MAX), + cl::desc("The maximum number of callsites in a function, above which stale " + "profile matching will be skipped.")); + void SampleProfileMatcher::findIRAnchors(const Function &F, AnchorMap &IRAnchors) { // For inlined code, recover the original callsite and callee by finding the @@ -300,6 +306,16 @@ void SampleProfileMatcher::runStaleProfileMatching( if (FilteredIRAnchorsList.empty() || FilteredProfileAnchorList.empty()) return; + if (FilteredIRAnchorsList.size() > SalvageStaleProfileMaxCallsites || + FilteredProfileAnchorList.size() > SalvageStaleProfileMaxCallsites) { + LLVM_DEBUG(dbgs() << "Skip stale profile matching for " << F.getName() + << " because the number of callsites in the IR is " + << FilteredIRAnchorsList.size() + << " and in the profile is " + << FilteredProfileAnchorList.size() << "\n"); + return; + } + // Match the callsite anchors by finding the longest common subsequence // between IR and profile. Note that we need to use IR anchor as base(A side) // to align with the order of IRToProfileLocationMap. 
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp index 1b4c319032cab8..6a6bdba3fa70ad 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp @@ -291,10 +291,12 @@ static bool canEvaluateTruncated(Value *V, Type *Ty, InstCombinerImpl &IC, uint32_t BitWidth = Ty->getScalarSizeInBits(); assert(BitWidth < OrigBitWidth && "Unexpected bitwidths!"); APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth); - if (IC.MaskedValueIsZero(I->getOperand(0), Mask, 0, CxtI) && - IC.MaskedValueIsZero(I->getOperand(1), Mask, 0, CxtI)) { - return canEvaluateTruncated(I->getOperand(0), Ty, IC, CxtI) && - canEvaluateTruncated(I->getOperand(1), Ty, IC, CxtI); + // Do not preserve the original context instruction. Simplifying div/rem + // based on later context may introduce a trap. + if (IC.MaskedValueIsZero(I->getOperand(0), Mask, 0, I) && + IC.MaskedValueIsZero(I->getOperand(1), Mask, 0, I)) { + return canEvaluateTruncated(I->getOperand(0), Ty, IC, I) && + canEvaluateTruncated(I->getOperand(1), Ty, IC, I); } break; } diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp index 34b0f8b8604974..522c31f9e3e716 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -3130,6 +3130,13 @@ Instruction *InstCombinerImpl::foldICmpAddConstant(ICmpInst &Cmp, return new ICmpInst(ICmpInst::ICMP_EQ, Builder.CreateAnd(X, -C), ConstantExpr::getNeg(cast(Y))); + // X+C2 (X & C) == 2C + // iff C == -(C2) + // C2 is a power of 2 + if (Pred == ICmpInst::ICMP_ULT && C2->isPowerOf2() && C == -*C2) + return new ICmpInst(ICmpInst::ICMP_NE, Builder.CreateAnd(X, C), + ConstantInt::get(Ty, C * 2)); + // X+C >u C2 -> (X & ~C2) != C // iff C & C2 == 0 // C2+1 is a power of 2 diff --git 
a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 84fee55d187418..408083765ccb16 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -2971,32 +2971,33 @@ void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI, nullptr, Twine(Prefix) + "scalar.ph"); - auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); - // Set up the middle block terminator. Two cases: - // 1) If we know that we must execute the scalar epilogue, emit an - // unconditional branch. + // 1) If we know that we must execute the scalar epilogue, retain the existing + // unconditional branch from the middle block to the scalar preheader. In that + // case, there's no edge from the middle block to exit blocks and thus no + // need to update the immediate dominator of the exit blocks. + if (Cost->requiresScalarEpilogue(VF.isVector())) { + assert( + LoopMiddleBlock->getSingleSuccessor() == LoopScalarPreHeader && + " middle block should have the scalar preheader as single successor"); + return; + } + // 2) Otherwise, we must have a single unique exit block (due to how we // implement the multiple exit case). In this case, set up a conditional // branch from the middle block to the loop scalar preheader, and the // exit block. completeLoopSkeleton will update the condition to use an // iteration check, if required to decide whether to execute the remainder. BranchInst *BrInst = - Cost->requiresScalarEpilogue(VF.isVector()) - ? 
BranchInst::Create(LoopScalarPreHeader) - : BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, - Builder.getTrue()); + BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, Builder.getTrue()); + auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc()); ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst); // Update dominator for loop exit. During skeleton creation, only the vector // pre-header and the middle block are created. The vector loop is entirely // created during VPlan exection. - if (!Cost->requiresScalarEpilogue(VF.isVector())) - // If there is an epilogue which must run, there's no edge from the - // middle block to exit blocks and thus no need to update the immediate - // dominator of the exit blocks. - DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock); + DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock); } PHINode *InnerLoopVectorizer::createInductionResumeValue( diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index e608c7fb604681..5b9fe1c9c9854d 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -1669,33 +1669,34 @@ bool VectorCombine::foldShuffleOfShuffles(Instruction &I) { return true; } -using InstLane = std::pair; +using InstLane = std::pair; -static InstLane lookThroughShuffles(Value *V, int Lane) { - while (auto *SV = dyn_cast(V)) { +static InstLane lookThroughShuffles(Use *U, int Lane) { + while (auto *SV = dyn_cast(U->get())) { unsigned NumElts = cast(SV->getOperand(0)->getType())->getNumElements(); int M = SV->getMaskValue(Lane); if (M < 0) return {nullptr, PoisonMaskElem}; if (static_cast(M) < NumElts) { - V = SV->getOperand(0); + U = &SV->getOperandUse(0); Lane = M; } else { - V = SV->getOperand(1); + U = &SV->getOperandUse(1); Lane = M - NumElts; } } - return InstLane{V, Lane}; + return InstLane{U, Lane}; } static 
SmallVector generateInstLaneVectorFromOperand(ArrayRef Item, int Op) { SmallVector NItem; for (InstLane IL : Item) { - auto [V, Lane] = IL; + auto [U, Lane] = IL; InstLane OpLane = - V ? lookThroughShuffles(cast(V)->getOperand(Op), Lane) + U ? lookThroughShuffles(&cast(U->get())->getOperandUse(Op), + Lane) : InstLane{nullptr, PoisonMaskElem}; NItem.emplace_back(OpLane); } @@ -1703,29 +1704,24 @@ generateInstLaneVectorFromOperand(ArrayRef Item, int Op) { } static Value *generateNewInstTree(ArrayRef Item, FixedVectorType *Ty, - const SmallPtrSet &IdentityLeafs, - const SmallPtrSet &SplatLeafs, + const SmallPtrSet &IdentityLeafs, + const SmallPtrSet &SplatLeafs, IRBuilder<> &Builder) { - auto [FrontV, FrontLane] = Item.front(); - - if (IdentityLeafs.contains(FrontV) && - all_of(drop_begin(enumerate(Item)), [Item](const auto &E) { - Value *FrontV = Item.front().first; - auto [V, Lane] = E.value(); - return !V || (V == FrontV && Lane == (int)E.index()); - })) { - return FrontV; + auto [FrontU, FrontLane] = Item.front(); + + if (IdentityLeafs.contains(FrontU)) { + return FrontU->get(); } - if (SplatLeafs.contains(FrontV)) { - if (auto *ILI = dyn_cast(FrontV)) + if (SplatLeafs.contains(FrontU)) { + if (auto *ILI = dyn_cast(FrontU)) Builder.SetInsertPoint(*ILI->getInsertionPointAfterDef()); - else if (auto *Arg = dyn_cast(FrontV)) + else if (auto *Arg = dyn_cast(FrontU)) Builder.SetInsertPointPastAllocas(Arg->getParent()); SmallVector Mask(Ty->getNumElements(), FrontLane); - return Builder.CreateShuffleVector(FrontV, Mask); + return Builder.CreateShuffleVector(FrontU->get(), Mask); } - auto *I = cast(FrontV); + auto *I = cast(FrontU->get()); auto *II = dyn_cast(I); unsigned NumOps = I->getNumOperands() - (II ? 
1 : 0); SmallVector Ops(NumOps); @@ -1741,7 +1737,7 @@ static Value *generateNewInstTree(ArrayRef Item, FixedVectorType *Ty, SmallVector ValueList; for (const auto &Lane : Item) if (Lane.first) - ValueList.push_back(Lane.first); + ValueList.push_back(Lane.first->get()); Builder.SetInsertPoint(I); Type *DstTy = @@ -1785,16 +1781,16 @@ static Value *generateNewInstTree(ArrayRef Item, FixedVectorType *Ty, // do so. bool VectorCombine::foldShuffleToIdentity(Instruction &I) { auto *Ty = dyn_cast(I.getType()); - if (!Ty) + if (!Ty || I.use_empty()) return false; SmallVector Start(Ty->getNumElements()); for (unsigned M = 0, E = Ty->getNumElements(); M < E; ++M) - Start[M] = lookThroughShuffles(&I, M); + Start[M] = lookThroughShuffles(&*I.use_begin(), M); SmallVector> Worklist; Worklist.push_back(Start); - SmallPtrSet IdentityLeafs, SplatLeafs; + SmallPtrSet IdentityLeafs, SplatLeafs; unsigned NumVisited = 0; while (!Worklist.empty()) { @@ -1802,52 +1798,52 @@ bool VectorCombine::foldShuffleToIdentity(Instruction &I) { return false; SmallVector Item = Worklist.pop_back_val(); - auto [FrontV, FrontLane] = Item.front(); + auto [FrontU, FrontLane] = Item.front(); // If we found an undef first lane then bail out to keep things simple. - if (!FrontV) + if (!FrontU) return false; // Look for an identity value. - if (!FrontLane && - cast(FrontV->getType())->getNumElements() == + if (FrontLane == 0 && + cast(FrontU->get()->getType())->getNumElements() == Ty->getNumElements() && all_of(drop_begin(enumerate(Item)), [Item](const auto &E) { - Value *FrontV = Item.front().first; - return !E.value().first || (E.value().first == FrontV && + Value *FrontV = Item.front().first->get(); + return !E.value().first || (E.value().first->get() == FrontV && E.value().second == (int)E.index()); })) { - IdentityLeafs.insert(FrontV); + IdentityLeafs.insert(FrontU); continue; } // Look for constants, for the moment only supporting constant splats. 
- if (auto *C = dyn_cast(FrontV); + if (auto *C = dyn_cast(FrontU); C && C->getSplatValue() && all_of(drop_begin(Item), [Item](InstLane &IL) { - Value *FrontV = Item.front().first; - Value *V = IL.first; - return !V || V == FrontV; + Value *FrontV = Item.front().first->get(); + Use *U = IL.first; + return !U || U->get() == FrontV; })) { - SplatLeafs.insert(FrontV); + SplatLeafs.insert(FrontU); continue; } // Look for a splat value. if (all_of(drop_begin(Item), [Item](InstLane &IL) { - auto [FrontV, FrontLane] = Item.front(); - auto [V, Lane] = IL; - return !V || (V == FrontV && Lane == FrontLane); + auto [FrontU, FrontLane] = Item.front(); + auto [U, Lane] = IL; + return !U || (U->get() == FrontU->get() && Lane == FrontLane); })) { - SplatLeafs.insert(FrontV); + SplatLeafs.insert(FrontU); continue; } // We need each element to be the same type of value, and check that each // element has a single use. if (!all_of(drop_begin(Item), [Item](InstLane IL) { - Value *FrontV = Item.front().first; - Value *V = IL.first; - if (!V) + Value *FrontV = Item.front().first->get(); + if (!IL.first) return true; + Value *V = IL.first->get(); if (auto *I = dyn_cast(V); I && !I->hasOneUse()) return false; if (V->getValueID() != FrontV->getValueID()) @@ -1869,25 +1865,25 @@ bool VectorCombine::foldShuffleToIdentity(Instruction &I) { // Check the operator is one that we support. We exclude div/rem in case // they hit UB from poison lanes. 
- if ((isa(FrontV) && - !cast(FrontV)->isIntDivRem()) || - isa(FrontV)) { + if ((isa(FrontU) && + !cast(FrontU)->isIntDivRem()) || + isa(FrontU)) { Worklist.push_back(generateInstLaneVectorFromOperand(Item, 0)); Worklist.push_back(generateInstLaneVectorFromOperand(Item, 1)); - } else if (isa(FrontV)) { + } else if (isa(FrontU)) { Worklist.push_back(generateInstLaneVectorFromOperand(Item, 0)); - } else if (isa(FrontV)) { + } else if (isa(FrontU)) { Worklist.push_back(generateInstLaneVectorFromOperand(Item, 0)); Worklist.push_back(generateInstLaneVectorFromOperand(Item, 1)); Worklist.push_back(generateInstLaneVectorFromOperand(Item, 2)); - } else if (auto *II = dyn_cast(FrontV); + } else if (auto *II = dyn_cast(FrontU); II && isTriviallyVectorizable(II->getIntrinsicID())) { for (unsigned Op = 0, E = II->getNumOperands() - 1; Op < E; Op++) { if (isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(), Op)) { if (!all_of(drop_begin(Item), [Item, Op](InstLane &IL) { - Value *FrontV = Item.front().first; - Value *V = IL.first; - return !V || (cast(V)->getOperand(Op) == + Value *FrontV = Item.front().first->get(); + Use *U = IL.first; + return !U || (cast(U->get())->getOperand(Op) == cast(FrontV)->getOperand(Op)); })) return false; diff --git a/llvm/test/CodeGen/AArch64/sitofp-to-tbl.ll b/llvm/test/CodeGen/AArch64/sitofp-to-tbl.ll new file mode 100644 index 00000000000000..3e708b0678fbcd --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sitofp-to-tbl.ll @@ -0,0 +1,276 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -verify-machineinstrs < %s | FileCheck %s + +target triple = "aarch64-linux" + +; CHECK-LABEL: .LCPI0_0: +; CHECK-NEXT: .byte 255 +; CHECK-NEXT: .byte 255 +; CHECK-NEXT: .byte 255 +; CHECK-NEXT: .byte 4 +; CHECK-NEXT: .byte 255 +; CHECK-NEXT: .byte 255 +; CHECK-NEXT: .byte 255 +; CHECK-NEXT: .byte 5 +; CHECK-NEXT: .byte 255 +; CHECK-NEXT: .byte 255 +; CHECK-NEXT: .byte 255 +; CHECK-NEXT: .byte 6 +; 
CHECK-NEXT: .byte 255 +; CHECK-NEXT: .byte 255 +; CHECK-NEXT: .byte 255 +; CHECK-NEXT: .byte 7 +; CHECK-NEXT: .LCPI0_1: +; CHECK-NEXT: .byte 255 +; CHECK-NEXT: .byte 255 +; CHECK-NEXT: .byte 255 +; CHECK-NEXT: .byte 0 +; CHECK-NEXT: .byte 255 +; CHECK-NEXT: .byte 255 +; CHECK-NEXT: .byte 255 +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 255 +; CHECK-NEXT: .byte 255 +; CHECK-NEXT: .byte 255 +; CHECK-NEXT: .byte 2 +; CHECK-NEXT: .byte 255 +; CHECK-NEXT: .byte 255 +; CHECK-NEXT: .byte 255 +; CHECK-NEXT: .byte 3 + +define void @sitofp_v8i8_to_v8f32(ptr %src, ptr %dst) { +; CHECK-LABEL: sitofp_v8i8_to_v8f32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: adrp x8, .LCPI0_0 +; CHECK-NEXT: adrp x9, .LCPI0_1 +; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI0_0] +; CHECK-NEXT: ldr q1, [x9, :lo12:.LCPI0_1] +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: .LBB0_1: // %loop +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: ldr d2, [x0, x8, lsl #3] +; CHECK-NEXT: add x9, x1, x8, lsl #5 +; CHECK-NEXT: add x8, x8, #1 +; CHECK-NEXT: cmp x8, #1000 +; CHECK-NEXT: tbl v3.16b, { v2.16b }, v0.16b +; CHECK-NEXT: tbl v2.16b, { v2.16b }, v1.16b +; CHECK-NEXT: scvtf v3.4s, v3.4s, #24 +; CHECK-NEXT: scvtf v2.4s, v2.4s, #24 +; CHECK-NEXT: stp q2, q3, [x9] +; CHECK-NEXT: b.eq .LBB0_1 +; CHECK-NEXT: // %bb.2: // %exit +; CHECK-NEXT: ret +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %gep.src = getelementptr inbounds <8 x i8>, ptr %src, i64 %iv + %l = load <8 x i8>, ptr %gep.src + %conv = sitofp <8 x i8> %l to <8 x float> + %gep.dst = getelementptr inbounds <8 x float>, ptr %dst, i64 %iv + store <8 x float> %conv, ptr %gep.dst + %iv.next = add i64 %iv, 1 + %ec = icmp eq i64 %iv.next, 1000 + br i1 %ec, label %loop, label %exit + +exit: + ret void +} + +; CHECK-LABEL: .LCPI1_0: +; CHECK-NEXT: .byte 255 +; CHECK-NEXT: .byte 255 +; CHECK-NEXT: .byte 255 +; CHECK-NEXT: .byte 12 +; CHECK-NEXT: .byte 255 +; CHECK-NEXT: .byte 255 +; CHECK-NEXT: .byte 255 +; 
CHECK-NEXT: .byte 13 +; CHECK-NEXT: .byte 255 +; CHECK-NEXT: .byte 255 +; CHECK-NEXT: .byte 255 +; CHECK-NEXT: .byte 14 +; CHECK-NEXT: .byte 255 +; CHECK-NEXT: .byte 255 +; CHECK-NEXT: .byte 255 +; CHECK-NEXT: .byte 15 +; CHECK-NEXT: .LCPI1_1: +; CHECK-NEXT: .byte 255 +; CHECK-NEXT: .byte 255 +; CHECK-NEXT: .byte 255 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .byte 255 +; CHECK-NEXT: .byte 255 +; CHECK-NEXT: .byte 255 +; CHECK-NEXT: .byte 9 +; CHECK-NEXT: .byte 255 +; CHECK-NEXT: .byte 255 +; CHECK-NEXT: .byte 255 +; CHECK-NEXT: .byte 10 +; CHECK-NEXT: .byte 255 +; CHECK-NEXT: .byte 255 +; CHECK-NEXT: .byte 255 +; CHECK-NEXT: .byte 11 +; CHECK-NEXT: .LCPI1_2: +; CHECK-NEXT: .byte 255 +; CHECK-NEXT: .byte 255 +; CHECK-NEXT: .byte 255 +; CHECK-NEXT: .byte 4 +; CHECK-NEXT: .byte 255 +; CHECK-NEXT: .byte 255 +; CHECK-NEXT: .byte 255 +; CHECK-NEXT: .byte 5 +; CHECK-NEXT: .byte 255 +; CHECK-NEXT: .byte 255 +; CHECK-NEXT: .byte 255 +; CHECK-NEXT: .byte 6 +; CHECK-NEXT: .byte 255 +; CHECK-NEXT: .byte 255 +; CHECK-NEXT: .byte 255 +; CHECK-NEXT: .byte 7 +; CHECK-NEXT: .LCPI1_3: +; CHECK-NEXT: .byte 255 +; CHECK-NEXT: .byte 255 +; CHECK-NEXT: .byte 255 +; CHECK-NEXT: .byte 0 +; CHECK-NEXT: .byte 255 +; CHECK-NEXT: .byte 255 +; CHECK-NEXT: .byte 255 +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 255 +; CHECK-NEXT: .byte 255 +; CHECK-NEXT: .byte 255 +; CHECK-NEXT: .byte 2 +; CHECK-NEXT: .byte 255 +; CHECK-NEXT: .byte 255 +; CHECK-NEXT: .byte 255 +; CHECK-NEXT: .byte 3 + +define void @sitofp_v16i8_to_v16f32(ptr %src, ptr %dst) { +; CHECK-LABEL: sitofp_v16i8_to_v16f32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: adrp x8, .LCPI1_0 +; CHECK-NEXT: adrp x9, .LCPI1_1 +; CHECK-NEXT: adrp x10, .LCPI1_2 +; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI1_0] +; CHECK-NEXT: adrp x8, .LCPI1_3 +; CHECK-NEXT: ldr q1, [x9, :lo12:.LCPI1_1] +; CHECK-NEXT: ldr q2, [x10, :lo12:.LCPI1_2] +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI1_3] +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: .LBB1_1: // %loop +; CHECK-NEXT: // =>This 
Inner Loop Header: Depth=1 +; CHECK-NEXT: ldr q4, [x0, x8, lsl #4] +; CHECK-NEXT: add x9, x1, x8, lsl #6 +; CHECK-NEXT: add x8, x8, #1 +; CHECK-NEXT: cmp x8, #1000 +; CHECK-NEXT: tbl v5.16b, { v4.16b }, v0.16b +; CHECK-NEXT: tbl v6.16b, { v4.16b }, v1.16b +; CHECK-NEXT: tbl v7.16b, { v4.16b }, v2.16b +; CHECK-NEXT: tbl v4.16b, { v4.16b }, v3.16b +; CHECK-NEXT: scvtf v5.4s, v5.4s, #24 +; CHECK-NEXT: scvtf v6.4s, v6.4s, #24 +; CHECK-NEXT: scvtf v7.4s, v7.4s, #24 +; CHECK-NEXT: scvtf v4.4s, v4.4s, #24 +; CHECK-NEXT: stp q6, q5, [x9, #32] +; CHECK-NEXT: stp q4, q7, [x9] +; CHECK-NEXT: b.eq .LBB1_1 +; CHECK-NEXT: // %bb.2: // %exit +; CHECK-NEXT: ret +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %gep.src = getelementptr inbounds <16 x i8>, ptr %src, i64 %iv + %l = load <16 x i8>, ptr %gep.src + %conv = sitofp <16 x i8> %l to <16 x float> + %gep.dst = getelementptr inbounds <16 x float>, ptr %dst, i64 %iv + store <16 x float> %conv, ptr %gep.dst + %iv.next = add i64 %iv, 1 + %ec = icmp eq i64 %iv.next, 1000 + br i1 %ec, label %loop, label %exit + +exit: + ret void +} + + +; Negative tests, currently we don't convert to f16/bf16 via `tbl`. 
+define void @sitofp_v8i8_to_v8f16(ptr %src, ptr %dst) { +; CHECK-LABEL: sitofp_v8i8_to_v8f16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: .LBB2_1: // %loop +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: ldr d0, [x0, x8, lsl #3] +; CHECK-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-NEXT: sshll v1.4s, v0.4h, #0 +; CHECK-NEXT: sshll2 v0.4s, v0.8h, #0 +; CHECK-NEXT: scvtf v1.4s, v1.4s +; CHECK-NEXT: scvtf v0.4s, v0.4s +; CHECK-NEXT: fcvtn v1.4h, v1.4s +; CHECK-NEXT: fcvtn2 v1.8h, v0.4s +; CHECK-NEXT: str q1, [x1, x8, lsl #4] +; CHECK-NEXT: add x8, x8, #1 +; CHECK-NEXT: cmp x8, #1000 +; CHECK-NEXT: b.eq .LBB2_1 +; CHECK-NEXT: // %bb.2: // %exit +; CHECK-NEXT: ret +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %gep.src = getelementptr inbounds <8 x i8>, ptr %src, i64 %iv + %l = load <8 x i8>, ptr %gep.src + %conv = sitofp <8 x i8> %l to <8 x half> + %gep.dst = getelementptr inbounds <8 x half>, ptr %dst, i64 %iv + store <8 x half> %conv, ptr %gep.dst + %iv.next = add i64 %iv, 1 + %ec = icmp eq i64 %iv.next, 1000 + br i1 %ec, label %loop, label %exit + +exit: + ret void +} + + +; Negative test, conversion to double with the help of `tbl` not implemented (TODO) +define void @sitofp_v2i8_to_v2f64(ptr %src, ptr %dst) { +; CHECK-LABEL: sitofp_v2i8_to_v2f64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: .LBB3_1: // %loop +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: add x9, x0, x8, lsl #1 +; CHECK-NEXT: ldrsb w10, [x9] +; CHECK-NEXT: ldrsb w9, [x9, #1] +; CHECK-NEXT: fmov s0, w10 +; CHECK-NEXT: mov v0.s[1], w9 +; CHECK-NEXT: sshll v0.2d, v0.2s, #0 +; CHECK-NEXT: scvtf v0.2d, v0.2d +; CHECK-NEXT: str q0, [x1, x8, lsl #4] +; CHECK-NEXT: add x8, x8, #1 +; CHECK-NEXT: cmp x8, #1000 +; CHECK-NEXT: b.eq .LBB3_1 +; CHECK-NEXT: // %bb.2: // %exit +; CHECK-NEXT: ret +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + 
%gep.src = getelementptr inbounds <2 x i8>, ptr %src, i64 %iv + %l = load <2 x i8>, ptr %gep.src + %conv = sitofp <2 x i8> %l to <2 x double> + %gep.dst = getelementptr inbounds <2 x double>, ptr %dst, i64 %iv + store <2 x double> %conv, ptr %gep.dst + %iv.next = add i64 %iv, 1 + %ec = icmp eq i64 %iv.next, 1000 + br i1 %ec, label %loop, label %exit + +exit: + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll index 4e21ef83793429..fff341b07881b2 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll @@ -237,24 +237,10 @@ define <2 x half> @global_atomic_fadd_ret_v2f16_agent_offset(ptr addrspace(1) %p ; GFX940-LABEL: global_atomic_fadd_ret_v2f16_agent_offset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:1024 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB17_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_pk_add_f16 v4, v5, v2 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:1024 sc0 +; GFX940-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:1024 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB17_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i32 256 %result = atomicrmw fadd ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst @@ -265,23 +251,10 @@ define void 
@global_atomic_fadd_noret_v2f16_agent_offset(ptr addrspace(1) %ptr, ; GFX940-LABEL: global_atomic_fadd_noret_v2f16_agent_offset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:1024 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB18_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_pk_add_f16 v4, v5, v2 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:1024 sc0 +; GFX940-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:1024 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB18_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i32 256 %unused = atomicrmw fadd ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst @@ -292,24 +265,10 @@ define <2 x half> @flat_atomic_fadd_ret_v2f16_agent_offset(ptr %ptr, <2 x half> ; GFX940-LABEL: flat_atomic_fadd_ret_v2f16_agent_offset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:1024 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB19_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_pk_add_f16 v4, v5, v2 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:1024 sc0 +; GFX940-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:1024 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB19_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr %ptr, i32 256 %result = atomicrmw fadd ptr %gep, <2 x half> %val syncscope("agent") seq_cst @@ -320,23 +279,10 @@ define void @flat_atomic_fadd_noret_v2f16_agent_offset(ptr %ptr, <2 x half> %val ; GFX940-LABEL: flat_atomic_fadd_noret_v2f16_agent_offset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v5, v[0:1] offset:1024 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB20_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_add_f16 v4, v5, v2 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:1024 sc0 +; GFX940-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 offset:1024 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB20_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr %ptr, i32 256 %unused = atomicrmw fadd ptr %gep, <2 x half> %val syncscope("agent") seq_cst diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll index cdfb71b9bf6b64..bc2a23966c9cac 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll @@ -10293,49 +10293,19 @@ define <2 x half> 
@flat_agent_atomic_fadd_ret_v2f16(ptr %ptr, <2 x half> %val) # ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB42_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_add_f16 v3, v4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB42_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fadd_ret_v2f16: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB42_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_pk_add_f16 v4, v5, v2 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 +; GFX940-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB42_1 -; GFX940-NEXT: ; %bb.2: ; 
%atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fadd_ret_v2f16: @@ -10510,49 +10480,19 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__offset12b_pos(ptr %ptr, <2 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB43_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_add_f16 v3, v4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB43_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fadd_ret_v2f16__offset12b_pos: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB43_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_pk_add_f16 v4, v5, v2 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 +; GFX940-NEXT: flat_atomic_pk_add_f16 v0, 
v[0:1], v2 offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB43_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fadd_ret_v2f16__offset12b_pos: @@ -10730,56 +10670,19 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__offset12b_neg(ptr %ptr, <2 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:-2048 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB44_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_add_f16 v3, v4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:-2048 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB44_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fadd_ret_v2f16__offset12b_neg: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v4, v0 -; GFX940-NEXT: v_mov_b32_e32 v5, v1 -; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v4 
-; GFX940-NEXT: s_movk_i32 s0, 0xf800 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v5, vcc -; GFX940-NEXT: flat_load_dword v0, v[0:1] -; GFX940-NEXT: s_mov_b32 s1, -1 -; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1] -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB44_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, v0 -; GFX940-NEXT: v_pk_add_f16 v0, v1, v2 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] sc0 +; GFX940-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:63488 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB44_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fadd_ret_v2f16__offset12b_neg: @@ -10968,46 +10871,19 @@ define void @flat_agent_atomic_fadd_noret_v2f16(ptr %ptr, <2 x half> %val) #0 { ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v4, v[0:1] -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB45_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_pk_add_f16 v3, v4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB45_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fadd_noret_v2f16: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v5, v[0:1] -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB45_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_add_f16 v4, v5, v2 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 +; GFX940-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB45_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fadd_noret_v2f16: @@ -11174,46 +11050,19 @@ define void @flat_agent_atomic_fadd_noret_v2f16__offset12b_pos(ptr %ptr, <2 x ha ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:2044 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB46_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_pk_add_f16 v3, v4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 offset:2044 +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV 
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB46_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fadd_noret_v2f16__offset12b_pos: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v5, v[0:1] offset:2044 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB46_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_add_f16 v4, v5, v2 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 +; GFX940-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 offset:2044 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB46_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fadd_noret_v2f16__offset12b_pos: @@ -11387,52 +11236,19 @@ define void @flat_agent_atomic_fadd_noret_v2f16__offset12b_neg(ptr %ptr, <2 x ha ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:-2048 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB47_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_pk_add_f16 v3, v4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], 
v[3:4] offset:-2048 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 offset:-2048 +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB47_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fadd_noret_v2f16__offset12b_neg: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 -; GFX940-NEXT: s_movk_i32 s0, 0xf800 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc -; GFX940-NEXT: flat_load_dword v5, v[4:5] -; GFX940-NEXT: s_mov_b32 s1, -1 -; GFX940-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB47_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_add_f16 v4, v5, v2 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 +; GFX940-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 offset:63488 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB47_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fadd_noret_v2f16__offset12b_neg: @@ -11618,49 +11434,19 @@ define <2 x half> 
@flat_system_atomic_fadd_ret_v2f16__offset12b_pos(ptr %ptr, <2 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB48_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_add_f16 v3, v4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB48_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fadd_ret_v2f16__offset12b_pos: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB48_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_pk_add_f16 v4, v5, v2 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 +; GFX940-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:2044 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: 
s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB48_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_system_atomic_fadd_ret_v2f16__offset12b_pos: @@ -11840,46 +11626,19 @@ define void @flat_system_atomic_fadd_noret_v2f16__offset12b_pos(ptr %ptr, <2 x h ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:2044 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB49_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_pk_add_f16 v3, v4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 offset:2044 +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB49_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fadd_noret_v2f16__offset12b_pos: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v5, v[0:1] offset:2044 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB49_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_add_f16 v4, v5, v2 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 +; GFX940-NEXT: 
flat_atomic_pk_add_f16 v[0:1], v2 offset:2044 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB49_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_system_atomic_fadd_noret_v2f16__offset12b_pos: @@ -12059,88 +11818,19 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16(ptr %ptr, <2 x bfloat> %v ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB50_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX12-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; 
GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 -; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB50_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fadd_ret_v2bf16: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] -; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX940-NEXT: s_movk_i32 s4, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB50_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX940-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX940-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; 
GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] -; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] sc0 +; GFX940-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 -; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB50_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fadd_ret_v2bf16: @@ -12403,88 +12093,19 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB51_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX12-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; 
GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 -; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB51_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 -; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX940-NEXT: s_movk_i32 s4, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB51_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX940-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX940-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX940-NEXT: v_add3_u32 
v6, v6, v3, s4 -; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] -; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 sc0 +; GFX940-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 -; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB51_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos: @@ -12750,95 +12371,19 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg(ptr %ptr, ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:-2048 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB52_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX12-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; 
GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 offset:-2048 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 -; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB52_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v4, v0 -; GFX940-NEXT: v_mov_b32_e32 v5, v1 -; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v4 -; GFX940-NEXT: s_movk_i32 s0, 0xf800 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v5, vcc -; GFX940-NEXT: flat_load_dword v0, v[0:1] -; GFX940-NEXT: s_mov_b32 s1, -1 -; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1] -; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX940-NEXT: s_movk_i32 s4, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; 
GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB52_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v0 -; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v7 -; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v7 -; GFX940-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX940-NEXT: v_add_f32_e32 v3, v3, v2 -; GFX940-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX940-NEXT: v_bfe_u32 v9, v3, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v3 -; GFX940-NEXT: v_add3_u32 v6, v6, v0, s4 -; GFX940-NEXT: v_add3_u32 v9, v9, v3, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v0, v0 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v0, v6, v8, s[0:1] -; GFX940-NEXT: v_perm_b32 v6, v3, v0, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v0, v[4:5], v[6:7] sc0 +; GFX940-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 offset:63488 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 -; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB52_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg: @@ -13115,85 +12660,19 @@ define void @flat_agent_atomic_fadd_noret_v2bf16(ptr %ptr, <2 x bfloat> %val) #0 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB53_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: 
s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX12-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB53_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fadd_noret_v2bf16: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] -; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX940-NEXT: s_movk_i32 s4, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v5, 
0xffff0000, v2 -; GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB53_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX940-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX940-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 -; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] -; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 +; GFX940-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB53_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fadd_noret_v2bf16: @@ -13448,85 +12927,19 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x b ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB54_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: 
s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX12-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 offset:2044 +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB54_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 -; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX940-NEXT: s_movk_i32 
s4, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB54_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX940-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX940-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 -; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] -; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 +; GFX940-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 offset:2044 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB54_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos: @@ -13788,91 +13201,19 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg(ptr %ptr, <2 x b ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:-2048 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB55_1: ; 
%atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX12-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:-2048 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 offset:-2048 +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB55_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 -; GFX940-NEXT: 
s_movk_i32 s0, 0xf800 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc -; GFX940-NEXT: flat_load_dword v3, v[4:5] -; GFX940-NEXT: s_mov_b32 s1, -1 -; GFX940-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] -; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX940-NEXT: s_movk_i32 s4, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB55_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX940-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX940-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 -; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] -; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 +; GFX940-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 offset:63488 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB55_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg: @@ -14146,88 +13487,19 @@ define <2 x bfloat> 
@flat_system_atomic_fadd_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB56_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX12-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 -; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: 
s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB56_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fadd_ret_v2bf16__offset12b_pos: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 -; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX940-NEXT: s_movk_i32 s4, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB56_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX940-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX940-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] -; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 sc0 sc1 +; GFX940-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 offset:2044 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 -; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz 
.LBB56_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_system_atomic_fadd_ret_v2bf16__offset12b_pos: @@ -14495,85 +13767,19 @@ define void @flat_system_atomic_fadd_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB57_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX12-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 offset:2044 +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; 
GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB57_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_system_atomic_fadd_noret_v2bf16__offset12b_pos: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 -; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX940-NEXT: s_movk_i32 s4, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB57_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX940-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX940-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 -; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] -; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1 +; GFX940-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 offset:2044 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX940-NEXT: s_or_b64 
s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB57_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_system_atomic_fadd_noret_v2bf16__offset12b_pos: diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll index 5f60c8ac2b3b71..77c8e034e68a0a 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll @@ -11207,49 +11207,19 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB42_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_add_f16 v3, v4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB42_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fadd_ret_v2f16: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off -; GFX940-NEXT: 
s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB42_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_pk_add_f16 v4, v5, v2 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 +; GFX940-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB42_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fadd_ret_v2f16: @@ -11305,23 +11275,9 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX90A-LABEL: global_agent_atomic_fadd_ret_v2f16: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB42_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc +; GFX90A-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB42_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fadd_ret_v2f16: @@ -11481,49 +11437,19 @@ define <2 x 
half> @global_agent_atomic_fadd_ret_v2f16__offset12b_pos(ptr addrspa ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB43_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_add_f16 v3, v4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB43_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_pos: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB43_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_pk_add_f16 v4, v5, v2 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 +; GFX940-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: 
s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB43_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_pos: @@ -11579,23 +11505,9 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__offset12b_pos(ptr addrspa ; GFX90A-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_pos: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB43_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc +; GFX90A-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB43_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_pos: @@ -11757,49 +11669,19 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__offset12b_neg(ptr addrspa ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB44_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; 
GFX12-NEXT: v_pk_add_f16 v3, v4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB44_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_neg: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB44_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_pk_add_f16 v4, v5, v2 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 +; GFX940-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:-2048 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB44_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_neg: @@ -11855,23 +11737,9 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__offset12b_neg(ptr addrspa ; GFX90A-LABEL: 
global_agent_atomic_fadd_ret_v2f16__offset12b_neg: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB44_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc +; GFX90A-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB44_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_neg: @@ -12037,46 +11905,19 @@ define void @global_agent_atomic_fadd_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v4, v[0:1], off -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB45_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_pk_add_f16 v3, v4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 
exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB45_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fadd_noret_v2f16: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v5, v[0:1], off -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB45_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_pk_add_f16 v4, v5, v2 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 +; GFX940-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB45_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fadd_noret_v2f16: @@ -12129,22 +11970,9 @@ define void @global_agent_atomic_fadd_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha ; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2f16: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v5, v[0:1], off -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB45_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc +; GFX90A-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; 
GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB45_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fadd_noret_v2f16: @@ -12298,46 +12126,19 @@ define void @global_agent_atomic_fadd_noret_v2f16__offset12b_pos(ptr addrspace(1 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:2044 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB46_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_pk_add_f16 v3, v4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:2044 +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB46_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_pos: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:2044 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB46_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_pk_add_f16 v4, v5, v2 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 +; GFX940-NEXT: global_atomic_pk_add_f16 v[0:1], v2, 
off offset:2044 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB46_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_pos: @@ -12390,22 +12191,9 @@ define void @global_agent_atomic_fadd_noret_v2f16__offset12b_pos(ptr addrspace(1 ; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_pos: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB46_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc +; GFX90A-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:2044 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB46_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_pos: @@ -12562,46 +12350,19 @@ define void @global_agent_atomic_fadd_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:-2048 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB47_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner 
Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_pk_add_f16 v3, v4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:-2048 +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB47_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_neg: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:-2048 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB47_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_pk_add_f16 v4, v5, v2 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 +; GFX940-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:-2048 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB47_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_neg: @@ -12654,22 +12415,9 @@ define void @global_agent_atomic_fadd_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX90A-LABEL: 
global_agent_atomic_fadd_noret_v2f16__offset12b_neg: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:-2048 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB47_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc +; GFX90A-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:-2048 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB47_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_neg: @@ -12834,49 +12582,19 @@ define <2 x half> @global_system_atomic_fadd_ret_v2f16__offset12b_pos(ptr addrsp ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB48_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_add_f16 v3, v4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB48_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fadd_ret_v2f16__offset12b_pos: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB48_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_pk_add_f16 v4, v5, v2 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 +; GFX940-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:2044 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB48_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_system_atomic_fadd_ret_v2f16__offset12b_pos: @@ -12932,25 +12650,11 @@ define <2 x half> @global_system_atomic_fadd_ret_v2f16__offset12b_pos(ptr addrsp ; GFX90A-LABEL: global_system_atomic_fadd_ret_v2f16__offset12b_pos: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB48_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v3, 
v[0:1], v[4:5], off offset:2044 glc +; GFX90A-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB48_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_system_atomic_fadd_ret_v2f16__offset12b_pos: @@ -13112,46 +12816,19 @@ define void @global_system_atomic_fadd_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:2044 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB49_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_pk_add_f16 v3, v4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:2044 +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB49_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fadd_noret_v2f16__offset12b_pos: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:2044 -; GFX940-NEXT: 
s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB49_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_pk_add_f16 v4, v5, v2 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 +; GFX940-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:2044 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB49_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_system_atomic_fadd_noret_v2f16__offset12b_pos: @@ -13204,24 +12881,11 @@ define void @global_system_atomic_fadd_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX90A-LABEL: global_system_atomic_fadd_noret_v2f16__offset12b_pos: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB49_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc +; GFX90A-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:2044 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB49_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; 
GFX908-LABEL: global_system_atomic_fadd_noret_v2f16__offset12b_pos: @@ -13382,88 +13046,19 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB50_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX12-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: 
v_cmp_eq_u32_e32 vcc_lo, v3, v6 -; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB50_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fadd_ret_v2bf16: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off -; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX940-NEXT: s_movk_i32 s4, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB50_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX940-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX940-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] -; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off sc0 +; GFX940-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 -; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: 
s_cbranch_execnz .LBB50_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fadd_ret_v2bf16: @@ -13779,88 +13374,19 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_pos(ptr addr ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB51_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX12-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 
th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 -; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB51_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_pos: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX940-NEXT: s_movk_i32 s4, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB51_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX940-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX940-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] -; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:2044 sc0 +; GFX940-NEXT: global_atomic_pk_add_bf16 
v0, v[0:1], v2, off offset:2044 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 -; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB51_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_pos: @@ -14178,88 +13704,19 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_neg(ptr addr ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB52_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX12-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX12-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 -; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB52_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_neg: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 -; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX940-NEXT: s_movk_i32 s4, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB52_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX940-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX940-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: 
v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] -; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:-2048 sc0 +; GFX940-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off offset:-2048 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 -; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB52_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_neg: @@ -14581,85 +14038,19 @@ define void @global_agent_atomic_fadd_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB53_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX12-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-NEXT: 
s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB53_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fadd_noret_v2bf16: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off -; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX940-NEXT: s_movk_i32 s4, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB53_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX940-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX940-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 -; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 -; GFX940-NEXT: 
s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] -; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 +; GFX940-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB53_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fadd_noret_v2bf16: @@ -14965,85 +14356,19 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB54_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX12-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-NEXT: 
s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off offset:2044 +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB54_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX940-NEXT: s_movk_i32 s4, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB54_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX940-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX940-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 -; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; 
GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] -; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 +; GFX940-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off offset:2044 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB54_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos: @@ -15352,85 +14677,19 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-2048 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB55_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX12-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; 
GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off offset:-2048 +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB55_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_neg: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-2048 -; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX940-NEXT: s_movk_i32 s4, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB55_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX940-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX940-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX940-NEXT: v_add3_u32 v7, v7, v2, 
s4 -; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] -; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0 +; GFX940-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off offset:-2048 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB55_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_neg: @@ -15747,88 +15006,19 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos(ptr add ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB56_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX12-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; 
GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 -; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB56_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fadd_ret_v2bf16__offset12b_pos: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX940-NEXT: s_movk_i32 s4, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB56_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX940-NEXT: 
v_add_f32_e32 v3, v3, v4 -; GFX940-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] -; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:2044 sc0 sc1 +; GFX940-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off offset:2044 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 -; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB56_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_system_atomic_fadd_ret_v2bf16__offset12b_pos: @@ -16148,85 +15338,19 @@ define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB57_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_add_f32_e32 v2, v2, v4 -; 
GFX12-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off offset:2044 +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB57_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2044 -; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX940-NEXT: s_movk_i32 s4, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB57_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; 
GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX940-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX940-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 -; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] -; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1 +; GFX940-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off offset:2044 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB57_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ptr.buffer.atomic.fadd_rtn_errors.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ptr.buffer.atomic.fadd_rtn_errors.ll index ee3397d01c4eea..a3b83c346c1bb5 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ptr.buffer.atomic.fadd_rtn_errors.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ptr.buffer.atomic.fadd_rtn_errors.ll @@ -4,14 +4,27 @@ ; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -filetype=null %t/struct-ret-f32-error.ll 2>&1 | FileCheck -check-prefix=ERR-STRUCT-F32-SDAG %s ; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 
-filetype=null %t/raw-ret-v2f16-error.ll 2>&1 | FileCheck -check-prefix=ERR-RAW-V2F16-SDAG %s ; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -filetype=null %t/struct-ret-v2f16-error.ll 2>&1 | FileCheck -check-prefix=ERR-STRUCT-V2F16-SDAG %s +; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -filetype=null %t/raw-ret-v2bf16-error.ll 2>&1 | FileCheck -check-prefix=ERR-RAW-V2BF16-SDAG %s +; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -filetype=null %t/struct-ret-v2bf16-error.ll 2>&1 | FileCheck -check-prefix=ERR-STRUCT-V2BF16-SDAG %s +; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -filetype=null %t/raw-ret-v2bf16-error.ll 2>&1 | FileCheck -check-prefix=ERR-RAW-V2BF16-SDAG %s +; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -filetype=null %t/struct-ret-v2bf16-error.ll 2>&1 | FileCheck -check-prefix=ERR-STRUCT-V2BF16-SDAG %s +; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -filetype=null %t/raw-ret-v2bf16-error.ll 2>&1 | FileCheck -check-prefix=ERR-RAW-V2BF16-SDAG %s +; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -filetype=null %t/struct-ret-v2bf16-error.ll 2>&1 | FileCheck -check-prefix=ERR-STRUCT-V2BF16-SDAG %s ; RUN: not --crash llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -filetype=null %t/raw-ret-f32-error.ll 2>&1 | FileCheck -check-prefix=ERR-RAW-F32-GISEL %s ; RUN: not --crash llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -filetype=null %t/struct-ret-f32-error.ll 2>&1 | FileCheck -check-prefix=ERR-STRUCT-F32-GISEL %s ; RUN: not --crash llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -filetype=null %t/raw-ret-v2f16-error.ll 2>&1 | FileCheck -check-prefix=ERR-RAW-V2F16-GISEL %s ; RUN: not --crash llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -filetype=null %t/struct-ret-v2f16-error.ll 2>&1 | FileCheck 
-check-prefix=ERR-STRUCT-V2F16-GISEL %s +; RUN: not --crash llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -filetype=null %t/raw-ret-v2bf16-error.ll 2>&1 | FileCheck -check-prefix=ERR-RAW-V2BF16-GISEL %s +; RUN: not --crash llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -filetype=null %t/struct-ret-v2bf16-error.ll 2>&1 | FileCheck -check-prefix=ERR-STRUCT-V2BF16-GISEL %s +; RUN: not --crash llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -filetype=null %t/raw-ret-v2bf16-error.ll 2>&1 | FileCheck -check-prefix=ERR-RAW-V2BF16-GISEL %s +; RUN: not --crash llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -filetype=null %t/struct-ret-v2bf16-error.ll 2>&1 | FileCheck -check-prefix=ERR-STRUCT-V2BF16-GISEL %s +; RUN: not --crash llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -filetype=null %t/raw-ret-v2bf16-error.ll 2>&1 | FileCheck -check-prefix=ERR-RAW-V2BF16-GISEL %s +; RUN: not --crash llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -filetype=null %t/struct-ret-v2bf16-error.ll 2>&1 | FileCheck -check-prefix=ERR-STRUCT-V2BF16-GISEL %s ; Make sure buffer fadd atomics with return values are not selected ; for gfx908 where they do not work. +; Check bf16 buffer fadd does not select on supported subtargets. 
;--- raw-ret-f32-error.ll ; ERR-RAW-F32-SDAG: LLVM ERROR: Cannot select: {{.+}}: f32,ch = BUFFER_ATOMIC_FADD @@ -49,4 +62,20 @@ define <2 x half> @struct_ptr_buffer_atomic_fadd_v2f16_rtn(<2 x half> %val, ptr ret <2 x half> %ret } +;--- raw-ret-v2bf16-error.ll +; ERR-RAW-V2BF16-SDAG: LLVM ERROR: Cannot select: {{.+}}: v2bf16,ch = BUFFER_ATOMIC_FADD +; ERR-RAW-V2BF16-GISEL: LLVM ERROR: cannot select: %{{[0-9]+}}:vgpr_32(<2 x s16>) = G_AMDGPU_BUFFER_ATOMIC_FADD +define <2 x bfloat> @raw_ptr_buffer_atomic_fadd_v2bf16_rtn(<2 x bfloat> %val, <4 x i32> inreg %rsrc, i32 inreg %soffset) { + %ret = call <2 x bfloat> @llvm.amdgcn.raw.buffer.atomic.fadd.v2bf16(<2 x bfloat> %val, <4 x i32> %rsrc, i32 0, i32 %soffset, i32 0) + ret <2 x bfloat> %ret +} + +;--- struct-ret-v2bf16-error.ll +; ERR-STRUCT-V2BF16-SDAG: LLVM ERROR: Cannot select: {{.+}}: v2bf16,ch = BUFFER_ATOMIC_FADD +; ERR-STRUCT-V2BF16-GISEL: LLVM ERROR: cannot select: %{{[0-9]+}}:vgpr_32(<2 x s16>) = G_AMDGPU_BUFFER_ATOMIC_FADD + +define <2 x bfloat> @struct_ptr_buffer_atomic_fadd_v2bf16_rtn(<2 x bfloat> %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { + %ret = call <2 x bfloat> @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2bf16(<2 x bfloat> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) + ret <2 x bfloat> %ret +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2bf16.ll new file mode 100644 index 00000000000000..e0e4f950cc16c2 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2bf16.ll @@ -0,0 +1,116 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; FIXME: Check gfx90a, 940. 908 should fail to select. 
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX1200 %s + +define <2 x bfloat> @struct_ptr_buffer_atomic_add_v2bf16_rtn__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<2 x bfloat> %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) #0 { +; GFX1200-LABEL: struct_ptr_buffer_atomic_add_v2bf16_rtn__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: +; GFX1200: ; %bb.0: +; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-NEXT: s_wait_expcnt 0x0 +; GFX1200-NEXT: s_wait_samplecnt 0x0 +; GFX1200-NEXT: s_wait_bvhcnt 0x0 +; GFX1200-NEXT: s_wait_kmcnt 0x0 +; GFX1200-NEXT: buffer_atomic_pk_add_bf16 v0, v[1:2], s[0:3], s4 idxen offen th:TH_ATOMIC_RETURN +; GFX1200-NEXT: s_wait_loadcnt 0x0 +; GFX1200-NEXT: s_setpc_b64 s[30:31] + %ret = call <2 x bfloat> @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2bf16(<2 x bfloat> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) + ret <2 x bfloat> %ret +} + +define void @struct_ptr_buffer_atomic_add_v2bf16_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<2 x bfloat> %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) #0 { +; GFX1200-LABEL: struct_ptr_buffer_atomic_add_v2bf16_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset: +; GFX1200: ; %bb.0: +; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-NEXT: s_wait_expcnt 0x0 +; GFX1200-NEXT: s_wait_samplecnt 0x0 +; GFX1200-NEXT: s_wait_bvhcnt 0x0 +; GFX1200-NEXT: s_wait_kmcnt 0x0 +; GFX1200-NEXT: buffer_atomic_pk_add_bf16 v0, v[1:2], s[0:3], s4 idxen offen +; GFX1200-NEXT: s_setpc_b64 s[30:31] + %unused = call <2 x bfloat> @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2bf16(<2 x bfloat> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) + ret void +} + +; Test waterfall loop +define <2 x bfloat> @struct_ptr_buffer_atomic_add_v2bf16_rtn__vgpr_val__vgpr_rsrc__vgpr_voffset__vgpr_soffset(<2 x bfloat> %val, ptr addrspace(8) %rsrc, i32 
%vindex, i32 %voffset, i32 %soffset) #0 { +; GFX1200-LABEL: struct_ptr_buffer_atomic_add_v2bf16_rtn__vgpr_val__vgpr_rsrc__vgpr_voffset__vgpr_soffset: +; GFX1200: ; %bb.0: +; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-NEXT: s_wait_expcnt 0x0 +; GFX1200-NEXT: s_wait_samplecnt 0x0 +; GFX1200-NEXT: s_wait_bvhcnt 0x0 +; GFX1200-NEXT: s_wait_kmcnt 0x0 +; GFX1200-NEXT: s_mov_b32 s2, exec_lo +; GFX1200-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 +; GFX1200-NEXT: v_readfirstlane_b32 s4, v1 +; GFX1200-NEXT: v_readfirstlane_b32 s5, v2 +; GFX1200-NEXT: v_readfirstlane_b32 s6, v3 +; GFX1200-NEXT: v_readfirstlane_b32 s7, v4 +; GFX1200-NEXT: v_readfirstlane_b32 s3, v7 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1200-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2] +; GFX1200-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4] +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1200-NEXT: v_cmp_eq_u32_e64 s1, s3, v7 +; GFX1200-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1200-NEXT: s_and_b32 s0, s0, s1 +; GFX1200-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1200-NEXT: s_and_saveexec_b32 s0, s0 +; GFX1200-NEXT: s_wait_loadcnt 0x0 +; GFX1200-NEXT: buffer_atomic_pk_add_bf16 v0, v[5:6], s[4:7], s3 idxen offen th:TH_ATOMIC_RETURN +; GFX1200-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4 +; GFX1200-NEXT: ; implicit-def: $vgpr7 +; GFX1200-NEXT: ; implicit-def: $vgpr5_vgpr6 +; GFX1200-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX1200-NEXT: s_cbranch_execnz .LBB2_1 +; GFX1200-NEXT: ; %bb.2: +; GFX1200-NEXT: s_mov_b32 exec_lo, s2 +; GFX1200-NEXT: s_wait_loadcnt 0x0 +; GFX1200-NEXT: s_setpc_b64 s[30:31] + %ret = call <2 x bfloat> @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2bf16(<2 x bfloat> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) + ret <2 x bfloat> %ret +} + +define void 
@struct_ptr_buffer_atomic_add_v2bf16_noret__vgpr_val__vgpr_rsrc__vgpr_voffset__vgpr_soffset(<2 x bfloat> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset) #0 { +; GFX1200-LABEL: struct_ptr_buffer_atomic_add_v2bf16_noret__vgpr_val__vgpr_rsrc__vgpr_voffset__vgpr_soffset: +; GFX1200: ; %bb.0: +; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-NEXT: s_wait_expcnt 0x0 +; GFX1200-NEXT: s_wait_samplecnt 0x0 +; GFX1200-NEXT: s_wait_bvhcnt 0x0 +; GFX1200-NEXT: s_wait_kmcnt 0x0 +; GFX1200-NEXT: s_mov_b32 s2, exec_lo +; GFX1200-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1200-NEXT: v_readfirstlane_b32 s4, v1 +; GFX1200-NEXT: v_readfirstlane_b32 s5, v2 +; GFX1200-NEXT: v_readfirstlane_b32 s6, v3 +; GFX1200-NEXT: v_readfirstlane_b32 s7, v4 +; GFX1200-NEXT: v_readfirstlane_b32 s3, v7 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1200-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2] +; GFX1200-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4] +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1200-NEXT: v_cmp_eq_u32_e64 s1, s3, v7 +; GFX1200-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1200-NEXT: s_and_b32 s0, s0, s1 +; GFX1200-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1200-NEXT: s_and_saveexec_b32 s0, s0 +; GFX1200-NEXT: buffer_atomic_pk_add_bf16 v0, v[5:6], s[4:7], s3 idxen offen +; GFX1200-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4 +; GFX1200-NEXT: ; implicit-def: $vgpr7 +; GFX1200-NEXT: ; implicit-def: $vgpr0 +; GFX1200-NEXT: ; implicit-def: $vgpr5_vgpr6 +; GFX1200-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX1200-NEXT: s_cbranch_execnz .LBB3_1 +; GFX1200-NEXT: ; %bb.2: +; GFX1200-NEXT: s_mov_b32 exec_lo, s2 +; GFX1200-NEXT: s_setpc_b64 s[30:31] + %ret = call <2 x bfloat> @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2bf16(<2 x bfloat> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, 
i32 %soffset, i32 0) + ret void +} + +declare <2 x bfloat> @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2bf16(<2 x bfloat>, ptr addrspace(8), i32, i32, i32, i32 immarg) + +attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-expandload-fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-expandload-fp.ll index 48e820243c9578..8b31166e313deb 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-expandload-fp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-expandload-fp.ll @@ -44,15 +44,16 @@ define <2 x half> @expandload_v2f16(ptr %base, <2 x half> %src0, <2 x i1> %mask) ; RV32-NEXT: ret ; RV32-NEXT: .LBB1_3: # %cond.load ; RV32-NEXT: flh fa5, 0(a0) -; RV32-NEXT: vsetivli zero, 2, e16, m2, tu, ma +; RV32-NEXT: vsetvli zero, zero, e16, m2, tu, ma ; RV32-NEXT: vfmv.s.f v8, fa5 ; RV32-NEXT: addi a0, a0, 2 ; RV32-NEXT: andi a1, a1, 2 ; RV32-NEXT: beqz a1, .LBB1_2 ; RV32-NEXT: .LBB1_4: # %cond.load1 ; RV32-NEXT: flh fa5, 0(a0) -; RV32-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; RV32-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; RV32-NEXT: vfmv.s.f v9, fa5 +; RV32-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; RV32-NEXT: vslideup.vi v8, v9, 1 ; RV32-NEXT: ret ; @@ -69,15 +70,16 @@ define <2 x half> @expandload_v2f16(ptr %base, <2 x half> %src0, <2 x i1> %mask) ; RV64-NEXT: ret ; RV64-NEXT: .LBB1_3: # %cond.load ; RV64-NEXT: flh fa5, 0(a0) -; RV64-NEXT: vsetivli zero, 2, e16, m2, tu, ma +; RV64-NEXT: vsetvli zero, zero, e16, m2, tu, ma ; RV64-NEXT: vfmv.s.f v8, fa5 ; RV64-NEXT: addi a0, a0, 2 ; RV64-NEXT: andi a1, a1, 2 ; RV64-NEXT: beqz a1, .LBB1_2 ; RV64-NEXT: .LBB1_4: # %cond.load1 ; RV64-NEXT: flh fa5, 0(a0) -; RV64-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; RV64-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; RV64-NEXT: vfmv.s.f v9, fa5 +; RV64-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; RV64-NEXT: vslideup.vi v8, v9, 1 ; RV64-NEXT: ret %res = call <2 x half> @llvm.masked.expandload.v2f16(ptr align 2 %base, <2 x i1> %mask, <2 x half> %src0) @@ 
-105,15 +107,16 @@ define <4 x half> @expandload_v4f16(ptr %base, <4 x half> %src0, <4 x i1> %mask) ; RV32-NEXT: ret ; RV32-NEXT: .LBB2_5: # %cond.load ; RV32-NEXT: flh fa5, 0(a0) -; RV32-NEXT: vsetivli zero, 4, e16, m2, tu, ma +; RV32-NEXT: vsetvli zero, zero, e16, m2, tu, ma ; RV32-NEXT: vfmv.s.f v8, fa5 ; RV32-NEXT: addi a0, a0, 2 ; RV32-NEXT: andi a2, a1, 2 ; RV32-NEXT: beqz a2, .LBB2_2 ; RV32-NEXT: .LBB2_6: # %cond.load1 ; RV32-NEXT: flh fa5, 0(a0) -; RV32-NEXT: vsetivli zero, 2, e16, mf2, tu, ma +; RV32-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; RV32-NEXT: vfmv.s.f v9, fa5 +; RV32-NEXT: vsetivli zero, 2, e16, mf2, tu, ma ; RV32-NEXT: vslideup.vi v8, v9, 1 ; RV32-NEXT: addi a0, a0, 2 ; RV32-NEXT: andi a2, a1, 4 @@ -152,15 +155,16 @@ define <4 x half> @expandload_v4f16(ptr %base, <4 x half> %src0, <4 x i1> %mask) ; RV64-NEXT: ret ; RV64-NEXT: .LBB2_5: # %cond.load ; RV64-NEXT: flh fa5, 0(a0) -; RV64-NEXT: vsetivli zero, 4, e16, m2, tu, ma +; RV64-NEXT: vsetvli zero, zero, e16, m2, tu, ma ; RV64-NEXT: vfmv.s.f v8, fa5 ; RV64-NEXT: addi a0, a0, 2 ; RV64-NEXT: andi a2, a1, 2 ; RV64-NEXT: beqz a2, .LBB2_2 ; RV64-NEXT: .LBB2_6: # %cond.load1 ; RV64-NEXT: flh fa5, 0(a0) -; RV64-NEXT: vsetivli zero, 2, e16, mf2, tu, ma +; RV64-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; RV64-NEXT: vfmv.s.f v9, fa5 +; RV64-NEXT: vsetivli zero, 2, e16, mf2, tu, ma ; RV64-NEXT: vslideup.vi v8, v9, 1 ; RV64-NEXT: addi a0, a0, 2 ; RV64-NEXT: andi a2, a1, 4 @@ -216,15 +220,16 @@ define <8 x half> @expandload_v8f16(ptr %base, <8 x half> %src0, <8 x i1> %mask) ; RV32-NEXT: ret ; RV32-NEXT: .LBB3_9: # %cond.load ; RV32-NEXT: flh fa5, 0(a0) -; RV32-NEXT: vsetivli zero, 8, e16, m2, tu, ma +; RV32-NEXT: vsetvli zero, zero, e16, m2, tu, ma ; RV32-NEXT: vfmv.s.f v8, fa5 ; RV32-NEXT: addi a0, a0, 2 ; RV32-NEXT: andi a2, a1, 2 ; RV32-NEXT: beqz a2, .LBB3_2 ; RV32-NEXT: .LBB3_10: # %cond.load1 ; RV32-NEXT: flh fa5, 0(a0) -; RV32-NEXT: vsetivli zero, 2, e16, m1, tu, ma +; RV32-NEXT: vsetvli zero, 
zero, e16, m2, ta, ma ; RV32-NEXT: vfmv.s.f v9, fa5 +; RV32-NEXT: vsetivli zero, 2, e16, m1, tu, ma ; RV32-NEXT: vslideup.vi v8, v9, 1 ; RV32-NEXT: addi a0, a0, 2 ; RV32-NEXT: andi a2, a1, 4 @@ -307,15 +312,16 @@ define <8 x half> @expandload_v8f16(ptr %base, <8 x half> %src0, <8 x i1> %mask) ; RV64-NEXT: ret ; RV64-NEXT: .LBB3_9: # %cond.load ; RV64-NEXT: flh fa5, 0(a0) -; RV64-NEXT: vsetivli zero, 8, e16, m2, tu, ma +; RV64-NEXT: vsetvli zero, zero, e16, m2, tu, ma ; RV64-NEXT: vfmv.s.f v8, fa5 ; RV64-NEXT: addi a0, a0, 2 ; RV64-NEXT: andi a2, a1, 2 ; RV64-NEXT: beqz a2, .LBB3_2 ; RV64-NEXT: .LBB3_10: # %cond.load1 ; RV64-NEXT: flh fa5, 0(a0) -; RV64-NEXT: vsetivli zero, 2, e16, m1, tu, ma +; RV64-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; RV64-NEXT: vfmv.s.f v9, fa5 +; RV64-NEXT: vsetivli zero, 2, e16, m1, tu, ma ; RV64-NEXT: vslideup.vi v8, v9, 1 ; RV64-NEXT: addi a0, a0, 2 ; RV64-NEXT: andi a2, a1, 4 @@ -412,15 +418,16 @@ define <2 x float> @expandload_v2f32(ptr %base, <2 x float> %src0, <2 x i1> %mas ; RV32-NEXT: ret ; RV32-NEXT: .LBB5_3: # %cond.load ; RV32-NEXT: flw fa5, 0(a0) -; RV32-NEXT: vsetivli zero, 2, e32, m4, tu, ma +; RV32-NEXT: vsetvli zero, zero, e32, m4, tu, ma ; RV32-NEXT: vfmv.s.f v8, fa5 ; RV32-NEXT: addi a0, a0, 4 ; RV32-NEXT: andi a1, a1, 2 ; RV32-NEXT: beqz a1, .LBB5_2 ; RV32-NEXT: .LBB5_4: # %cond.load1 ; RV32-NEXT: flw fa5, 0(a0) -; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV32-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; RV32-NEXT: vfmv.s.f v9, fa5 +; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; RV32-NEXT: vslideup.vi v8, v9, 1 ; RV32-NEXT: ret ; @@ -437,15 +444,16 @@ define <2 x float> @expandload_v2f32(ptr %base, <2 x float> %src0, <2 x i1> %mas ; RV64-NEXT: ret ; RV64-NEXT: .LBB5_3: # %cond.load ; RV64-NEXT: flw fa5, 0(a0) -; RV64-NEXT: vsetivli zero, 2, e32, m4, tu, ma +; RV64-NEXT: vsetvli zero, zero, e32, m4, tu, ma ; RV64-NEXT: vfmv.s.f v8, fa5 ; RV64-NEXT: addi a0, a0, 4 ; RV64-NEXT: andi a1, a1, 2 ; RV64-NEXT: beqz 
a1, .LBB5_2 ; RV64-NEXT: .LBB5_4: # %cond.load1 ; RV64-NEXT: flw fa5, 0(a0) -; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV64-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; RV64-NEXT: vfmv.s.f v9, fa5 +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; RV64-NEXT: vslideup.vi v8, v9, 1 ; RV64-NEXT: ret %res = call <2 x float> @llvm.masked.expandload.v2f32(ptr align 4 %base, <2 x i1> %mask, <2 x float> %src0) @@ -473,15 +481,16 @@ define <4 x float> @expandload_v4f32(ptr %base, <4 x float> %src0, <4 x i1> %mas ; RV32-NEXT: ret ; RV32-NEXT: .LBB6_5: # %cond.load ; RV32-NEXT: flw fa5, 0(a0) -; RV32-NEXT: vsetivli zero, 4, e32, m4, tu, ma +; RV32-NEXT: vsetvli zero, zero, e32, m4, tu, ma ; RV32-NEXT: vfmv.s.f v8, fa5 ; RV32-NEXT: addi a0, a0, 4 ; RV32-NEXT: andi a2, a1, 2 ; RV32-NEXT: beqz a2, .LBB6_2 ; RV32-NEXT: .LBB6_6: # %cond.load1 ; RV32-NEXT: flw fa5, 0(a0) -; RV32-NEXT: vsetivli zero, 2, e32, m1, tu, ma +; RV32-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; RV32-NEXT: vfmv.s.f v9, fa5 +; RV32-NEXT: vsetivli zero, 2, e32, m1, tu, ma ; RV32-NEXT: vslideup.vi v8, v9, 1 ; RV32-NEXT: addi a0, a0, 4 ; RV32-NEXT: andi a2, a1, 4 @@ -520,15 +529,16 @@ define <4 x float> @expandload_v4f32(ptr %base, <4 x float> %src0, <4 x i1> %mas ; RV64-NEXT: ret ; RV64-NEXT: .LBB6_5: # %cond.load ; RV64-NEXT: flw fa5, 0(a0) -; RV64-NEXT: vsetivli zero, 4, e32, m4, tu, ma +; RV64-NEXT: vsetvli zero, zero, e32, m4, tu, ma ; RV64-NEXT: vfmv.s.f v8, fa5 ; RV64-NEXT: addi a0, a0, 4 ; RV64-NEXT: andi a2, a1, 2 ; RV64-NEXT: beqz a2, .LBB6_2 ; RV64-NEXT: .LBB6_6: # %cond.load1 ; RV64-NEXT: flw fa5, 0(a0) -; RV64-NEXT: vsetivli zero, 2, e32, m1, tu, ma +; RV64-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; RV64-NEXT: vfmv.s.f v9, fa5 +; RV64-NEXT: vsetivli zero, 2, e32, m1, tu, ma ; RV64-NEXT: vslideup.vi v8, v9, 1 ; RV64-NEXT: addi a0, a0, 4 ; RV64-NEXT: andi a2, a1, 4 @@ -584,15 +594,16 @@ define <8 x float> @expandload_v8f32(ptr %base, <8 x float> %src0, <8 x i1> %mas ; RV32-NEXT: ret ; 
RV32-NEXT: .LBB7_9: # %cond.load ; RV32-NEXT: flw fa5, 0(a0) -; RV32-NEXT: vsetivli zero, 8, e32, m4, tu, ma +; RV32-NEXT: vsetvli zero, zero, e32, m4, tu, ma ; RV32-NEXT: vfmv.s.f v8, fa5 ; RV32-NEXT: addi a0, a0, 4 ; RV32-NEXT: andi a2, a1, 2 ; RV32-NEXT: beqz a2, .LBB7_2 ; RV32-NEXT: .LBB7_10: # %cond.load1 ; RV32-NEXT: flw fa5, 0(a0) -; RV32-NEXT: vsetivli zero, 2, e32, m1, tu, ma +; RV32-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; RV32-NEXT: vfmv.s.f v10, fa5 +; RV32-NEXT: vsetivli zero, 2, e32, m1, tu, ma ; RV32-NEXT: vslideup.vi v8, v10, 1 ; RV32-NEXT: addi a0, a0, 4 ; RV32-NEXT: andi a2, a1, 4 @@ -675,15 +686,16 @@ define <8 x float> @expandload_v8f32(ptr %base, <8 x float> %src0, <8 x i1> %mas ; RV64-NEXT: ret ; RV64-NEXT: .LBB7_9: # %cond.load ; RV64-NEXT: flw fa5, 0(a0) -; RV64-NEXT: vsetivli zero, 8, e32, m4, tu, ma +; RV64-NEXT: vsetvli zero, zero, e32, m4, tu, ma ; RV64-NEXT: vfmv.s.f v8, fa5 ; RV64-NEXT: addi a0, a0, 4 ; RV64-NEXT: andi a2, a1, 2 ; RV64-NEXT: beqz a2, .LBB7_2 ; RV64-NEXT: .LBB7_10: # %cond.load1 ; RV64-NEXT: flw fa5, 0(a0) -; RV64-NEXT: vsetivli zero, 2, e32, m1, tu, ma +; RV64-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; RV64-NEXT: vfmv.s.f v10, fa5 +; RV64-NEXT: vsetivli zero, 2, e32, m1, tu, ma ; RV64-NEXT: vslideup.vi v8, v10, 1 ; RV64-NEXT: addi a0, a0, 4 ; RV64-NEXT: andi a2, a1, 4 @@ -780,15 +792,16 @@ define <2 x double> @expandload_v2f64(ptr %base, <2 x double> %src0, <2 x i1> %m ; RV32-NEXT: ret ; RV32-NEXT: .LBB9_3: # %cond.load ; RV32-NEXT: fld fa5, 0(a0) -; RV32-NEXT: vsetivli zero, 2, e64, m8, tu, ma +; RV32-NEXT: vsetvli zero, zero, e64, m8, tu, ma ; RV32-NEXT: vfmv.s.f v8, fa5 ; RV32-NEXT: addi a0, a0, 8 ; RV32-NEXT: andi a1, a1, 2 ; RV32-NEXT: beqz a1, .LBB9_2 ; RV32-NEXT: .LBB9_4: # %cond.load1 ; RV32-NEXT: fld fa5, 0(a0) -; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, ma ; RV32-NEXT: vfmv.s.f v9, fa5 +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV32-NEXT: 
vslideup.vi v8, v9, 1 ; RV32-NEXT: ret ; @@ -805,15 +818,16 @@ define <2 x double> @expandload_v2f64(ptr %base, <2 x double> %src0, <2 x i1> %m ; RV64-NEXT: ret ; RV64-NEXT: .LBB9_3: # %cond.load ; RV64-NEXT: fld fa5, 0(a0) -; RV64-NEXT: vsetivli zero, 2, e64, m8, tu, ma +; RV64-NEXT: vsetvli zero, zero, e64, m8, tu, ma ; RV64-NEXT: vfmv.s.f v8, fa5 ; RV64-NEXT: addi a0, a0, 8 ; RV64-NEXT: andi a1, a1, 2 ; RV64-NEXT: beqz a1, .LBB9_2 ; RV64-NEXT: .LBB9_4: # %cond.load1 ; RV64-NEXT: fld fa5, 0(a0) -; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma ; RV64-NEXT: vfmv.s.f v9, fa5 +; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV64-NEXT: vslideup.vi v8, v9, 1 ; RV64-NEXT: ret %res = call <2 x double> @llvm.masked.expandload.v2f64(ptr align 8 %base, <2 x i1> %mask, <2 x double> %src0) @@ -841,15 +855,16 @@ define <4 x double> @expandload_v4f64(ptr %base, <4 x double> %src0, <4 x i1> %m ; RV32-NEXT: ret ; RV32-NEXT: .LBB10_5: # %cond.load ; RV32-NEXT: fld fa5, 0(a0) -; RV32-NEXT: vsetivli zero, 4, e64, m8, tu, ma +; RV32-NEXT: vsetvli zero, zero, e64, m8, tu, ma ; RV32-NEXT: vfmv.s.f v8, fa5 ; RV32-NEXT: addi a0, a0, 8 ; RV32-NEXT: andi a2, a1, 2 ; RV32-NEXT: beqz a2, .LBB10_2 ; RV32-NEXT: .LBB10_6: # %cond.load1 ; RV32-NEXT: fld fa5, 0(a0) -; RV32-NEXT: vsetivli zero, 2, e64, m1, tu, ma +; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, ma ; RV32-NEXT: vfmv.s.f v10, fa5 +; RV32-NEXT: vsetivli zero, 2, e64, m1, tu, ma ; RV32-NEXT: vslideup.vi v8, v10, 1 ; RV32-NEXT: addi a0, a0, 8 ; RV32-NEXT: andi a2, a1, 4 @@ -888,15 +903,16 @@ define <4 x double> @expandload_v4f64(ptr %base, <4 x double> %src0, <4 x i1> %m ; RV64-NEXT: ret ; RV64-NEXT: .LBB10_5: # %cond.load ; RV64-NEXT: fld fa5, 0(a0) -; RV64-NEXT: vsetivli zero, 4, e64, m8, tu, ma +; RV64-NEXT: vsetvli zero, zero, e64, m8, tu, ma ; RV64-NEXT: vfmv.s.f v8, fa5 ; RV64-NEXT: addi a0, a0, 8 ; RV64-NEXT: andi a2, a1, 2 ; RV64-NEXT: beqz a2, .LBB10_2 ; RV64-NEXT: .LBB10_6: # 
%cond.load1 ; RV64-NEXT: fld fa5, 0(a0) -; RV64-NEXT: vsetivli zero, 2, e64, m1, tu, ma +; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma ; RV64-NEXT: vfmv.s.f v10, fa5 +; RV64-NEXT: vsetivli zero, 2, e64, m1, tu, ma ; RV64-NEXT: vslideup.vi v8, v10, 1 ; RV64-NEXT: addi a0, a0, 8 ; RV64-NEXT: andi a2, a1, 4 @@ -952,15 +968,16 @@ define <8 x double> @expandload_v8f64(ptr %base, <8 x double> %src0, <8 x i1> %m ; RV32-NEXT: ret ; RV32-NEXT: .LBB11_9: # %cond.load ; RV32-NEXT: fld fa5, 0(a0) -; RV32-NEXT: vsetivli zero, 8, e64, m8, tu, ma +; RV32-NEXT: vsetvli zero, zero, e64, m8, tu, ma ; RV32-NEXT: vfmv.s.f v8, fa5 ; RV32-NEXT: addi a0, a0, 8 ; RV32-NEXT: andi a2, a1, 2 ; RV32-NEXT: beqz a2, .LBB11_2 ; RV32-NEXT: .LBB11_10: # %cond.load1 ; RV32-NEXT: fld fa5, 0(a0) -; RV32-NEXT: vsetivli zero, 2, e64, m1, tu, ma +; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, ma ; RV32-NEXT: vfmv.s.f v12, fa5 +; RV32-NEXT: vsetivli zero, 2, e64, m1, tu, ma ; RV32-NEXT: vslideup.vi v8, v12, 1 ; RV32-NEXT: addi a0, a0, 8 ; RV32-NEXT: andi a2, a1, 4 @@ -1043,15 +1060,16 @@ define <8 x double> @expandload_v8f64(ptr %base, <8 x double> %src0, <8 x i1> %m ; RV64-NEXT: ret ; RV64-NEXT: .LBB11_9: # %cond.load ; RV64-NEXT: fld fa5, 0(a0) -; RV64-NEXT: vsetivli zero, 8, e64, m8, tu, ma +; RV64-NEXT: vsetvli zero, zero, e64, m8, tu, ma ; RV64-NEXT: vfmv.s.f v8, fa5 ; RV64-NEXT: addi a0, a0, 8 ; RV64-NEXT: andi a2, a1, 2 ; RV64-NEXT: beqz a2, .LBB11_2 ; RV64-NEXT: .LBB11_10: # %cond.load1 ; RV64-NEXT: fld fa5, 0(a0) -; RV64-NEXT: vsetivli zero, 2, e64, m1, tu, ma +; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma ; RV64-NEXT: vfmv.s.f v12, fa5 +; RV64-NEXT: vsetivli zero, 2, e64, m1, tu, ma ; RV64-NEXT: vslideup.vi v8, v12, 1 ; RV64-NEXT: addi a0, a0, 8 ; RV64-NEXT: andi a2, a1, 4 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-expandload-int.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-expandload-int.ll index d6aca55fbde59d..5bf8b07efc1daf 100644 --- 
a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-expandload-int.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-expandload-int.ll @@ -33,15 +33,16 @@ define <2 x i8> @expandload_v2i8(ptr %base, <2 x i8> %src0, <2 x i1> %mask) { ; CHECK-NEXT: ret ; CHECK-NEXT: .LBB1_3: # %cond.load ; CHECK-NEXT: lbu a2, 0(a0) -; CHECK-NEXT: vsetivli zero, 2, e8, m1, tu, ma +; CHECK-NEXT: vsetvli zero, zero, e8, m1, tu, ma ; CHECK-NEXT: vmv.s.x v8, a2 ; CHECK-NEXT: addi a0, a0, 1 ; CHECK-NEXT: andi a1, a1, 2 ; CHECK-NEXT: beqz a1, .LBB1_2 ; CHECK-NEXT: .LBB1_4: # %cond.load1 ; CHECK-NEXT: lbu a0, 0(a0) -; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, ma ; CHECK-NEXT: vmv.s.x v9, a0 +; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma ; CHECK-NEXT: vslideup.vi v8, v9, 1 ; CHECK-NEXT: ret %res = call <2 x i8> @llvm.masked.expandload.v2i8(ptr %base, <2 x i1> %mask, <2 x i8> %src0) @@ -69,15 +70,16 @@ define <4 x i8> @expandload_v4i8(ptr %base, <4 x i8> %src0, <4 x i1> %mask) { ; CHECK-NEXT: ret ; CHECK-NEXT: .LBB2_5: # %cond.load ; CHECK-NEXT: lbu a2, 0(a0) -; CHECK-NEXT: vsetivli zero, 4, e8, m1, tu, ma +; CHECK-NEXT: vsetvli zero, zero, e8, m1, tu, ma ; CHECK-NEXT: vmv.s.x v8, a2 ; CHECK-NEXT: addi a0, a0, 1 ; CHECK-NEXT: andi a2, a1, 2 ; CHECK-NEXT: beqz a2, .LBB2_2 ; CHECK-NEXT: .LBB2_6: # %cond.load1 ; CHECK-NEXT: lbu a2, 0(a0) -; CHECK-NEXT: vsetivli zero, 2, e8, mf4, tu, ma +; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, ma ; CHECK-NEXT: vmv.s.x v9, a2 +; CHECK-NEXT: vsetivli zero, 2, e8, mf4, tu, ma ; CHECK-NEXT: vslideup.vi v8, v9, 1 ; CHECK-NEXT: addi a0, a0, 1 ; CHECK-NEXT: andi a2, a1, 4 @@ -133,15 +135,16 @@ define <8 x i8> @expandload_v8i8(ptr %base, <8 x i8> %src0, <8 x i1> %mask) { ; CHECK-NEXT: ret ; CHECK-NEXT: .LBB3_9: # %cond.load ; CHECK-NEXT: lbu a2, 0(a0) -; CHECK-NEXT: vsetivli zero, 8, e8, m1, tu, ma +; CHECK-NEXT: vsetvli zero, zero, e8, m1, tu, ma ; CHECK-NEXT: vmv.s.x v8, a2 ; CHECK-NEXT: addi a0, a0, 1 ; CHECK-NEXT: 
andi a2, a1, 2 ; CHECK-NEXT: beqz a2, .LBB3_2 ; CHECK-NEXT: .LBB3_10: # %cond.load1 ; CHECK-NEXT: lbu a2, 0(a0) -; CHECK-NEXT: vsetivli zero, 2, e8, mf2, tu, ma +; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, ma ; CHECK-NEXT: vmv.s.x v9, a2 +; CHECK-NEXT: vsetivli zero, 2, e8, mf2, tu, ma ; CHECK-NEXT: vslideup.vi v8, v9, 1 ; CHECK-NEXT: addi a0, a0, 1 ; CHECK-NEXT: andi a2, a1, 4 @@ -227,15 +230,16 @@ define <2 x i16> @expandload_v2i16(ptr %base, <2 x i16> %src0, <2 x i1> %mask) { ; CHECK-NEXT: ret ; CHECK-NEXT: .LBB5_3: # %cond.load ; CHECK-NEXT: lh a2, 0(a0) -; CHECK-NEXT: vsetivli zero, 2, e16, m2, tu, ma +; CHECK-NEXT: vsetvli zero, zero, e16, m2, tu, ma ; CHECK-NEXT: vmv.s.x v8, a2 ; CHECK-NEXT: addi a0, a0, 2 ; CHECK-NEXT: andi a1, a1, 2 ; CHECK-NEXT: beqz a1, .LBB5_2 ; CHECK-NEXT: .LBB5_4: # %cond.load1 ; CHECK-NEXT: lh a0, 0(a0) -; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; CHECK-NEXT: vmv.s.x v9, a0 +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; CHECK-NEXT: vslideup.vi v8, v9, 1 ; CHECK-NEXT: ret %res = call <2 x i16> @llvm.masked.expandload.v2i16(ptr align 2 %base, <2 x i1> %mask, <2 x i16> %src0) @@ -263,15 +267,16 @@ define <4 x i16> @expandload_v4i16(ptr %base, <4 x i16> %src0, <4 x i1> %mask) { ; CHECK-NEXT: ret ; CHECK-NEXT: .LBB6_5: # %cond.load ; CHECK-NEXT: lh a2, 0(a0) -; CHECK-NEXT: vsetivli zero, 4, e16, m2, tu, ma +; CHECK-NEXT: vsetvli zero, zero, e16, m2, tu, ma ; CHECK-NEXT: vmv.s.x v8, a2 ; CHECK-NEXT: addi a0, a0, 2 ; CHECK-NEXT: andi a2, a1, 2 ; CHECK-NEXT: beqz a2, .LBB6_2 ; CHECK-NEXT: .LBB6_6: # %cond.load1 ; CHECK-NEXT: lh a2, 0(a0) -; CHECK-NEXT: vsetivli zero, 2, e16, mf2, tu, ma +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; CHECK-NEXT: vmv.s.x v9, a2 +; CHECK-NEXT: vsetivli zero, 2, e16, mf2, tu, ma ; CHECK-NEXT: vslideup.vi v8, v9, 1 ; CHECK-NEXT: addi a0, a0, 2 ; CHECK-NEXT: andi a2, a1, 4 @@ -327,15 +332,16 @@ define <8 x i16> @expandload_v8i16(ptr %base, <8 x 
i16> %src0, <8 x i1> %mask) { ; CHECK-NEXT: ret ; CHECK-NEXT: .LBB7_9: # %cond.load ; CHECK-NEXT: lh a2, 0(a0) -; CHECK-NEXT: vsetivli zero, 8, e16, m2, tu, ma +; CHECK-NEXT: vsetvli zero, zero, e16, m2, tu, ma ; CHECK-NEXT: vmv.s.x v8, a2 ; CHECK-NEXT: addi a0, a0, 2 ; CHECK-NEXT: andi a2, a1, 2 ; CHECK-NEXT: beqz a2, .LBB7_2 ; CHECK-NEXT: .LBB7_10: # %cond.load1 ; CHECK-NEXT: lh a2, 0(a0) -; CHECK-NEXT: vsetivli zero, 2, e16, m1, tu, ma +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; CHECK-NEXT: vmv.s.x v9, a2 +; CHECK-NEXT: vsetivli zero, 2, e16, m1, tu, ma ; CHECK-NEXT: vslideup.vi v8, v9, 1 ; CHECK-NEXT: addi a0, a0, 2 ; CHECK-NEXT: andi a2, a1, 4 @@ -421,15 +427,16 @@ define <2 x i32> @expandload_v2i32(ptr %base, <2 x i32> %src0, <2 x i1> %mask) { ; CHECK-NEXT: ret ; CHECK-NEXT: .LBB9_3: # %cond.load ; CHECK-NEXT: lw a2, 0(a0) -; CHECK-NEXT: vsetivli zero, 2, e32, m4, tu, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m4, tu, ma ; CHECK-NEXT: vmv.s.x v8, a2 ; CHECK-NEXT: addi a0, a0, 4 ; CHECK-NEXT: andi a1, a1, 2 ; CHECK-NEXT: beqz a1, .LBB9_2 ; CHECK-NEXT: .LBB9_4: # %cond.load1 ; CHECK-NEXT: lw a0, 0(a0) -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vmv.s.x v9, a0 +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; CHECK-NEXT: vslideup.vi v8, v9, 1 ; CHECK-NEXT: ret %res = call <2 x i32> @llvm.masked.expandload.v2i32(ptr align 4 %base, <2 x i1> %mask, <2 x i32> %src0) @@ -457,15 +464,16 @@ define <4 x i32> @expandload_v4i32(ptr %base, <4 x i32> %src0, <4 x i1> %mask) { ; CHECK-NEXT: ret ; CHECK-NEXT: .LBB10_5: # %cond.load ; CHECK-NEXT: lw a2, 0(a0) -; CHECK-NEXT: vsetivli zero, 4, e32, m4, tu, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m4, tu, ma ; CHECK-NEXT: vmv.s.x v8, a2 ; CHECK-NEXT: addi a0, a0, 4 ; CHECK-NEXT: andi a2, a1, 2 ; CHECK-NEXT: beqz a2, .LBB10_2 ; CHECK-NEXT: .LBB10_6: # %cond.load1 ; CHECK-NEXT: lw a2, 0(a0) -; CHECK-NEXT: vsetivli zero, 2, e32, m1, tu, ma +; 
CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vmv.s.x v9, a2 +; CHECK-NEXT: vsetivli zero, 2, e32, m1, tu, ma ; CHECK-NEXT: vslideup.vi v8, v9, 1 ; CHECK-NEXT: addi a0, a0, 4 ; CHECK-NEXT: andi a2, a1, 4 @@ -521,15 +529,16 @@ define <8 x i32> @expandload_v8i32(ptr %base, <8 x i32> %src0, <8 x i1> %mask) { ; CHECK-NEXT: ret ; CHECK-NEXT: .LBB11_9: # %cond.load ; CHECK-NEXT: lw a2, 0(a0) -; CHECK-NEXT: vsetivli zero, 8, e32, m4, tu, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m4, tu, ma ; CHECK-NEXT: vmv.s.x v8, a2 ; CHECK-NEXT: addi a0, a0, 4 ; CHECK-NEXT: andi a2, a1, 2 ; CHECK-NEXT: beqz a2, .LBB11_2 ; CHECK-NEXT: .LBB11_10: # %cond.load1 ; CHECK-NEXT: lw a2, 0(a0) -; CHECK-NEXT: vsetivli zero, 2, e32, m1, tu, ma +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vmv.s.x v10, a2 +; CHECK-NEXT: vsetivli zero, 2, e32, m1, tu, ma ; CHECK-NEXT: vslideup.vi v8, v10, 1 ; CHECK-NEXT: addi a0, a0, 4 ; CHECK-NEXT: andi a2, a1, 4 @@ -664,15 +673,16 @@ define <2 x i64> @expandload_v2i64(ptr %base, <2 x i64> %src0, <2 x i1> %mask) { ; RV64-NEXT: ret ; RV64-NEXT: .LBB13_3: # %cond.load ; RV64-NEXT: ld a2, 0(a0) -; RV64-NEXT: vsetivli zero, 2, e64, m8, tu, ma +; RV64-NEXT: vsetvli zero, zero, e64, m8, tu, ma ; RV64-NEXT: vmv.s.x v8, a2 ; RV64-NEXT: addi a0, a0, 8 ; RV64-NEXT: andi a1, a1, 2 ; RV64-NEXT: beqz a1, .LBB13_2 ; RV64-NEXT: .LBB13_4: # %cond.load1 ; RV64-NEXT: ld a0, 0(a0) -; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma ; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV64-NEXT: vslideup.vi v8, v9, 1 ; RV64-NEXT: ret %res = call <2 x i64> @llvm.masked.expandload.v2i64(ptr align 8 %base, <2 x i1> %mask, <2 x i64> %src0) @@ -758,15 +768,16 @@ define <4 x i64> @expandload_v4i64(ptr %base, <4 x i64> %src0, <4 x i1> %mask) { ; RV64-NEXT: ret ; RV64-NEXT: .LBB14_5: # %cond.load ; RV64-NEXT: ld a2, 0(a0) -; RV64-NEXT: vsetivli zero, 4, e64, m8, tu, ma +; RV64-NEXT: 
vsetvli zero, zero, e64, m8, tu, ma ; RV64-NEXT: vmv.s.x v8, a2 ; RV64-NEXT: addi a0, a0, 8 ; RV64-NEXT: andi a2, a1, 2 ; RV64-NEXT: beqz a2, .LBB14_2 ; RV64-NEXT: .LBB14_6: # %cond.load1 ; RV64-NEXT: ld a2, 0(a0) -; RV64-NEXT: vsetivli zero, 2, e64, m1, tu, ma +; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma ; RV64-NEXT: vmv.s.x v10, a2 +; RV64-NEXT: vsetivli zero, 2, e64, m1, tu, ma ; RV64-NEXT: vslideup.vi v8, v10, 1 ; RV64-NEXT: addi a0, a0, 8 ; RV64-NEXT: andi a2, a1, 4 @@ -936,15 +947,16 @@ define <8 x i64> @expandload_v8i64(ptr %base, <8 x i64> %src0, <8 x i1> %mask) { ; RV64-NEXT: ret ; RV64-NEXT: .LBB15_9: # %cond.load ; RV64-NEXT: ld a2, 0(a0) -; RV64-NEXT: vsetivli zero, 8, e64, m8, tu, ma +; RV64-NEXT: vsetvli zero, zero, e64, m8, tu, ma ; RV64-NEXT: vmv.s.x v8, a2 ; RV64-NEXT: addi a0, a0, 8 ; RV64-NEXT: andi a2, a1, 2 ; RV64-NEXT: beqz a2, .LBB15_2 ; RV64-NEXT: .LBB15_10: # %cond.load1 ; RV64-NEXT: ld a2, 0(a0) -; RV64-NEXT: vsetivli zero, 2, e64, m1, tu, ma +; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma ; RV64-NEXT: vmv.s.x v12, a2 +; RV64-NEXT: vsetivli zero, 2, e64, m1, tu, ma ; RV64-NEXT: vslideup.vi v8, v12, 1 ; RV64-NEXT: addi a0, a0, 8 ; RV64-NEXT: andi a2, a1, 4 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-llrint.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-llrint.ll index 9463267d0b0e62..2d3865ba4533d1 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-llrint.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-llrint.ll @@ -26,7 +26,7 @@ define <1 x i64> @llrint_v1i64_v1f32(<1 x float> %x) { ; RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma ; RV64-NEXT: vfmv.f.s fa5, v8 ; RV64-NEXT: fcvt.l.s a0, fa5 -; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vsetvli zero, zero, e64, m1, ta, ma ; RV64-NEXT: vmv.s.x v8, a0 ; RV64-NEXT: ret %a = call <1 x i64> @llvm.llrint.v1i64.v1f32(<1 x float> %x) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-lrint.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-lrint.ll index 
9b0944e7e2f721..de47d8572017b7 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-lrint.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-lrint.ll @@ -28,7 +28,7 @@ define <1 x iXLen> @lrint_v1f32(<1 x float> %x) { ; RV64-i64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma ; RV64-i64-NEXT: vfmv.f.s fa5, v8 ; RV64-i64-NEXT: fcvt.l.s a0, fa5 -; RV64-i64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-i64-NEXT: vsetvli zero, zero, e64, m1, ta, ma ; RV64-i64-NEXT: vmv.s.x v8, a0 ; RV64-i64-NEXT: ret %a = call <1 x iXLen> @llvm.lrint.v1iXLen.v1f32(<1 x float> %x) @@ -609,7 +609,6 @@ define <1 x iXLen> @lrint_v1f64(<1 x double> %x) { ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vfmv.f.s fa5, v8 ; RV32-NEXT: fcvt.w.d a0, fa5 -; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma ; RV32-NEXT: vmv.s.x v8, a0 ; RV32-NEXT: ret ; @@ -618,7 +617,6 @@ define <1 x iXLen> @lrint_v1f64(<1 x double> %x) { ; RV64-i32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV64-i32-NEXT: vfmv.f.s fa5, v8 ; RV64-i32-NEXT: fcvt.l.d a0, fa5 -; RV64-i32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma ; RV64-i32-NEXT: vmv.s.x v8, a0 ; RV64-i32-NEXT: ret ; diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll index 69341981288b91..a4f9eeb59cd5bf 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll @@ -83,14 +83,15 @@ define <2 x i8> @mgather_v2i8(<2 x ptr> %ptrs, <2 x i1> %m, <2 x i8> %passthru) ; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB1_3: # %cond.load ; RV64ZVE32F-NEXT: lbu a0, 0(a0) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, m1, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e8, m1, tu, ma ; RV64ZVE32F-NEXT: vmv.s.x v8, a0 ; RV64ZVE32F-NEXT: andi a2, a2, 2 ; RV64ZVE32F-NEXT: beqz a2, .LBB1_2 ; RV64ZVE32F-NEXT: .LBB1_4: # %cond.load1 ; RV64ZVE32F-NEXT: lbu a0, 0(a1) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: 
vsetvli zero, zero, e8, m1, ta, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a0 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 ; RV64ZVE32F-NEXT: ret %v = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> %ptrs, i32 1, <2 x i1> %m, <2 x i8> %passthru) @@ -130,15 +131,16 @@ define <2 x i16> @mgather_v2i8_sextload_v2i16(<2 x ptr> %ptrs, <2 x i1> %m, <2 x ; RV64ZVE32F-NEXT: beqz a3, .LBB2_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load ; RV64ZVE32F-NEXT: lbu a0, 0(a0) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, m1, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e8, m1, tu, ma ; RV64ZVE32F-NEXT: vmv.s.x v8, a0 ; RV64ZVE32F-NEXT: .LBB2_2: # %else ; RV64ZVE32F-NEXT: andi a2, a2, 2 ; RV64ZVE32F-NEXT: beqz a2, .LBB2_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 ; RV64ZVE32F-NEXT: lbu a0, 0(a1) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e8, m1, ta, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a0 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 ; RV64ZVE32F-NEXT: .LBB2_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma @@ -183,15 +185,16 @@ define <2 x i16> @mgather_v2i8_zextload_v2i16(<2 x ptr> %ptrs, <2 x i1> %m, <2 x ; RV64ZVE32F-NEXT: beqz a3, .LBB3_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load ; RV64ZVE32F-NEXT: lbu a0, 0(a0) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, m1, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e8, m1, tu, ma ; RV64ZVE32F-NEXT: vmv.s.x v8, a0 ; RV64ZVE32F-NEXT: .LBB3_2: # %else ; RV64ZVE32F-NEXT: andi a2, a2, 2 ; RV64ZVE32F-NEXT: beqz a2, .LBB3_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 ; RV64ZVE32F-NEXT: lbu a0, 0(a1) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e8, m1, ta, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a0 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 ; RV64ZVE32F-NEXT: .LBB3_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli 
zero, 2, e16, mf2, ta, ma @@ -236,15 +239,16 @@ define <2 x i32> @mgather_v2i8_sextload_v2i32(<2 x ptr> %ptrs, <2 x i1> %m, <2 x ; RV64ZVE32F-NEXT: beqz a3, .LBB4_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load ; RV64ZVE32F-NEXT: lbu a0, 0(a0) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, m1, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e8, m1, tu, ma ; RV64ZVE32F-NEXT: vmv.s.x v8, a0 ; RV64ZVE32F-NEXT: .LBB4_2: # %else ; RV64ZVE32F-NEXT: andi a2, a2, 2 ; RV64ZVE32F-NEXT: beqz a2, .LBB4_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 ; RV64ZVE32F-NEXT: lbu a0, 0(a1) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e8, m1, ta, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a0 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 ; RV64ZVE32F-NEXT: .LBB4_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, ma @@ -289,15 +293,16 @@ define <2 x i32> @mgather_v2i8_zextload_v2i32(<2 x ptr> %ptrs, <2 x i1> %m, <2 x ; RV64ZVE32F-NEXT: beqz a3, .LBB5_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load ; RV64ZVE32F-NEXT: lbu a0, 0(a0) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, m1, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e8, m1, tu, ma ; RV64ZVE32F-NEXT: vmv.s.x v8, a0 ; RV64ZVE32F-NEXT: .LBB5_2: # %else ; RV64ZVE32F-NEXT: andi a2, a2, 2 ; RV64ZVE32F-NEXT: beqz a2, .LBB5_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 ; RV64ZVE32F-NEXT: lbu a0, 0(a1) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e8, m1, ta, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a0 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 ; RV64ZVE32F-NEXT: .LBB5_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, ma @@ -349,15 +354,16 @@ define <2 x i64> @mgather_v2i8_sextload_v2i64(<2 x ptr> %ptrs, <2 x i1> %m, <2 x ; RV64ZVE32F-NEXT: beqz a3, .LBB6_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load ; RV64ZVE32F-NEXT: lbu a0, 0(a0) -; RV64ZVE32F-NEXT: vsetivli 
zero, 2, e8, m1, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e8, m1, tu, ma ; RV64ZVE32F-NEXT: vmv.s.x v8, a0 ; RV64ZVE32F-NEXT: .LBB6_2: # %else ; RV64ZVE32F-NEXT: andi a2, a2, 2 ; RV64ZVE32F-NEXT: beqz a2, .LBB6_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 ; RV64ZVE32F-NEXT: lbu a0, 0(a1) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e8, m1, ta, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a0 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 ; RV64ZVE32F-NEXT: .LBB6_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma @@ -410,15 +416,16 @@ define <2 x i64> @mgather_v2i8_zextload_v2i64(<2 x ptr> %ptrs, <2 x i1> %m, <2 x ; RV64ZVE32F-NEXT: beqz a3, .LBB7_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load ; RV64ZVE32F-NEXT: lbu a0, 0(a0) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, m1, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e8, m1, tu, ma ; RV64ZVE32F-NEXT: vmv.s.x v8, a0 ; RV64ZVE32F-NEXT: .LBB7_2: # %else ; RV64ZVE32F-NEXT: andi a2, a2, 2 ; RV64ZVE32F-NEXT: beqz a2, .LBB7_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 ; RV64ZVE32F-NEXT: lbu a0, 0(a1) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e8, m1, ta, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a0 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 ; RV64ZVE32F-NEXT: .LBB7_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma @@ -470,15 +477,16 @@ define <4 x i8> @mgather_v4i8(<4 x ptr> %ptrs, <4 x i1> %m, <4 x i8> %passthru) ; RV64ZVE32F-NEXT: .LBB8_5: # %cond.load ; RV64ZVE32F-NEXT: ld a2, 0(a0) ; RV64ZVE32F-NEXT: lbu a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, m1, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e8, m1, tu, ma ; RV64ZVE32F-NEXT: vmv.s.x v8, a2 ; RV64ZVE32F-NEXT: andi a2, a1, 2 ; RV64ZVE32F-NEXT: beqz a2, .LBB8_2 ; RV64ZVE32F-NEXT: .LBB8_6: # %cond.load1 ; RV64ZVE32F-NEXT: ld a2, 8(a0) ; RV64ZVE32F-NEXT: 
lbu a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e8, m1, ta, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 ; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: beqz a2, .LBB8_3 @@ -602,15 +610,16 @@ define <8 x i8> @mgather_v8i8(<8 x ptr> %ptrs, <8 x i1> %m, <8 x i8> %passthru) ; RV64ZVE32F-NEXT: .LBB11_9: # %cond.load ; RV64ZVE32F-NEXT: ld a2, 0(a0) ; RV64ZVE32F-NEXT: lbu a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, m1, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e8, m1, tu, ma ; RV64ZVE32F-NEXT: vmv.s.x v8, a2 ; RV64ZVE32F-NEXT: andi a2, a1, 2 ; RV64ZVE32F-NEXT: beqz a2, .LBB11_2 ; RV64ZVE32F-NEXT: .LBB11_10: # %cond.load1 ; RV64ZVE32F-NEXT: ld a2, 8(a0) ; RV64ZVE32F-NEXT: lbu a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf2, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e8, m1, ta, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf2, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 ; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: beqz a2, .LBB11_3 @@ -694,7 +703,7 @@ define <8 x i8> @mgather_baseidx_v8i8(ptr %base, <8 x i8> %idxs, <8 x i1> %m, <8 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lbu a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, m1, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e8, m1, tu, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a2 ; RV64ZVE32F-NEXT: .LBB12_2: # %else ; RV64ZVE32F-NEXT: andi a2, a1, 2 @@ -874,14 +883,15 @@ define <2 x i16> @mgather_v2i16(<2 x ptr> %ptrs, <2 x i1> %m, <2 x i16> %passthr ; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB14_3: # %cond.load ; RV64ZVE32F-NEXT: lh a0, 0(a0) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, m2, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m2, tu, ma ; RV64ZVE32F-NEXT: vmv.s.x v8, a0 ; RV64ZVE32F-NEXT: andi a2, a2, 2 ; RV64ZVE32F-NEXT: beqz a2, .LBB14_2 ; 
RV64ZVE32F-NEXT: .LBB14_4: # %cond.load1 ; RV64ZVE32F-NEXT: lh a0, 0(a1) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a0 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 ; RV64ZVE32F-NEXT: ret %v = call <2 x i16> @llvm.masked.gather.v2i16.v2p0(<2 x ptr> %ptrs, i32 2, <2 x i1> %m, <2 x i16> %passthru) @@ -921,15 +931,16 @@ define <2 x i32> @mgather_v2i16_sextload_v2i32(<2 x ptr> %ptrs, <2 x i1> %m, <2 ; RV64ZVE32F-NEXT: beqz a3, .LBB15_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load ; RV64ZVE32F-NEXT: lh a0, 0(a0) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, m2, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m2, tu, ma ; RV64ZVE32F-NEXT: vmv.s.x v8, a0 ; RV64ZVE32F-NEXT: .LBB15_2: # %else ; RV64ZVE32F-NEXT: andi a2, a2, 2 ; RV64ZVE32F-NEXT: beqz a2, .LBB15_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 ; RV64ZVE32F-NEXT: lh a0, 0(a1) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a0 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 ; RV64ZVE32F-NEXT: .LBB15_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, ma @@ -974,15 +985,16 @@ define <2 x i32> @mgather_v2i16_zextload_v2i32(<2 x ptr> %ptrs, <2 x i1> %m, <2 ; RV64ZVE32F-NEXT: beqz a3, .LBB16_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load ; RV64ZVE32F-NEXT: lh a0, 0(a0) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, m2, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m2, tu, ma ; RV64ZVE32F-NEXT: vmv.s.x v8, a0 ; RV64ZVE32F-NEXT: .LBB16_2: # %else ; RV64ZVE32F-NEXT: andi a2, a2, 2 ; RV64ZVE32F-NEXT: beqz a2, .LBB16_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 ; RV64ZVE32F-NEXT: lh a0, 0(a1) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a0 
+; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 ; RV64ZVE32F-NEXT: .LBB16_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, ma @@ -1034,15 +1046,16 @@ define <2 x i64> @mgather_v2i16_sextload_v2i64(<2 x ptr> %ptrs, <2 x i1> %m, <2 ; RV64ZVE32F-NEXT: beqz a3, .LBB17_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load ; RV64ZVE32F-NEXT: lh a0, 0(a0) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, m2, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m2, tu, ma ; RV64ZVE32F-NEXT: vmv.s.x v8, a0 ; RV64ZVE32F-NEXT: .LBB17_2: # %else ; RV64ZVE32F-NEXT: andi a2, a2, 2 ; RV64ZVE32F-NEXT: beqz a2, .LBB17_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 ; RV64ZVE32F-NEXT: lh a0, 0(a1) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a0 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 ; RV64ZVE32F-NEXT: .LBB17_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma @@ -1097,15 +1110,16 @@ define <2 x i64> @mgather_v2i16_zextload_v2i64(<2 x ptr> %ptrs, <2 x i1> %m, <2 ; RV64ZVE32F-NEXT: beqz a3, .LBB18_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load ; RV64ZVE32F-NEXT: lh a0, 0(a0) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, m2, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m2, tu, ma ; RV64ZVE32F-NEXT: vmv.s.x v8, a0 ; RV64ZVE32F-NEXT: .LBB18_2: # %else ; RV64ZVE32F-NEXT: andi a2, a2, 2 ; RV64ZVE32F-NEXT: beqz a2, .LBB18_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 ; RV64ZVE32F-NEXT: lh a0, 0(a1) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a0 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 ; RV64ZVE32F-NEXT: .LBB18_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma @@ -1159,15 +1173,16 @@ define <4 x i16> @mgather_v4i16(<4 x ptr> 
%ptrs, <4 x i1> %m, <4 x i16> %passthr ; RV64ZVE32F-NEXT: .LBB19_5: # %cond.load ; RV64ZVE32F-NEXT: ld a2, 0(a0) ; RV64ZVE32F-NEXT: lh a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m2, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m2, tu, ma ; RV64ZVE32F-NEXT: vmv.s.x v8, a2 ; RV64ZVE32F-NEXT: andi a2, a1, 2 ; RV64ZVE32F-NEXT: beqz a2, .LBB19_2 ; RV64ZVE32F-NEXT: .LBB19_6: # %cond.load1 ; RV64ZVE32F-NEXT: ld a2, 8(a0) ; RV64ZVE32F-NEXT: lh a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 ; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: beqz a2, .LBB19_3 @@ -1291,15 +1306,16 @@ define <8 x i16> @mgather_v8i16(<8 x ptr> %ptrs, <8 x i1> %m, <8 x i16> %passthr ; RV64ZVE32F-NEXT: .LBB22_9: # %cond.load ; RV64ZVE32F-NEXT: ld a2, 0(a0) ; RV64ZVE32F-NEXT: lh a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m2, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m2, tu, ma ; RV64ZVE32F-NEXT: vmv.s.x v8, a2 ; RV64ZVE32F-NEXT: andi a2, a1, 2 ; RV64ZVE32F-NEXT: beqz a2, .LBB22_2 ; RV64ZVE32F-NEXT: .LBB22_10: # %cond.load1 ; RV64ZVE32F-NEXT: ld a2, 8(a0) ; RV64ZVE32F-NEXT: lh a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, m1, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, m1, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 ; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: beqz a2, .LBB22_3 @@ -1386,7 +1402,7 @@ define <8 x i16> @mgather_baseidx_v8i8_v8i16(ptr %base, <8 x i8> %idxs, <8 x i1> ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lh a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m2, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m2, tu, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a2 ; RV64ZVE32F-NEXT: .LBB23_2: 
# %else ; RV64ZVE32F-NEXT: andi a2, a1, 2 @@ -1470,8 +1486,9 @@ define <8 x i16> @mgather_baseidx_v8i8_v8i16(ptr %base, <8 x i8> %idxs, <8 x i1> ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lh a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 5, e16, m1, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; RV64ZVE32F-NEXT: vmv.s.x v8, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 5, e16, m1, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 4 ; RV64ZVE32F-NEXT: andi a2, a1, 32 ; RV64ZVE32F-NEXT: bnez a2, .LBB23_8 @@ -1537,7 +1554,7 @@ define <8 x i16> @mgather_baseidx_sext_v8i8_v8i16(ptr %base, <8 x i8> %idxs, <8 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lh a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m2, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m2, tu, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a2 ; RV64ZVE32F-NEXT: .LBB24_2: # %else ; RV64ZVE32F-NEXT: andi a2, a1, 2 @@ -1621,8 +1638,9 @@ define <8 x i16> @mgather_baseidx_sext_v8i8_v8i16(ptr %base, <8 x i8> %idxs, <8 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lh a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 5, e16, m1, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; RV64ZVE32F-NEXT: vmv.s.x v8, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 5, e16, m1, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 4 ; RV64ZVE32F-NEXT: andi a2, a1, 32 ; RV64ZVE32F-NEXT: bnez a2, .LBB24_8 @@ -1688,7 +1706,7 @@ define <8 x i16> @mgather_baseidx_zext_v8i8_v8i16(ptr %base, <8 x i8> %idxs, <8 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lh a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m2, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m2, tu, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a2 ; RV64ZVE32F-NEXT: .LBB25_2: # %else ; RV64ZVE32F-NEXT: andi a2, a1, 2 @@ -1777,8 +1795,9 @@ define <8 x i16> @mgather_baseidx_zext_v8i8_v8i16(ptr %base, <8 x i8> 
%idxs, <8 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lh a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 5, e16, m1, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; RV64ZVE32F-NEXT: vmv.s.x v8, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 5, e16, m1, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 4 ; RV64ZVE32F-NEXT: andi a2, a1, 32 ; RV64ZVE32F-NEXT: bnez a2, .LBB25_8 @@ -1841,7 +1860,7 @@ define <8 x i16> @mgather_baseidx_v8i16(ptr %base, <8 x i16> %idxs, <8 x i1> %m, ; RV64ZVE32F-NEXT: andi a2, a1, 1 ; RV64ZVE32F-NEXT: beqz a2, .LBB26_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load -; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m2, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m2, tu, ma ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 @@ -2032,14 +2051,15 @@ define <2 x i32> @mgather_v2i32(<2 x ptr> %ptrs, <2 x i1> %m, <2 x i32> %passthr ; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB28_3: # %cond.load ; RV64ZVE32F-NEXT: lw a0, 0(a0) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m4, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m4, tu, ma ; RV64ZVE32F-NEXT: vmv.s.x v8, a0 ; RV64ZVE32F-NEXT: andi a2, a2, 2 ; RV64ZVE32F-NEXT: beqz a2, .LBB28_2 ; RV64ZVE32F-NEXT: .LBB28_4: # %cond.load1 ; RV64ZVE32F-NEXT: lw a0, 0(a1) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a0 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 ; RV64ZVE32F-NEXT: ret %v = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %m, <2 x i32> %passthru) @@ -2088,15 +2108,16 @@ define <2 x i64> @mgather_v2i32_sextload_v2i64(<2 x ptr> %ptrs, <2 x i1> %m, <2 ; RV64ZVE32F-NEXT: beqz a3, .LBB29_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load ; RV64ZVE32F-NEXT: lw a0, 0(a0) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m4, tu, ma +; RV64ZVE32F-NEXT: 
vsetvli zero, zero, e32, m4, tu, ma ; RV64ZVE32F-NEXT: vmv.s.x v8, a0 ; RV64ZVE32F-NEXT: .LBB29_2: # %else ; RV64ZVE32F-NEXT: andi a2, a2, 2 ; RV64ZVE32F-NEXT: beqz a2, .LBB29_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 ; RV64ZVE32F-NEXT: lw a0, 0(a1) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a0 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 ; RV64ZVE32F-NEXT: .LBB29_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma @@ -2147,15 +2168,16 @@ define <2 x i64> @mgather_v2i32_zextload_v2i64(<2 x ptr> %ptrs, <2 x i1> %m, <2 ; RV64ZVE32F-NEXT: beqz a3, .LBB30_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load ; RV64ZVE32F-NEXT: lw a0, 0(a0) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m4, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m4, tu, ma ; RV64ZVE32F-NEXT: vmv.s.x v8, a0 ; RV64ZVE32F-NEXT: .LBB30_2: # %else ; RV64ZVE32F-NEXT: andi a2, a2, 2 ; RV64ZVE32F-NEXT: beqz a2, .LBB30_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 ; RV64ZVE32F-NEXT: lw a0, 0(a1) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a0 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 ; RV64ZVE32F-NEXT: .LBB30_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma @@ -2209,15 +2231,16 @@ define <4 x i32> @mgather_v4i32(<4 x ptr> %ptrs, <4 x i1> %m, <4 x i32> %passthr ; RV64ZVE32F-NEXT: .LBB31_5: # %cond.load ; RV64ZVE32F-NEXT: ld a2, 0(a0) ; RV64ZVE32F-NEXT: lw a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m4, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m4, tu, ma ; RV64ZVE32F-NEXT: vmv.s.x v8, a2 ; RV64ZVE32F-NEXT: andi a2, a1, 2 ; RV64ZVE32F-NEXT: beqz a2, .LBB31_2 ; RV64ZVE32F-NEXT: .LBB31_6: # %cond.load1 ; RV64ZVE32F-NEXT: ld a2, 8(a0) ; RV64ZVE32F-NEXT: lw a2, 0(a2) -; 
RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 ; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: beqz a2, .LBB31_3 @@ -2340,15 +2363,16 @@ define <8 x i32> @mgather_v8i32(<8 x ptr> %ptrs, <8 x i1> %m, <8 x i32> %passthr ; RV64ZVE32F-NEXT: .LBB34_9: # %cond.load ; RV64ZVE32F-NEXT: ld a2, 0(a0) ; RV64ZVE32F-NEXT: lw a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m4, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m4, tu, ma ; RV64ZVE32F-NEXT: vmv.s.x v8, a2 ; RV64ZVE32F-NEXT: andi a2, a1, 2 ; RV64ZVE32F-NEXT: beqz a2, .LBB34_2 ; RV64ZVE32F-NEXT: .LBB34_10: # %cond.load1 ; RV64ZVE32F-NEXT: ld a2, 8(a0) ; RV64ZVE32F-NEXT: lw a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; RV64ZVE32F-NEXT: vmv.s.x v10, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v8, v10, 1 ; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: beqz a2, .LBB34_3 @@ -2434,7 +2458,7 @@ define <8 x i32> @mgather_baseidx_v8i8_v8i32(ptr %base, <8 x i8> %idxs, <8 x i1> ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lw a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m4, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m4, tu, ma ; RV64ZVE32F-NEXT: vmv.s.x v10, a2 ; RV64ZVE32F-NEXT: .LBB35_2: # %else ; RV64ZVE32F-NEXT: andi a2, a1, 2 @@ -2518,8 +2542,9 @@ define <8 x i32> @mgather_baseidx_v8i8_v8i32(ptr %base, <8 x i8> %idxs, <8 x i1> ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lw a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; RV64ZVE32F-NEXT: vmv.s.x v8, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, ma ; RV64ZVE32F-NEXT: 
vslideup.vi v10, v8, 4 ; RV64ZVE32F-NEXT: andi a2, a1, 32 ; RV64ZVE32F-NEXT: bnez a2, .LBB35_8 @@ -2584,7 +2609,7 @@ define <8 x i32> @mgather_baseidx_sext_v8i8_v8i32(ptr %base, <8 x i8> %idxs, <8 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lw a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m4, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m4, tu, ma ; RV64ZVE32F-NEXT: vmv.s.x v10, a2 ; RV64ZVE32F-NEXT: .LBB36_2: # %else ; RV64ZVE32F-NEXT: andi a2, a1, 2 @@ -2668,8 +2693,9 @@ define <8 x i32> @mgather_baseidx_sext_v8i8_v8i32(ptr %base, <8 x i8> %idxs, <8 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lw a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; RV64ZVE32F-NEXT: vmv.s.x v8, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 4 ; RV64ZVE32F-NEXT: andi a2, a1, 32 ; RV64ZVE32F-NEXT: bnez a2, .LBB36_8 @@ -2737,7 +2763,7 @@ define <8 x i32> @mgather_baseidx_zext_v8i8_v8i32(ptr %base, <8 x i8> %idxs, <8 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lw a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m4, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m4, tu, ma ; RV64ZVE32F-NEXT: vmv.s.x v10, a2 ; RV64ZVE32F-NEXT: .LBB37_2: # %else ; RV64ZVE32F-NEXT: andi a2, a1, 2 @@ -2826,8 +2852,9 @@ define <8 x i32> @mgather_baseidx_zext_v8i8_v8i32(ptr %base, <8 x i8> %idxs, <8 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lw a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; RV64ZVE32F-NEXT: vmv.s.x v8, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 4 ; RV64ZVE32F-NEXT: andi a2, a1, 32 ; RV64ZVE32F-NEXT: bnez a2, .LBB37_8 @@ -2896,7 +2923,7 @@ define <8 x i32> 
@mgather_baseidx_v8i16_v8i32(ptr %base, <8 x i16> %idxs, <8 x i ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lw a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m4, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m4, tu, ma ; RV64ZVE32F-NEXT: vmv.s.x v10, a2 ; RV64ZVE32F-NEXT: .LBB38_2: # %else ; RV64ZVE32F-NEXT: andi a2, a1, 2 @@ -2980,8 +3007,9 @@ define <8 x i32> @mgather_baseidx_v8i16_v8i32(ptr %base, <8 x i16> %idxs, <8 x i ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lw a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV64ZVE32F-NEXT: vmv.s.x v8, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 4 ; RV64ZVE32F-NEXT: andi a2, a1, 32 ; RV64ZVE32F-NEXT: bnez a2, .LBB38_8 @@ -3047,7 +3075,7 @@ define <8 x i32> @mgather_baseidx_sext_v8i16_v8i32(ptr %base, <8 x i16> %idxs, < ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lw a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m4, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m4, tu, ma ; RV64ZVE32F-NEXT: vmv.s.x v10, a2 ; RV64ZVE32F-NEXT: .LBB39_2: # %else ; RV64ZVE32F-NEXT: andi a2, a1, 2 @@ -3131,8 +3159,9 @@ define <8 x i32> @mgather_baseidx_sext_v8i16_v8i32(ptr %base, <8 x i16> %idxs, < ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lw a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV64ZVE32F-NEXT: vmv.s.x v8, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 4 ; RV64ZVE32F-NEXT: andi a2, a1, 32 ; RV64ZVE32F-NEXT: bnez a2, .LBB39_8 @@ -3201,7 +3230,7 @@ define <8 x i32> @mgather_baseidx_zext_v8i16_v8i32(ptr %base, <8 x i16> %idxs, < ; RV64ZVE32F-NEXT: slli a3, a3, 2 ; RV64ZVE32F-NEXT: add a3, a0, a3 
; RV64ZVE32F-NEXT: lw a3, 0(a3) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m4, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m4, tu, ma ; RV64ZVE32F-NEXT: vmv.s.x v10, a3 ; RV64ZVE32F-NEXT: .LBB40_2: # %else ; RV64ZVE32F-NEXT: andi a3, a2, 2 @@ -3290,8 +3319,9 @@ define <8 x i32> @mgather_baseidx_zext_v8i16_v8i32(ptr %base, <8 x i16> %idxs, < ; RV64ZVE32F-NEXT: slli a3, a3, 2 ; RV64ZVE32F-NEXT: add a3, a0, a3 ; RV64ZVE32F-NEXT: lw a3, 0(a3) -; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV64ZVE32F-NEXT: vmv.s.x v8, a3 +; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 4 ; RV64ZVE32F-NEXT: andi a3, a2, 32 ; RV64ZVE32F-NEXT: bnez a3, .LBB40_8 @@ -3354,7 +3384,7 @@ define <8 x i32> @mgather_baseidx_v8i32(ptr %base, <8 x i32> %idxs, <8 x i1> %m, ; RV64ZVE32F-NEXT: andi a2, a1, 1 ; RV64ZVE32F-NEXT: beqz a2, .LBB41_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load -; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m4, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m4, tu, ma ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 @@ -6915,14 +6945,15 @@ define <2 x half> @mgather_v2f16(<2 x ptr> %ptrs, <2 x i1> %m, <2 x half> %passt ; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB59_3: # %cond.load ; RV64ZVE32F-NEXT: flh fa5, 0(a0) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, m2, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m2, tu, ma ; RV64ZVE32F-NEXT: vfmv.s.f v8, fa5 ; RV64ZVE32F-NEXT: andi a2, a2, 2 ; RV64ZVE32F-NEXT: beqz a2, .LBB59_2 ; RV64ZVE32F-NEXT: .LBB59_4: # %cond.load1 ; RV64ZVE32F-NEXT: flh fa5, 0(a1) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; RV64ZVE32F-NEXT: vfmv.s.f v9, fa5 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 ; RV64ZVE32F-NEXT: ret %v = call <2 x half> 
@llvm.masked.gather.v2f16.v2p0(<2 x ptr> %ptrs, i32 2, <2 x i1> %m, <2 x half> %passthru) @@ -6966,15 +6997,16 @@ define <4 x half> @mgather_v4f16(<4 x ptr> %ptrs, <4 x i1> %m, <4 x half> %passt ; RV64ZVE32F-NEXT: .LBB60_5: # %cond.load ; RV64ZVE32F-NEXT: ld a2, 0(a0) ; RV64ZVE32F-NEXT: flh fa5, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m2, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m2, tu, ma ; RV64ZVE32F-NEXT: vfmv.s.f v8, fa5 ; RV64ZVE32F-NEXT: andi a2, a1, 2 ; RV64ZVE32F-NEXT: beqz a2, .LBB60_2 ; RV64ZVE32F-NEXT: .LBB60_6: # %cond.load1 ; RV64ZVE32F-NEXT: ld a2, 8(a0) ; RV64ZVE32F-NEXT: flh fa5, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; RV64ZVE32F-NEXT: vfmv.s.f v9, fa5 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 ; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: beqz a2, .LBB60_3 @@ -7098,15 +7130,16 @@ define <8 x half> @mgather_v8f16(<8 x ptr> %ptrs, <8 x i1> %m, <8 x half> %passt ; RV64ZVE32F-NEXT: .LBB63_9: # %cond.load ; RV64ZVE32F-NEXT: ld a2, 0(a0) ; RV64ZVE32F-NEXT: flh fa5, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m2, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m2, tu, ma ; RV64ZVE32F-NEXT: vfmv.s.f v8, fa5 ; RV64ZVE32F-NEXT: andi a2, a1, 2 ; RV64ZVE32F-NEXT: beqz a2, .LBB63_2 ; RV64ZVE32F-NEXT: .LBB63_10: # %cond.load1 ; RV64ZVE32F-NEXT: ld a2, 8(a0) ; RV64ZVE32F-NEXT: flh fa5, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, m1, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; RV64ZVE32F-NEXT: vfmv.s.f v9, fa5 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, m1, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 ; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: beqz a2, .LBB63_3 @@ -7193,7 +7226,7 @@ define <8 x half> @mgather_baseidx_v8i8_v8f16(ptr %base, <8 x i8> %idxs, <8 x i1 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flh fa5, 0(a2) 
-; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m2, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m2, tu, ma ; RV64ZVE32F-NEXT: vfmv.s.f v9, fa5 ; RV64ZVE32F-NEXT: .LBB64_2: # %else ; RV64ZVE32F-NEXT: andi a2, a1, 2 @@ -7277,8 +7310,9 @@ define <8 x half> @mgather_baseidx_v8i8_v8f16(ptr %base, <8 x i8> %idxs, <8 x i1 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flh fa5, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 5, e16, m1, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; RV64ZVE32F-NEXT: vfmv.s.f v8, fa5 +; RV64ZVE32F-NEXT: vsetivli zero, 5, e16, m1, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 4 ; RV64ZVE32F-NEXT: andi a2, a1, 32 ; RV64ZVE32F-NEXT: bnez a2, .LBB64_8 @@ -7344,7 +7378,7 @@ define <8 x half> @mgather_baseidx_sext_v8i8_v8f16(ptr %base, <8 x i8> %idxs, <8 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flh fa5, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m2, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m2, tu, ma ; RV64ZVE32F-NEXT: vfmv.s.f v9, fa5 ; RV64ZVE32F-NEXT: .LBB65_2: # %else ; RV64ZVE32F-NEXT: andi a2, a1, 2 @@ -7428,8 +7462,9 @@ define <8 x half> @mgather_baseidx_sext_v8i8_v8f16(ptr %base, <8 x i8> %idxs, <8 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flh fa5, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 5, e16, m1, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; RV64ZVE32F-NEXT: vfmv.s.f v8, fa5 +; RV64ZVE32F-NEXT: vsetivli zero, 5, e16, m1, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 4 ; RV64ZVE32F-NEXT: andi a2, a1, 32 ; RV64ZVE32F-NEXT: bnez a2, .LBB65_8 @@ -7495,7 +7530,7 @@ define <8 x half> @mgather_baseidx_zext_v8i8_v8f16(ptr %base, <8 x i8> %idxs, <8 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flh fa5, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m2, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m2, tu, ma ; 
RV64ZVE32F-NEXT: vfmv.s.f v9, fa5 ; RV64ZVE32F-NEXT: .LBB66_2: # %else ; RV64ZVE32F-NEXT: andi a2, a1, 2 @@ -7584,8 +7619,9 @@ define <8 x half> @mgather_baseidx_zext_v8i8_v8f16(ptr %base, <8 x i8> %idxs, <8 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flh fa5, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 5, e16, m1, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; RV64ZVE32F-NEXT: vfmv.s.f v8, fa5 +; RV64ZVE32F-NEXT: vsetivli zero, 5, e16, m1, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 4 ; RV64ZVE32F-NEXT: andi a2, a1, 32 ; RV64ZVE32F-NEXT: bnez a2, .LBB66_8 @@ -7648,7 +7684,7 @@ define <8 x half> @mgather_baseidx_v8f16(ptr %base, <8 x i16> %idxs, <8 x i1> %m ; RV64ZVE32F-NEXT: andi a2, a1, 1 ; RV64ZVE32F-NEXT: beqz a2, .LBB67_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load -; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m2, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m2, tu, ma ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 @@ -7839,14 +7875,15 @@ define <2 x float> @mgather_v2f32(<2 x ptr> %ptrs, <2 x i1> %m, <2 x float> %pas ; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB69_3: # %cond.load ; RV64ZVE32F-NEXT: flw fa5, 0(a0) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m4, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m4, tu, ma ; RV64ZVE32F-NEXT: vfmv.s.f v8, fa5 ; RV64ZVE32F-NEXT: andi a2, a2, 2 ; RV64ZVE32F-NEXT: beqz a2, .LBB69_2 ; RV64ZVE32F-NEXT: .LBB69_4: # %cond.load1 ; RV64ZVE32F-NEXT: flw fa5, 0(a1) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; RV64ZVE32F-NEXT: vfmv.s.f v9, fa5 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 ; RV64ZVE32F-NEXT: ret %v = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %m, <2 x float> %passthru) @@ -7890,15 +7927,16 @@ define <4 x float> @mgather_v4f32(<4 x ptr> %ptrs, 
<4 x i1> %m, <4 x float> %pas ; RV64ZVE32F-NEXT: .LBB70_5: # %cond.load ; RV64ZVE32F-NEXT: ld a2, 0(a0) ; RV64ZVE32F-NEXT: flw fa5, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m4, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m4, tu, ma ; RV64ZVE32F-NEXT: vfmv.s.f v8, fa5 ; RV64ZVE32F-NEXT: andi a2, a1, 2 ; RV64ZVE32F-NEXT: beqz a2, .LBB70_2 ; RV64ZVE32F-NEXT: .LBB70_6: # %cond.load1 ; RV64ZVE32F-NEXT: ld a2, 8(a0) ; RV64ZVE32F-NEXT: flw fa5, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; RV64ZVE32F-NEXT: vfmv.s.f v9, fa5 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 ; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: beqz a2, .LBB70_3 @@ -8021,15 +8059,16 @@ define <8 x float> @mgather_v8f32(<8 x ptr> %ptrs, <8 x i1> %m, <8 x float> %pas ; RV64ZVE32F-NEXT: .LBB73_9: # %cond.load ; RV64ZVE32F-NEXT: ld a2, 0(a0) ; RV64ZVE32F-NEXT: flw fa5, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m4, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m4, tu, ma ; RV64ZVE32F-NEXT: vfmv.s.f v8, fa5 ; RV64ZVE32F-NEXT: andi a2, a1, 2 ; RV64ZVE32F-NEXT: beqz a2, .LBB73_2 ; RV64ZVE32F-NEXT: .LBB73_10: # %cond.load1 ; RV64ZVE32F-NEXT: ld a2, 8(a0) ; RV64ZVE32F-NEXT: flw fa5, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; RV64ZVE32F-NEXT: vfmv.s.f v10, fa5 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v8, v10, 1 ; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: beqz a2, .LBB73_3 @@ -8115,7 +8154,7 @@ define <8 x float> @mgather_baseidx_v8i8_v8f32(ptr %base, <8 x i8> %idxs, <8 x i ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flw fa5, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m4, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m4, tu, ma ; RV64ZVE32F-NEXT: vfmv.s.f v10, fa5 ; 
RV64ZVE32F-NEXT: .LBB74_2: # %else ; RV64ZVE32F-NEXT: andi a2, a1, 2 @@ -8199,8 +8238,9 @@ define <8 x float> @mgather_baseidx_v8i8_v8f32(ptr %base, <8 x i8> %idxs, <8 x i ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flw fa5, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; RV64ZVE32F-NEXT: vfmv.s.f v8, fa5 +; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 4 ; RV64ZVE32F-NEXT: andi a2, a1, 32 ; RV64ZVE32F-NEXT: bnez a2, .LBB74_8 @@ -8265,7 +8305,7 @@ define <8 x float> @mgather_baseidx_sext_v8i8_v8f32(ptr %base, <8 x i8> %idxs, < ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flw fa5, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m4, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m4, tu, ma ; RV64ZVE32F-NEXT: vfmv.s.f v10, fa5 ; RV64ZVE32F-NEXT: .LBB75_2: # %else ; RV64ZVE32F-NEXT: andi a2, a1, 2 @@ -8349,8 +8389,9 @@ define <8 x float> @mgather_baseidx_sext_v8i8_v8f32(ptr %base, <8 x i8> %idxs, < ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flw fa5, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; RV64ZVE32F-NEXT: vfmv.s.f v8, fa5 +; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 4 ; RV64ZVE32F-NEXT: andi a2, a1, 32 ; RV64ZVE32F-NEXT: bnez a2, .LBB75_8 @@ -8418,7 +8459,7 @@ define <8 x float> @mgather_baseidx_zext_v8i8_v8f32(ptr %base, <8 x i8> %idxs, < ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flw fa5, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m4, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m4, tu, ma ; RV64ZVE32F-NEXT: vfmv.s.f v10, fa5 ; RV64ZVE32F-NEXT: .LBB76_2: # %else ; RV64ZVE32F-NEXT: andi a2, a1, 2 @@ -8507,8 +8548,9 @@ define <8 x float> 
@mgather_baseidx_zext_v8i8_v8f32(ptr %base, <8 x i8> %idxs, < ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flw fa5, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; RV64ZVE32F-NEXT: vfmv.s.f v8, fa5 +; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 4 ; RV64ZVE32F-NEXT: andi a2, a1, 32 ; RV64ZVE32F-NEXT: bnez a2, .LBB76_8 @@ -8577,7 +8619,7 @@ define <8 x float> @mgather_baseidx_v8i16_v8f32(ptr %base, <8 x i16> %idxs, <8 x ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flw fa5, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m4, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m4, tu, ma ; RV64ZVE32F-NEXT: vfmv.s.f v10, fa5 ; RV64ZVE32F-NEXT: .LBB77_2: # %else ; RV64ZVE32F-NEXT: andi a2, a1, 2 @@ -8661,8 +8703,9 @@ define <8 x float> @mgather_baseidx_v8i16_v8f32(ptr %base, <8 x i16> %idxs, <8 x ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flw fa5, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV64ZVE32F-NEXT: vfmv.s.f v8, fa5 +; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 4 ; RV64ZVE32F-NEXT: andi a2, a1, 32 ; RV64ZVE32F-NEXT: bnez a2, .LBB77_8 @@ -8728,7 +8771,7 @@ define <8 x float> @mgather_baseidx_sext_v8i16_v8f32(ptr %base, <8 x i16> %idxs, ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flw fa5, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m4, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m4, tu, ma ; RV64ZVE32F-NEXT: vfmv.s.f v10, fa5 ; RV64ZVE32F-NEXT: .LBB78_2: # %else ; RV64ZVE32F-NEXT: andi a2, a1, 2 @@ -8812,8 +8855,9 @@ define <8 x float> @mgather_baseidx_sext_v8i16_v8f32(ptr %base, <8 x i16> %idxs, ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: 
add a2, a0, a2 ; RV64ZVE32F-NEXT: flw fa5, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV64ZVE32F-NEXT: vfmv.s.f v8, fa5 +; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 4 ; RV64ZVE32F-NEXT: andi a2, a1, 32 ; RV64ZVE32F-NEXT: bnez a2, .LBB78_8 @@ -8882,7 +8926,7 @@ define <8 x float> @mgather_baseidx_zext_v8i16_v8f32(ptr %base, <8 x i16> %idxs, ; RV64ZVE32F-NEXT: slli a3, a3, 2 ; RV64ZVE32F-NEXT: add a3, a0, a3 ; RV64ZVE32F-NEXT: flw fa5, 0(a3) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m4, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m4, tu, ma ; RV64ZVE32F-NEXT: vfmv.s.f v10, fa5 ; RV64ZVE32F-NEXT: .LBB79_2: # %else ; RV64ZVE32F-NEXT: andi a3, a2, 2 @@ -8971,8 +9015,9 @@ define <8 x float> @mgather_baseidx_zext_v8i16_v8f32(ptr %base, <8 x i16> %idxs, ; RV64ZVE32F-NEXT: slli a3, a3, 2 ; RV64ZVE32F-NEXT: add a3, a0, a3 ; RV64ZVE32F-NEXT: flw fa5, 0(a3) -; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV64ZVE32F-NEXT: vfmv.s.f v8, fa5 +; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 4 ; RV64ZVE32F-NEXT: andi a3, a2, 32 ; RV64ZVE32F-NEXT: bnez a3, .LBB79_8 @@ -9035,7 +9080,7 @@ define <8 x float> @mgather_baseidx_v8f32(ptr %base, <8 x i32> %idxs, <8 x i1> % ; RV64ZVE32F-NEXT: andi a2, a1, 1 ; RV64ZVE32F-NEXT: beqz a2, .LBB80_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load -; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m4, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m4, tu, ma ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 @@ -11858,7 +11903,7 @@ define <16 x i8> @mgather_baseidx_v16i8(ptr %base, <16 x i8> %idxs, <16 x i1> %m ; RV64ZVE32F-NEXT: andi a2, a1, 1 ; RV64ZVE32F-NEXT: beqz a2, .LBB97_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load -; RV64ZVE32F-NEXT: vsetivli zero, 16, e8, mf2, 
tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e8, mf2, tu, ma ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lbu a2, 0(a2) @@ -12113,18 +12158,16 @@ define <32 x i8> @mgather_baseidx_v32i8(ptr %base, <32 x i8> %idxs, <32 x i1> %m ; RV64ZVE32F-NEXT: andi a2, a1, 1 ; RV64ZVE32F-NEXT: beqz a2, .LBB98_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load -; RV64ZVE32F-NEXT: vsetvli zero, zero, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e8, mf4, tu, ma ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lbu a2, 0(a2) -; RV64ZVE32F-NEXT: li a3, 32 -; RV64ZVE32F-NEXT: vsetvli zero, a3, e8, mf4, tu, ma ; RV64ZVE32F-NEXT: vmv.s.x v10, a2 ; RV64ZVE32F-NEXT: .LBB98_2: # %else ; RV64ZVE32F-NEXT: andi a2, a1, 2 ; RV64ZVE32F-NEXT: beqz a2, .LBB98_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v12 ; RV64ZVE32F-NEXT: add a2, a0, a2 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll index e6852c1b57510d..1dd74a7c9dd1b1 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll @@ -2387,7 +2387,7 @@ define void @mscatter_baseidx_v8i16_v8i32(<8 x i32> %val, ptr %base, <8 x i16> % ; RV64ZVE32F-NEXT: vmv.x.s a2, v11 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vse32.v v12, (a2) @@ -2525,7 +2525,7 @@ define void @mscatter_baseidx_sext_v8i16_v8i32(<8 x i32> %val, ptr %base, <8 x i ; RV64ZVE32F-NEXT: vmv.x.s a2, v11 ; 
RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vse32.v v12, (a2) @@ -2671,7 +2671,7 @@ define void @mscatter_baseidx_zext_v8i16_v8i32(<8 x i32> %val, ptr %base, <8 x i ; RV64ZVE32F-NEXT: and a3, a3, a1 ; RV64ZVE32F-NEXT: slli a3, a3, 2 ; RV64ZVE32F-NEXT: add a3, a0, a3 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vse32.v v12, (a3) @@ -7507,7 +7507,7 @@ define void @mscatter_baseidx_v8i16_v8f32(<8 x float> %val, ptr %base, <8 x i16> ; RV64ZVE32F-NEXT: vmv.x.s a2, v11 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vse32.v v12, (a2) @@ -7645,7 +7645,7 @@ define void @mscatter_baseidx_sext_v8i16_v8f32(<8 x float> %val, ptr %base, <8 x ; RV64ZVE32F-NEXT: vmv.x.s a2, v11 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vse32.v v12, (a2) @@ -7791,7 +7791,7 @@ define void @mscatter_baseidx_zext_v8i16_v8f32(<8 x float> %val, ptr %base, <8 x ; RV64ZVE32F-NEXT: and a3, a3, a1 ; RV64ZVE32F-NEXT: slli a3, a3, 2 ; RV64ZVE32F-NEXT: add a3, a0, a3 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV64ZVE32F-NEXT: 
vslidedown.vi v12, v8, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vse32.v v12, (a3) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-unaligned.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-unaligned.ll index 805b548b0cd185..eba3bd9d86dbb3 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-unaligned.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-unaligned.ll @@ -94,7 +94,7 @@ define <2 x i16> @mgather_v2i16_align1(<2 x ptr> %ptrs, <2 x i1> %m, <2 x i16> % ; RV32-SLOW-NEXT: lbu a1, 0(a1) ; RV32-SLOW-NEXT: slli a2, a2, 8 ; RV32-SLOW-NEXT: or a1, a2, a1 -; RV32-SLOW-NEXT: vsetivli zero, 2, e16, m2, tu, ma +; RV32-SLOW-NEXT: vsetvli zero, zero, e16, m2, tu, ma ; RV32-SLOW-NEXT: vmv.s.x v9, a1 ; RV32-SLOW-NEXT: andi a0, a0, 2 ; RV32-SLOW-NEXT: beqz a0, .LBB4_2 @@ -131,7 +131,7 @@ define <2 x i16> @mgather_v2i16_align1(<2 x ptr> %ptrs, <2 x i1> %m, <2 x i16> % ; RV64-SLOW-NEXT: lbu a1, 0(a1) ; RV64-SLOW-NEXT: slli a2, a2, 8 ; RV64-SLOW-NEXT: or a1, a2, a1 -; RV64-SLOW-NEXT: vsetivli zero, 2, e16, m2, tu, ma +; RV64-SLOW-NEXT: vsetvli zero, zero, e16, m2, tu, ma ; RV64-SLOW-NEXT: vmv.s.x v9, a1 ; RV64-SLOW-NEXT: andi a0, a0, 2 ; RV64-SLOW-NEXT: beqz a0, .LBB4_2 @@ -217,7 +217,7 @@ define <2 x i64> @mgather_v2i64_align4(<2 x ptr> %ptrs, <2 x i1> %m, <2 x i64> % ; RV64-SLOW-NEXT: vmv1r.v v8, v9 ; RV64-SLOW-NEXT: ret ; RV64-SLOW-NEXT: .LBB5_3: # %cond.load -; RV64-SLOW-NEXT: vsetivli zero, 2, e64, m8, tu, ma +; RV64-SLOW-NEXT: vsetvli zero, zero, e64, m8, tu, ma ; RV64-SLOW-NEXT: vmv.x.s a1, v8 ; RV64-SLOW-NEXT: lwu a2, 4(a1) ; RV64-SLOW-NEXT: lwu a1, 0(a1) diff --git a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-O0.ll b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-O0.ll new file mode 100644 index 00000000000000..aef18fcd06cd60 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-O0.ll @@ -0,0 +1,154 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv64 
-mattr=+m,+f,+d,+a,+c,+v \ +; RUN: -target-abi=lp64d -verify-machineinstrs -O0 < %s | FileCheck %s + +declare i64 @llvm.riscv.vsetvli(i64, i64, i64) +declare i64 @llvm.riscv.vsetvlimax(i64, i64) +declare @llvm.riscv.vfadd.nxv1f64.nxv1f64( + , + , + , + i64, i64) +declare @llvm.riscv.vle.mask.nxv1i64( + , + ptr, + , + i64, i64) + +define <2 x double> @fixed_length(<2 x double> %a, <2 x double> %b) nounwind { +; CHECK-LABEL: fixed_length: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vmv1r.v v10, v9 +; CHECK-NEXT: # kill: def $v11 killed $v10 +; CHECK-NEXT: # kill: def $v9 killed $v8 +; CHECK-NEXT: # implicit-def: $v9 +; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; CHECK-NEXT: vfadd.vv v9, v8, v10 +; CHECK-NEXT: # implicit-def: $v8 +; CHECK-NEXT: vfadd.vv v8, v9, v10 +; CHECK-NEXT: ret +entry: + %1 = fadd <2 x double> %a, %b + %2 = fadd <2 x double> %1, %b + ret <2 x double> %2 +} + +define @scalable( %a, %b) nounwind { +; CHECK-LABEL: scalable: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vmv1r.v v10, v9 +; CHECK-NEXT: # implicit-def: $v9 +; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma +; CHECK-NEXT: vfadd.vv v9, v8, v10 +; CHECK-NEXT: # implicit-def: $v8 +; CHECK-NEXT: vfadd.vv v8, v9, v10 +; CHECK-NEXT: ret +entry: + %1 = fadd %a, %b + %2 = fadd %1, %b + ret %2 +} + + +define @intrinsic_same_vlmax( %a, %b) nounwind { +; CHECK-LABEL: intrinsic_same_vlmax: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vmv1r.v v10, v9 +; CHECK-NEXT: vsetvli a0, zero, e64, m1, tu, ma +; CHECK-NEXT: # implicit-def: $v9 +; CHECK-NEXT: vfadd.vv v9, v8, v10 +; CHECK-NEXT: # implicit-def: $v8 +; CHECK-NEXT: vfadd.vv v8, v9, v10 +; CHECK-NEXT: ret +entry: + %0 = tail call i64 @llvm.riscv.vsetvlimax(i64 2, i64 7) + %1 = tail call @llvm.riscv.vfadd.nxv1f64.nxv1f64( + undef, + %a, + %b, + i64 7, i64 %0) + %2 = tail call @llvm.riscv.vfadd.nxv1f64.nxv1f64( + undef, + %1, + %b, + i64 7, i64 %0) + ret %2 +} + + +define @intrinsic_same_avl_imm( %a, %b) nounwind { +; CHECK-LABEL: 
intrinsic_same_avl_imm: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vmv1r.v v10, v9 +; CHECK-NEXT: vsetivli a0, 2, e64, m1, tu, ma +; CHECK-NEXT: # implicit-def: $v9 +; CHECK-NEXT: vfadd.vv v9, v8, v10 +; CHECK-NEXT: # implicit-def: $v8 +; CHECK-NEXT: vfadd.vv v8, v9, v10 +; CHECK-NEXT: ret +entry: + %0 = tail call i64 @llvm.riscv.vsetvli(i64 2, i64 2, i64 7) + %1 = tail call @llvm.riscv.vfadd.nxv1f64.nxv1f64( + undef, + %a, + %b, + i64 7, i64 %0) + %2 = tail call @llvm.riscv.vfadd.nxv1f64.nxv1f64( + undef, + %1, + %b, + i64 7, i64 %0) + ret %2 +} + +define @intrinsic_same_avl_reg(i64 %avl, %a, %b) nounwind { +; CHECK-LABEL: intrinsic_same_avl_reg: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vmv1r.v v10, v9 +; CHECK-NEXT: vsetvli a0, a0, e64, m1, tu, ma +; CHECK-NEXT: # implicit-def: $v9 +; CHECK-NEXT: vfadd.vv v9, v8, v10 +; CHECK-NEXT: # implicit-def: $v8 +; CHECK-NEXT: vfadd.vv v8, v9, v10 +; CHECK-NEXT: ret +entry: + %0 = tail call i64 @llvm.riscv.vsetvli(i64 %avl, i64 2, i64 7) + %1 = tail call @llvm.riscv.vfadd.nxv1f64.nxv1f64( + undef, + %a, + %b, + i64 7, i64 %0) + %2 = tail call @llvm.riscv.vfadd.nxv1f64.nxv1f64( + undef, + %1, + %b, + i64 7, i64 %0) + ret %2 +} + +define @intrinsic_diff_avl_reg(i64 %avl, i64 %avl2, %a, %b) nounwind { +; CHECK-LABEL: intrinsic_diff_avl_reg: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vmv1r.v v10, v9 +; CHECK-NEXT: vsetvli a0, a0, e64, m1, tu, ma +; CHECK-NEXT: # implicit-def: $v9 +; CHECK-NEXT: vfadd.vv v9, v8, v10 +; CHECK-NEXT: vsetvli a0, a1, e64, m1, tu, ma +; CHECK-NEXT: # implicit-def: $v8 +; CHECK-NEXT: vfadd.vv v8, v9, v10 +; CHECK-NEXT: ret +entry: + %0 = tail call i64 @llvm.riscv.vsetvli(i64 %avl, i64 2, i64 7) + %1 = tail call @llvm.riscv.vfadd.nxv1f64.nxv1f64( + undef, + %a, + %b, + i64 7, i64 %0) + %2 = tail call i64 @llvm.riscv.vsetvli(i64 %avl2, i64 2, i64 7) + %3 = tail call @llvm.riscv.vfadd.nxv1f64.nxv1f64( + undef, + %1, + %b, + i64 7, i64 %2) + ret %3 +} diff --git 
a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.mir b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.mir index 295d4c57a1be5c..4091d1711b5841 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.mir +++ b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.mir @@ -976,12 +976,12 @@ body: | ; CHECK: bb.0: ; CHECK-NEXT: successors: %bb.1(0x80000000) ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: dead $x0 = PseudoVSETIVLI 1, 216 /* e64, m1, ta, ma */, implicit-def $vl, implicit-def $vtype ; CHECK-NEXT: PseudoBR %bb.1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: ; CHECK-NEXT: successors: %bb.1(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: dead $x0 = PseudoVSETIVLI 1, 216 /* e64, m1, ta, ma */, implicit-def $vl, implicit-def $vtype ; CHECK-NEXT: dead %x:gpr = PseudoVMV_X_S undef $noreg, 6 /* e64 */, implicit $vtype ; CHECK-NEXT: PseudoBR %bb.1 bb.0: diff --git a/llvm/test/DebugInfo/Generic/sroa-extract-bits.ll b/llvm/test/DebugInfo/Generic/sroa-extract-bits.ll index a7e2a203b2e611..fe41b3e60f442b 100644 --- a/llvm/test/DebugInfo/Generic/sroa-extract-bits.ll +++ b/llvm/test/DebugInfo/Generic/sroa-extract-bits.ll @@ -10,11 +10,11 @@ define i8 @test1(i32 %arg) { ; CHECK-SAME: i32 [[ARG:%.*]]) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[PTR_SROA_0_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[ARG]] to i8 -; CHECK-NEXT: tail call void @llvm.dbg.value(metadata i8 [[PTR_SROA_0_0_EXTRACT_TRUNC]], metadata [[META2:![0-9]+]], metadata !DIExpression(DW_OP_LLVM_extract_bits_zext, 0, 8)), !dbg [[DBG7:![0-9]+]] +; CHECK-NEXT: #dbg_value(i8 [[PTR_SROA_0_0_EXTRACT_TRUNC]], [[META2:![0-9]+]], !DIExpression(DW_OP_LLVM_extract_bits_zext, 0, 8), [[META7:![0-9]+]]) ; CHECK-NEXT: [[PTR_SROA_2_0_EXTRACT_SHIFT:%.*]] = lshr i32 [[ARG]], 8 ; CHECK-NEXT: [[PTR_SROA_2_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[PTR_SROA_2_0_EXTRACT_SHIFT]] to i24 -; CHECK-NEXT: tail call void @llvm.dbg.value(metadata i24 [[PTR_SROA_2_0_EXTRACT_TRUNC]], metadata [[META8:![0-9]+]], metadata 
!DIExpression(DW_OP_LLVM_extract_bits_zext, 8, 16)), !dbg [[DBG7]] -; CHECK-NEXT: tail call void @llvm.dbg.value(metadata i24 [[PTR_SROA_2_0_EXTRACT_TRUNC]], metadata [[META9:![0-9]+]], metadata !DIExpression(DW_OP_LLVM_extract_bits_sext, 0, 8)), !dbg [[DBG7]] +; CHECK-NEXT: #dbg_value(i24 [[PTR_SROA_2_0_EXTRACT_TRUNC]], [[META8:![0-9]+]], !DIExpression(DW_OP_LLVM_extract_bits_zext, 8, 16), [[META7]]) +; CHECK-NEXT: #dbg_value(i24 [[PTR_SROA_2_0_EXTRACT_TRUNC]], [[META9:![0-9]+]], !DIExpression(DW_OP_LLVM_extract_bits_sext, 0, 8), [[META7]]) ; CHECK-NEXT: ret i8 [[PTR_SROA_0_0_EXTRACT_TRUNC]] ; entry: @@ -33,14 +33,14 @@ define i8 @test2(i32 %arg1, i8 %arg2) { ; CHECK-SAME: i32 [[ARG1:%.*]], i8 [[ARG2:%.*]]) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[PTR_SROA_0_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[ARG1]] to i8 -; CHECK-NEXT: tail call void @llvm.dbg.value(metadata i8 [[PTR_SROA_0_0_EXTRACT_TRUNC]], metadata [[META2]], metadata !DIExpression(DW_OP_LLVM_extract_bits_zext, 0, 8)), !dbg [[DBG7]] +; CHECK-NEXT: #dbg_value(i8 [[PTR_SROA_0_0_EXTRACT_TRUNC]], [[META2]], !DIExpression(DW_OP_LLVM_extract_bits_zext, 0, 8), [[META7]]) ; CHECK-NEXT: [[PTR_SROA_2_0_EXTRACT_SHIFT:%.*]] = lshr i32 [[ARG1]], 8 ; CHECK-NEXT: [[PTR_SROA_2_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[PTR_SROA_2_0_EXTRACT_SHIFT]] to i16 -; CHECK-NEXT: tail call void @llvm.dbg.value(metadata i16 [[PTR_SROA_2_0_EXTRACT_TRUNC]], metadata [[META9]], metadata !DIExpression(DW_OP_LLVM_extract_bits_sext, 0, 16)), !dbg [[DBG7]] +; CHECK-NEXT: #dbg_value(i16 [[PTR_SROA_2_0_EXTRACT_TRUNC]], [[META9]], !DIExpression(DW_OP_LLVM_extract_bits_sext, 0, 16), [[META7]]) ; CHECK-NEXT: [[PTR_SROA_21_0_EXTRACT_SHIFT:%.*]] = lshr i32 [[ARG1]], 24 ; CHECK-NEXT: [[PTR_SROA_21_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[PTR_SROA_21_0_EXTRACT_SHIFT]] to i8 -; CHECK-NEXT: tail call void @llvm.dbg.value(metadata i8 [[PTR_SROA_21_0_EXTRACT_TRUNC]], metadata [[META8]], metadata !DIExpression(DW_OP_LLVM_extract_bits_zext, 0, 8)), !dbg [[DBG7]] -; 
CHECK-NEXT: tail call void @llvm.dbg.value(metadata i8 [[ARG2]], metadata [[META8]], metadata !DIExpression(DW_OP_LLVM_extract_bits_zext, 0, 8)), !dbg [[DBG7]] +; CHECK-NEXT: #dbg_value(i8 [[PTR_SROA_21_0_EXTRACT_TRUNC]], [[META8]], !DIExpression(DW_OP_LLVM_extract_bits_zext, 0, 8), [[META7]]) +; CHECK-NEXT: #dbg_value(i8 [[ARG2]], [[META8]], !DIExpression(DW_OP_LLVM_extract_bits_zext, 0, 8), [[META7]]) ; CHECK-NEXT: ret i8 [[PTR_SROA_0_0_EXTRACT_TRUNC]] ; entry: @@ -81,10 +81,10 @@ define i16 @test4(i32 %arg) { ; CHECK-SAME: i32 [[ARG:%.*]]) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[PTR_SROA_0_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[ARG]] to i16 -; CHECK-NEXT: tail call void @llvm.dbg.value(metadata i16 [[PTR_SROA_0_0_EXTRACT_TRUNC]], metadata [[META2]], metadata !DIExpression(DW_OP_LLVM_extract_bits_zext, 0, 8)), !dbg [[DBG7]] +; CHECK-NEXT: #dbg_value(i16 [[PTR_SROA_0_0_EXTRACT_TRUNC]], [[META2]], !DIExpression(DW_OP_LLVM_extract_bits_zext, 0, 8), [[META7]]) ; CHECK-NEXT: [[PTR_SROA_2_0_EXTRACT_SHIFT:%.*]] = lshr i32 [[ARG]], 16 ; CHECK-NEXT: [[PTR_SROA_2_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[PTR_SROA_2_0_EXTRACT_SHIFT]] to i16 -; CHECK-NEXT: tail call void @llvm.dbg.value(metadata i16 [[PTR_SROA_2_0_EXTRACT_TRUNC]], metadata [[META8]], metadata !DIExpression(DW_OP_LLVM_extract_bits_zext, 8, 8)), !dbg [[DBG7]] +; CHECK-NEXT: #dbg_value(i16 [[PTR_SROA_2_0_EXTRACT_TRUNC]], [[META8]], !DIExpression(DW_OP_LLVM_extract_bits_zext, 8, 8), [[META7]]) ; CHECK-NEXT: ret i16 [[PTR_SROA_0_0_EXTRACT_TRUNC]] ; entry: @@ -104,11 +104,11 @@ define i8 @test5(i32 %arg) { ; CHECK-SAME: i32 [[ARG:%.*]]) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[PTR_SROA_0_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[ARG]] to i8 -; CHECK-NEXT: tail call void @llvm.dbg.value(metadata i8 [[PTR_SROA_0_0_EXTRACT_TRUNC]], metadata [[META11:![0-9]+]], metadata !DIExpression()), !dbg [[DBG7]] +; CHECK-NEXT: #dbg_value(i8 [[PTR_SROA_0_0_EXTRACT_TRUNC]], [[META11:![0-9]+]], !DIExpression(), [[META7]]) ; CHECK-NEXT: 
[[PTR_SROA_2_0_EXTRACT_SHIFT:%.*]] = lshr i32 [[ARG]], 8 ; CHECK-NEXT: [[PTR_SROA_2_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[PTR_SROA_2_0_EXTRACT_SHIFT]] to i24 -; CHECK-NEXT: tail call void @llvm.dbg.value(metadata i24 [[PTR_SROA_2_0_EXTRACT_TRUNC]], metadata [[META8]], metadata !DIExpression(DW_OP_LLVM_extract_bits_zext, 8, 8)), !dbg [[DBG7]] -; CHECK-NEXT: tail call void @llvm.dbg.value(metadata i24 [[PTR_SROA_2_0_EXTRACT_TRUNC]], metadata [[META9]], metadata !DIExpression(DW_OP_LLVM_extract_bits_sext, 0, 8)), !dbg [[DBG7]] +; CHECK-NEXT: #dbg_value(i24 [[PTR_SROA_2_0_EXTRACT_TRUNC]], [[META8]], !DIExpression(DW_OP_LLVM_extract_bits_zext, 8, 8), [[META7]]) +; CHECK-NEXT: #dbg_value(i24 [[PTR_SROA_2_0_EXTRACT_TRUNC]], [[META9]], !DIExpression(DW_OP_LLVM_extract_bits_sext, 0, 8), [[META7]]) ; CHECK-NEXT: ret i8 [[PTR_SROA_0_0_EXTRACT_TRUNC]] ; entry: @@ -127,14 +127,14 @@ define i8 @test6(i32 %arg1, i8 %arg2) { ; CHECK-SAME: i32 [[ARG1:%.*]], i8 [[ARG2:%.*]]) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[PTR_SROA_0_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[ARG1]] to i8 -; CHECK-NEXT: tail call void @llvm.dbg.value(metadata i8 undef, metadata [[META2]], metadata !DIExpression(DW_OP_LLVM_extract_bits_sext, 0, 8)), !dbg [[DBG7]] +; CHECK-NEXT: #dbg_value(i8 undef, [[META2]], !DIExpression(DW_OP_LLVM_extract_bits_sext, 0, 8), [[META7]]) ; CHECK-NEXT: [[PTR_SROA_2_0_EXTRACT_SHIFT:%.*]] = lshr i32 [[ARG1]], 8 ; CHECK-NEXT: [[PTR_SROA_2_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[PTR_SROA_2_0_EXTRACT_SHIFT]] to i16 -; CHECK-NEXT: tail call void @llvm.dbg.value(metadata i16 undef, metadata [[META9]], metadata !DIExpression(DW_OP_LLVM_extract_bits_zext, 0, 16)), !dbg [[DBG7]] +; CHECK-NEXT: #dbg_value(i16 undef, [[META9]], !DIExpression(DW_OP_LLVM_extract_bits_zext, 0, 16), [[META7]]) ; CHECK-NEXT: [[PTR_SROA_21_0_EXTRACT_SHIFT:%.*]] = lshr i32 [[ARG1]], 24 ; CHECK-NEXT: [[PTR_SROA_21_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[PTR_SROA_21_0_EXTRACT_SHIFT]] to i8 -; CHECK-NEXT: tail call void 
@llvm.dbg.value(metadata i8 undef, metadata [[META8]], metadata !DIExpression(DW_OP_LLVM_extract_bits_sext, 0, 8)), !dbg [[DBG7]] -; CHECK-NEXT: tail call void @llvm.dbg.value(metadata i8 undef, metadata [[META8]], metadata !DIExpression(DW_OP_LLVM_extract_bits_sext, 0, 8)), !dbg [[DBG7]] +; CHECK-NEXT: #dbg_value(i8 undef, [[META8]], !DIExpression(DW_OP_LLVM_extract_bits_sext, 0, 8), [[META7]]) +; CHECK-NEXT: #dbg_value(i8 undef, [[META8]], !DIExpression(DW_OP_LLVM_extract_bits_sext, 0, 8), [[META7]]) ; CHECK-NEXT: ret i8 [[PTR_SROA_0_0_EXTRACT_TRUNC]] ; entry: @@ -196,7 +196,7 @@ entry: ; CHECK: [[META4]] = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: [[META5:![0-9]+]], isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug) ; CHECK: [[META5]] = !DIFile(filename: "dbg-bit-piece.cpp", directory: "") ; CHECK: [[META6]] = !DIBasicType(name: "unsigned int", size: 32, encoding: DW_ATE_unsigned) -; CHECK: [[DBG7]] = !DILocation(line: 0, scope: [[META3]]) +; CHECK: [[META7]] = !DILocation(line: 0, scope: [[META3]]) ; CHECK: [[META8]] = !DILocalVariable(name: "z", scope: [[META3]], type: [[META6]]) ; CHECK: [[META9]] = !DILocalVariable(name: "y", scope: [[META3]], type: [[META10:![0-9]+]]) ; CHECK: [[META10]] = !DIBasicType(name: "signed int", size: 32, encoding: DW_ATE_signed) diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll index 3868f7bbc037a3..533f0f2691045b 100644 --- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll +++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll @@ -4418,99 +4418,15 @@ define double @test_atomicrmw_fadd_f64_region_ret(ptr addrspace(2) %ptr, double } define <2 x half> @test_atomicrmw_fadd_v2f16_flat_agent(ptr %ptr, <2 x half> %value) { -; ALL-LABEL: @test_atomicrmw_fadd_v2f16_flat_agent( -; ALL-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[PTR:%.*]], align 4 -; ALL-NEXT: br label 
[[ATOMICRMW_START:%.*]] -; ALL: atomicrmw.start: -; ALL-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; ALL-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] -; ALL-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 -; ALL-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 -; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; ALL-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> -; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; ALL: atomicrmw.end: -; ALL-NEXT: ret <2 x half> [[TMP5]] -; - %res = atomicrmw fadd ptr %ptr, <2 x half> %value syncscope("agent") seq_cst - ret <2 x half> %res -} - -define void @test_atomicrmw_fadd_v2f16_flat_agent_noret(ptr %ptr, <2 x half> %value) { -; ALL-LABEL: @test_atomicrmw_fadd_v2f16_flat_agent_noret( -; ALL-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[PTR:%.*]], align 4 -; ALL-NEXT: br label [[ATOMICRMW_START:%.*]] -; ALL: atomicrmw.start: -; ALL-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; ALL-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] -; ALL-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 -; ALL-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 -; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; ALL-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> -; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; ALL: atomicrmw.end: -; ALL-NEXT: ret void -; - %res = 
atomicrmw fadd ptr %ptr, <2 x half> %value syncscope("agent") seq_cst - ret void -} - -define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent(ptr addrspace(1) %ptr, <2 x half> %value) { -; ALL-LABEL: @test_atomicrmw_fadd_v2f16_global_agent( -; ALL-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR:%.*]], align 4 -; ALL-NEXT: br label [[ATOMICRMW_START:%.*]] -; ALL: atomicrmw.start: -; ALL-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; ALL-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] -; ALL-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 -; ALL-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 -; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; ALL-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> -; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; ALL: atomicrmw.end: -; ALL-NEXT: ret <2 x half> [[TMP5]] -; - %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst - ret <2 x half> %res -} - -define void @test_atomicrmw_fadd_v2f16_flat_global_noret(ptr addrspace(1) %ptr, <2 x half> %value) { -; ALL-LABEL: @test_atomicrmw_fadd_v2f16_flat_global_noret( -; ALL-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR:%.*]], align 4 -; ALL-NEXT: br label [[ATOMICRMW_START:%.*]] -; ALL: atomicrmw.start: -; ALL-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; ALL-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] -; ALL-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 -; ALL-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 
[[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 -; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; ALL-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> -; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; ALL: atomicrmw.end: -; ALL-NEXT: ret void -; - %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst - ret void -} - -define <2 x half> @test_atomicrmw_fadd_v2f16_local_agent(ptr addrspace(3) %ptr, <2 x half> %value) { -; CI-LABEL: @test_atomicrmw_fadd_v2f16_local_agent( -; CI-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(3) [[PTR:%.*]], align 4 +; CI-LABEL: @test_atomicrmw_fadd_v2f16_flat_agent( +; CI-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[PTR:%.*]], align 4 ; CI-NEXT: br label [[ATOMICRMW_START:%.*]] ; CI: atomicrmw.start: ; CI-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] ; CI-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] ; CI-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; CI-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 ; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -4518,15 +4434,15 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_local_agent(ptr addrspace(3) %ptr, ; CI: atomicrmw.end: ; CI-NEXT: ret <2 x half> [[TMP5]] ; -; GFX9-LABEL: @test_atomicrmw_fadd_v2f16_local_agent( -; GFX9-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(3) [[PTR:%.*]], align 4 +; GFX9-LABEL: 
@test_atomicrmw_fadd_v2f16_flat_agent( +; GFX9-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[PTR:%.*]], align 4 ; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]] ; GFX9: atomicrmw.start: ; GFX9-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] ; GFX9-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] ; GFX9-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX9-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 ; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX9-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -4534,15 +4450,15 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_local_agent(ptr addrspace(3) %ptr, ; GFX9: atomicrmw.end: ; GFX9-NEXT: ret <2 x half> [[TMP5]] ; -; GFX908-LABEL: @test_atomicrmw_fadd_v2f16_local_agent( -; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(3) [[PTR:%.*]], align 4 +; GFX908-LABEL: @test_atomicrmw_fadd_v2f16_flat_agent( +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[PTR:%.*]], align 4 ; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] ; GFX908: atomicrmw.start: ; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] ; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] ; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] 
syncscope("agent") seq_cst seq_cst, align 4 ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -4550,15 +4466,15 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_local_agent(ptr addrspace(3) %ptr, ; GFX908: atomicrmw.end: ; GFX908-NEXT: ret <2 x half> [[TMP5]] ; -; GFX90A-LABEL: @test_atomicrmw_fadd_v2f16_local_agent( -; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(3) [[PTR:%.*]], align 4 +; GFX90A-LABEL: @test_atomicrmw_fadd_v2f16_flat_agent( +; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[PTR:%.*]], align 4 ; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] ; GFX90A: atomicrmw.start: ; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] ; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] ; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 ; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -4566,19 +4482,19 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_local_agent(ptr addrspace(3) %ptr, ; GFX90A: atomicrmw.end: ; GFX90A-NEXT: ret <2 x half> [[TMP5]] ; -; GFX940-LABEL: @test_atomicrmw_fadd_v2f16_local_agent( -; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(3) [[PTR:%.*]], <2 x half> [[VALUE:%.*]] syncscope("agent") seq_cst, align 4 +; GFX940-LABEL: @test_atomicrmw_fadd_v2f16_flat_agent( +; GFX940-NEXT: [[RES:%.*]] = 
atomicrmw fadd ptr [[PTR:%.*]], <2 x half> [[VALUE:%.*]] syncscope("agent") seq_cst, align 4 ; GFX940-NEXT: ret <2 x half> [[RES]] ; -; GFX11-LABEL: @test_atomicrmw_fadd_v2f16_local_agent( -; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(3) [[PTR:%.*]], align 4 +; GFX11-LABEL: @test_atomicrmw_fadd_v2f16_flat_agent( +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[PTR:%.*]], align 4 ; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] ; GFX11: atomicrmw.start: ; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] ; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] ; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 ; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -4586,20 +4502,20 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_local_agent(ptr addrspace(3) %ptr, ; GFX11: atomicrmw.end: ; GFX11-NEXT: ret <2 x half> [[TMP5]] ; - %res = atomicrmw fadd ptr addrspace(3) %ptr, <2 x half> %value syncscope("agent") seq_cst + %res = atomicrmw fadd ptr %ptr, <2 x half> %value syncscope("agent") seq_cst ret <2 x half> %res } -define void @test_atomicrmw_fadd_v2f16_flat_local_noret(ptr addrspace(3) %ptr, <2 x half> %value) { -; CI-LABEL: @test_atomicrmw_fadd_v2f16_flat_local_noret( -; CI-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(3) [[PTR:%.*]], align 4 +define void @test_atomicrmw_fadd_v2f16_flat_agent_noret(ptr %ptr, <2 x half> %value) { +; CI-LABEL: @test_atomicrmw_fadd_v2f16_flat_agent_noret( +; 
CI-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[PTR:%.*]], align 4 ; CI-NEXT: br label [[ATOMICRMW_START:%.*]] ; CI: atomicrmw.start: ; CI-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] ; CI-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] ; CI-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; CI-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 ; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -4607,15 +4523,15 @@ define void @test_atomicrmw_fadd_v2f16_flat_local_noret(ptr addrspace(3) %ptr, < ; CI: atomicrmw.end: ; CI-NEXT: ret void ; -; GFX9-LABEL: @test_atomicrmw_fadd_v2f16_flat_local_noret( -; GFX9-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(3) [[PTR:%.*]], align 4 +; GFX9-LABEL: @test_atomicrmw_fadd_v2f16_flat_agent_noret( +; GFX9-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[PTR:%.*]], align 4 ; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]] ; GFX9: atomicrmw.start: ; GFX9-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] ; GFX9-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] ; GFX9-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX9-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 ; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } 
[[TMP4]], 1 ; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX9-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -4623,15 +4539,15 @@ define void @test_atomicrmw_fadd_v2f16_flat_local_noret(ptr addrspace(3) %ptr, < ; GFX9: atomicrmw.end: ; GFX9-NEXT: ret void ; -; GFX908-LABEL: @test_atomicrmw_fadd_v2f16_flat_local_noret( -; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(3) [[PTR:%.*]], align 4 +; GFX908-LABEL: @test_atomicrmw_fadd_v2f16_flat_agent_noret( +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[PTR:%.*]], align 4 ; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] ; GFX908: atomicrmw.start: ; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] ; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] ; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -4639,15 +4555,15 @@ define void @test_atomicrmw_fadd_v2f16_flat_local_noret(ptr addrspace(3) %ptr, < ; GFX908: atomicrmw.end: ; GFX908-NEXT: ret void ; -; GFX90A-LABEL: @test_atomicrmw_fadd_v2f16_flat_local_noret( -; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(3) [[PTR:%.*]], align 4 +; GFX90A-LABEL: @test_atomicrmw_fadd_v2f16_flat_agent_noret( +; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[PTR:%.*]], align 4 ; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] ; GFX90A: atomicrmw.start: ; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x half> 
[ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] ; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] ; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 ; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -4655,19 +4571,19 @@ define void @test_atomicrmw_fadd_v2f16_flat_local_noret(ptr addrspace(3) %ptr, < ; GFX90A: atomicrmw.end: ; GFX90A-NEXT: ret void ; -; GFX940-LABEL: @test_atomicrmw_fadd_v2f16_flat_local_noret( -; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(3) [[PTR:%.*]], <2 x half> [[VALUE:%.*]] syncscope("agent") seq_cst, align 4 +; GFX940-LABEL: @test_atomicrmw_fadd_v2f16_flat_agent_noret( +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[PTR:%.*]], <2 x half> [[VALUE:%.*]] syncscope("agent") seq_cst, align 4 ; GFX940-NEXT: ret void ; -; GFX11-LABEL: @test_atomicrmw_fadd_v2f16_flat_local_noret( -; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(3) [[PTR:%.*]], align 4 +; GFX11-LABEL: @test_atomicrmw_fadd_v2f16_flat_agent_noret( +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[PTR:%.*]], align 4 ; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] ; GFX11: atomicrmw.start: ; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] ; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] ; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX11-NEXT: [[TMP4:%.*]] = 
cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 ; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -4675,225 +4591,1339 @@ define void @test_atomicrmw_fadd_v2f16_flat_local_noret(ptr addrspace(3) %ptr, < ; GFX11: atomicrmw.end: ; GFX11-NEXT: ret void ; - %res = atomicrmw fadd ptr addrspace(3) %ptr, <2 x half> %value syncscope("agent") seq_cst - ret void -} - -define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_flat_agent(ptr %ptr, <2 x bfloat> %value) { -; ALL-LABEL: @test_atomicrmw_fadd_v2bf16_flat_agent( -; ALL-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr [[PTR:%.*]], align 4 -; ALL-NEXT: br label [[ATOMICRMW_START:%.*]] -; ALL: atomicrmw.start: -; ALL-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; ALL-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] -; ALL-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 -; ALL-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 -; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; ALL-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> -; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; ALL: atomicrmw.end: -; ALL-NEXT: ret <2 x bfloat> [[TMP5]] -; - %res = atomicrmw fadd ptr %ptr, <2 x bfloat> %value syncscope("agent") seq_cst - ret <2 x bfloat> %res -} - -define void @test_atomicrmw_fadd_v2bf16_flat_agent_noret(ptr %ptr, <2 x bfloat> %value) { -; ALL-LABEL: 
@test_atomicrmw_fadd_v2bf16_flat_agent_noret( -; ALL-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr [[PTR:%.*]], align 4 -; ALL-NEXT: br label [[ATOMICRMW_START:%.*]] -; ALL: atomicrmw.start: -; ALL-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; ALL-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] -; ALL-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 -; ALL-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 -; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; ALL-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> -; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; ALL: atomicrmw.end: -; ALL-NEXT: ret void -; - %res = atomicrmw fadd ptr %ptr, <2 x bfloat> %value syncscope("agent") seq_cst - ret void -} - -define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent(ptr addrspace(1) %ptr, <2 x bfloat> %value) { -; ALL-LABEL: @test_atomicrmw_fadd_v2bf16_global_agent( -; ALL-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR:%.*]], align 4 -; ALL-NEXT: br label [[ATOMICRMW_START:%.*]] -; ALL: atomicrmw.start: -; ALL-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; ALL-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] -; ALL-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 -; ALL-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 -; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; ALL-NEXT: [[TMP5]] = bitcast 
i32 [[NEWLOADED]] to <2 x bfloat> -; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; ALL: atomicrmw.end: -; ALL-NEXT: ret <2 x bfloat> [[TMP5]] -; - %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst - ret <2 x bfloat> %res -} - -define void @test_atomicrmw_fadd_v2bf16_flat_global_noret(ptr addrspace(1) %ptr, <2 x bfloat> %value) { -; ALL-LABEL: @test_atomicrmw_fadd_v2bf16_flat_global_noret( -; ALL-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR:%.*]], align 4 -; ALL-NEXT: br label [[ATOMICRMW_START:%.*]] -; ALL: atomicrmw.start: -; ALL-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; ALL-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] -; ALL-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 -; ALL-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 -; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; ALL-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> -; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; ALL: atomicrmw.end: -; ALL-NEXT: ret void -; - %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst + %res = atomicrmw fadd ptr %ptr, <2 x half> %value syncscope("agent") seq_cst ret void } -define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_local_agent(ptr addrspace(3) %ptr, <2 x bfloat> %value) { -; CI-LABEL: @test_atomicrmw_fadd_v2bf16_local_agent( -; CI-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(3) [[PTR:%.*]], align 4 +define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent(ptr addrspace(1) %ptr, <2 x half> %value) { +; CI-LABEL: 
@test_atomicrmw_fadd_v2f16_global_agent( +; CI-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR:%.*]], align 4 ; CI-NEXT: br label [[ATOMICRMW_START:%.*]] ; CI: atomicrmw.start: -; CI-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; CI-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] -; CI-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 -; CI-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; CI-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; CI-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] +; CI-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; CI-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 ; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> ; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] ; CI: atomicrmw.end: -; CI-NEXT: ret <2 x bfloat> [[TMP5]] +; CI-NEXT: ret <2 x half> [[TMP5]] ; -; GFX9-LABEL: @test_atomicrmw_fadd_v2bf16_local_agent( -; GFX9-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(3) [[PTR:%.*]], align 4 +; GFX9-LABEL: @test_atomicrmw_fadd_v2f16_global_agent( +; GFX9-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR:%.*]], align 4 ; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]] ; GFX9: atomicrmw.start: -; GFX9-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; GFX9-NEXT: 
[[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] -; GFX9-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 -; GFX9-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX9-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX9-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] +; GFX9-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX9-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 ; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; GFX9-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX9-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> ; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] ; GFX9: atomicrmw.end: -; GFX9-NEXT: ret <2 x bfloat> [[TMP5]] +; GFX9-NEXT: ret <2 x half> [[TMP5]] ; -; GFX908-LABEL: @test_atomicrmw_fadd_v2bf16_local_agent( -; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(3) [[PTR:%.*]], align 4 +; GFX908-LABEL: @test_atomicrmw_fadd_v2f16_global_agent( +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR:%.*]], align 4 ; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] ; GFX908: atomicrmw.start: -; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] -; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 -; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 
[[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> ; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] ; GFX908: atomicrmw.end: -; GFX908-NEXT: ret <2 x bfloat> [[TMP5]] +; GFX908-NEXT: ret <2 x half> [[TMP5]] ; -; GFX90A-LABEL: @test_atomicrmw_fadd_v2bf16_local_agent( -; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(3) [[PTR:%.*]], align 4 -; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] -; GFX90A: atomicrmw.start: -; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] -; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 -; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 -; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> -; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; GFX90A: 
atomicrmw.end: -; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]] +; GFX90A-LABEL: @test_atomicrmw_fadd_v2f16_global_agent( +; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]] syncscope("agent") seq_cst, align 4 +; GFX90A-NEXT: ret <2 x half> [[RES]] ; -; GFX940-LABEL: @test_atomicrmw_fadd_v2bf16_local_agent( -; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(3) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]] syncscope("agent") seq_cst, align 4 -; GFX940-NEXT: ret <2 x bfloat> [[RES]] +; GFX940-LABEL: @test_atomicrmw_fadd_v2f16_global_agent( +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]] syncscope("agent") seq_cst, align 4 +; GFX940-NEXT: ret <2 x half> [[RES]] ; -; GFX11-LABEL: @test_atomicrmw_fadd_v2bf16_local_agent( -; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(3) [[PTR:%.*]], align 4 +; GFX11-LABEL: @test_atomicrmw_fadd_v2f16_global_agent( +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR:%.*]], align 4 ; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] ; GFX11: atomicrmw.start: -; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] -; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 -; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] 
syncscope("agent") seq_cst seq_cst, align 4 ; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> ; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] ; GFX11: atomicrmw.end: -; GFX11-NEXT: ret <2 x bfloat> [[TMP5]] +; GFX11-NEXT: ret <2 x half> [[TMP5]] ; - %res = atomicrmw fadd ptr addrspace(3) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst - ret <2 x bfloat> %res + %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst + ret <2 x half> %res } -define void @test_atomicrmw_fadd_v2bf16_flat_local_noret(ptr addrspace(3) %ptr, <2 x bfloat> %value) { -; CI-LABEL: @test_atomicrmw_fadd_v2bf16_flat_local_noret( -; CI-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(3) [[PTR:%.*]], align 4 +define void @test_atomicrmw_fadd_v2f16_flat_global_noret(ptr addrspace(1) %ptr, <2 x half> %value) { +; CI-LABEL: @test_atomicrmw_fadd_v2f16_flat_global_noret( +; CI-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR:%.*]], align 4 ; CI-NEXT: br label [[ATOMICRMW_START:%.*]] ; CI: atomicrmw.start: -; CI-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; CI-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] -; CI-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 -; CI-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; CI-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; CI-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] +; CI-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; CI-NEXT: 
[[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 ; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> ; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] ; CI: atomicrmw.end: ; CI-NEXT: ret void ; -; GFX9-LABEL: @test_atomicrmw_fadd_v2bf16_flat_local_noret( -; GFX9-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(3) [[PTR:%.*]], align 4 +; GFX9-LABEL: @test_atomicrmw_fadd_v2f16_flat_global_noret( +; GFX9-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR:%.*]], align 4 ; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]] ; GFX9: atomicrmw.start: -; GFX9-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; GFX9-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] -; GFX9-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 -; GFX9-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX9-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX9-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] +; GFX9-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX9-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 ; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; GFX9-NEXT: [[TMP5]] = 
bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX9-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> ; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] ; GFX9: atomicrmw.end: ; GFX9-NEXT: ret void ; -; GFX908-LABEL: @test_atomicrmw_fadd_v2bf16_flat_local_noret( -; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(3) [[PTR:%.*]], align 4 +; GFX908-LABEL: @test_atomicrmw_fadd_v2f16_flat_global_noret( +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR:%.*]], align 4 ; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] ; GFX908: atomicrmw.start: -; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret void +; +; GFX90A-LABEL: @test_atomicrmw_fadd_v2f16_flat_global_noret( +; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]] syncscope("agent") seq_cst, align 4 +; GFX90A-NEXT: ret void +; +; GFX940-LABEL: @test_atomicrmw_fadd_v2f16_flat_global_noret( +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]] syncscope("agent") seq_cst, align 4 +; GFX940-NEXT: ret void +; +; 
GFX11-LABEL: @test_atomicrmw_fadd_v2f16_flat_global_noret( +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR:%.*]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret void +; + %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst + ret void +} + +define <2 x half> @test_atomicrmw_fadd_v2f16_local_agent(ptr addrspace(3) %ptr, <2 x half> %value) { +; CI-LABEL: @test_atomicrmw_fadd_v2f16_local_agent( +; CI-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(3) [[PTR:%.*]], align 4 +; CI-NEXT: br label [[ATOMICRMW_START:%.*]] +; CI: atomicrmw.start: +; CI-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; CI-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] +; CI-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; CI-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } 
[[TMP4]], 0 +; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; CI: atomicrmw.end: +; CI-NEXT: ret <2 x half> [[TMP5]] +; +; GFX9-LABEL: @test_atomicrmw_fadd_v2f16_local_agent( +; GFX9-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(3) [[PTR:%.*]], align 4 +; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX9: atomicrmw.start: +; GFX9-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX9-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] +; GFX9-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX9-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX9-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX9: atomicrmw.end: +; GFX9-NEXT: ret <2 x half> [[TMP5]] +; +; GFX908-LABEL: @test_atomicrmw_fadd_v2f16_local_agent( +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(3) [[PTR:%.*]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: 
[[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret <2 x half> [[TMP5]] +; +; GFX90A-LABEL: @test_atomicrmw_fadd_v2f16_local_agent( +; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(3) [[PTR:%.*]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret <2 x half> [[TMP5]] +; +; GFX940-LABEL: @test_atomicrmw_fadd_v2f16_local_agent( +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(3) [[PTR:%.*]], <2 x half> [[VALUE:%.*]] syncscope("agent") seq_cst, align 4 +; GFX940-NEXT: ret <2 x half> [[RES]] +; +; GFX11-LABEL: @test_atomicrmw_fadd_v2f16_local_agent( +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(3) [[PTR:%.*]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] +; GFX11-NEXT: [[TMP2:%.*]] = 
bitcast <2 x half> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret <2 x half> [[TMP5]] +; + %res = atomicrmw fadd ptr addrspace(3) %ptr, <2 x half> %value syncscope("agent") seq_cst + ret <2 x half> %res +} + +define void @test_atomicrmw_fadd_v2f16_flat_local_noret(ptr addrspace(3) %ptr, <2 x half> %value) { +; CI-LABEL: @test_atomicrmw_fadd_v2f16_flat_local_noret( +; CI-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(3) [[PTR:%.*]], align 4 +; CI-NEXT: br label [[ATOMICRMW_START:%.*]] +; CI: atomicrmw.start: +; CI-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; CI-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] +; CI-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; CI-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; CI: atomicrmw.end: +; CI-NEXT: ret void +; +; GFX9-LABEL: @test_atomicrmw_fadd_v2f16_flat_local_noret( +; GFX9-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(3) [[PTR:%.*]], align 4 +; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX9: 
atomicrmw.start: +; GFX9-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX9-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] +; GFX9-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX9-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX9-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX9: atomicrmw.end: +; GFX9-NEXT: ret void +; +; GFX908-LABEL: @test_atomicrmw_fadd_v2f16_flat_local_noret( +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(3) [[PTR:%.*]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret void +; +; GFX90A-LABEL: @test_atomicrmw_fadd_v2f16_flat_local_noret( +; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(3) [[PTR:%.*]], align 4 +; 
GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret void +; +; GFX940-LABEL: @test_atomicrmw_fadd_v2f16_flat_local_noret( +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(3) [[PTR:%.*]], <2 x half> [[VALUE:%.*]] syncscope("agent") seq_cst, align 4 +; GFX940-NEXT: ret void +; +; GFX11-LABEL: @test_atomicrmw_fadd_v2f16_flat_local_noret( +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(3) [[PTR:%.*]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; 
GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret void +; + %res = atomicrmw fadd ptr addrspace(3) %ptr, <2 x half> %value syncscope("agent") seq_cst + ret void +} + +define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_flat_agent(ptr %ptr, <2 x bfloat> %value) { +; CI-LABEL: @test_atomicrmw_fadd_v2bf16_flat_agent( +; CI-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr [[PTR:%.*]], align 4 +; CI-NEXT: br label [[ATOMICRMW_START:%.*]] +; CI: atomicrmw.start: +; CI-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; CI-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] +; CI-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; CI-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; CI: atomicrmw.end: +; CI-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX9-LABEL: @test_atomicrmw_fadd_v2bf16_flat_agent( +; GFX9-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr [[PTR:%.*]], align 4 +; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX9: atomicrmw.start: +; GFX9-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX9-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] +; GFX9-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX9-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { 
i32, i1 } [[TMP4]], 1 +; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX9-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX9: atomicrmw.end: +; GFX9-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX908-LABEL: @test_atomicrmw_fadd_v2bf16_flat_agent( +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr [[PTR:%.*]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX90A-LABEL: @test_atomicrmw_fadd_v2bf16_flat_agent( +; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr [[PTR:%.*]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, 
align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX940-LABEL: @test_atomicrmw_fadd_v2bf16_flat_agent( +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]] syncscope("agent") seq_cst, align 4 +; GFX940-NEXT: ret <2 x bfloat> [[RES]] +; +; GFX11-LABEL: @test_atomicrmw_fadd_v2bf16_flat_agent( +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr [[PTR:%.*]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret <2 x bfloat> [[TMP5]] +; + %res = atomicrmw fadd ptr %ptr, <2 x bfloat> %value syncscope("agent") seq_cst + ret <2 x bfloat> %res +} + +define void @test_atomicrmw_fadd_v2bf16_flat_agent_noret(ptr %ptr, <2 x bfloat> %value) { +; CI-LABEL: @test_atomicrmw_fadd_v2bf16_flat_agent_noret( +; CI-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr [[PTR:%.*]], align 4 +; CI-NEXT: br label 
[[ATOMICRMW_START:%.*]] +; CI: atomicrmw.start: +; CI-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; CI-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] +; CI-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; CI-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; CI: atomicrmw.end: +; CI-NEXT: ret void +; +; GFX9-LABEL: @test_atomicrmw_fadd_v2bf16_flat_agent_noret( +; GFX9-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr [[PTR:%.*]], align 4 +; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX9: atomicrmw.start: +; GFX9-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX9-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] +; GFX9-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX9-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX9-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX9: atomicrmw.end: +; GFX9-NEXT: ret void +; +; GFX908-LABEL: @test_atomicrmw_fadd_v2bf16_flat_agent_noret( +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr [[PTR:%.*]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; 
GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret void +; +; GFX90A-LABEL: @test_atomicrmw_fadd_v2bf16_flat_agent_noret( +; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr [[PTR:%.*]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret void +; +; GFX940-LABEL: @test_atomicrmw_fadd_v2bf16_flat_agent_noret( +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[PTR:%.*]], <2 x bfloat> 
[[VALUE:%.*]] syncscope("agent") seq_cst, align 4 +; GFX940-NEXT: ret void +; +; GFX11-LABEL: @test_atomicrmw_fadd_v2bf16_flat_agent_noret( +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr [[PTR:%.*]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret void +; + %res = atomicrmw fadd ptr %ptr, <2 x bfloat> %value syncscope("agent") seq_cst + ret void +} + +define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent(ptr addrspace(1) %ptr, <2 x bfloat> %value) { +; CI-LABEL: @test_atomicrmw_fadd_v2bf16_global_agent( +; CI-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR:%.*]], align 4 +; CI-NEXT: br label [[ATOMICRMW_START:%.*]] +; CI: atomicrmw.start: +; CI-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; CI-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] +; CI-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; CI-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } 
[[TMP4]], 1 +; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; CI: atomicrmw.end: +; CI-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX9-LABEL: @test_atomicrmw_fadd_v2bf16_global_agent( +; GFX9-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR:%.*]], align 4 +; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX9: atomicrmw.start: +; GFX9-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX9-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] +; GFX9-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX9-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX9-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX9: atomicrmw.end: +; GFX9-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX908-LABEL: @test_atomicrmw_fadd_v2bf16_global_agent( +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR:%.*]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst 
seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX90A-LABEL: @test_atomicrmw_fadd_v2bf16_global_agent( +; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR:%.*]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX940-LABEL: @test_atomicrmw_fadd_v2bf16_global_agent( +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]] syncscope("agent") seq_cst, align 4 +; GFX940-NEXT: ret <2 x bfloat> [[RES]] +; +; GFX11-LABEL: @test_atomicrmw_fadd_v2bf16_global_agent( +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR:%.*]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ 
[[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret <2 x bfloat> [[TMP5]] +; + %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst + ret <2 x bfloat> %res +} + +define void @test_atomicrmw_fadd_v2bf16_flat_global_noret(ptr addrspace(1) %ptr, <2 x bfloat> %value) { +; CI-LABEL: @test_atomicrmw_fadd_v2bf16_flat_global_noret( +; CI-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR:%.*]], align 4 +; CI-NEXT: br label [[ATOMICRMW_START:%.*]] +; CI: atomicrmw.start: +; CI-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; CI-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] +; CI-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; CI-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; CI: atomicrmw.end: +; CI-NEXT: ret void +; +; GFX9-LABEL: 
@test_atomicrmw_fadd_v2bf16_flat_global_noret( +; GFX9-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR:%.*]], align 4 +; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX9: atomicrmw.start: +; GFX9-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX9-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] +; GFX9-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX9-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX9-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX9: atomicrmw.end: +; GFX9-NEXT: ret void +; +; GFX908-LABEL: @test_atomicrmw_fadd_v2bf16_flat_global_noret( +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR:%.*]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 
+; GFX908: atomicrmw.end: +; GFX908-NEXT: ret void +; +; GFX90A-LABEL: @test_atomicrmw_fadd_v2bf16_flat_global_noret( +; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR:%.*]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret void +; +; GFX940-LABEL: @test_atomicrmw_fadd_v2bf16_flat_global_noret( +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]] syncscope("agent") seq_cst, align 4 +; GFX940-NEXT: ret void +; +; GFX11-LABEL: @test_atomicrmw_fadd_v2bf16_flat_global_noret( +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR:%.*]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst 
seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret void +; + %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst + ret void +} + +define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_local_agent(ptr addrspace(3) %ptr, <2 x bfloat> %value) { +; CI-LABEL: @test_atomicrmw_fadd_v2bf16_local_agent( +; CI-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(3) [[PTR:%.*]], align 4 +; CI-NEXT: br label [[ATOMICRMW_START:%.*]] +; CI: atomicrmw.start: +; CI-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; CI-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] +; CI-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; CI-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; CI: atomicrmw.end: +; CI-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX9-LABEL: @test_atomicrmw_fadd_v2bf16_local_agent( +; GFX9-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(3) [[PTR:%.*]], align 4 +; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX9: atomicrmw.start: +; GFX9-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX9-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] +; GFX9-NEXT: 
[[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX9-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX9-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX9: atomicrmw.end: +; GFX9-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX908-LABEL: @test_atomicrmw_fadd_v2bf16_local_agent( +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(3) [[PTR:%.*]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX90A-LABEL: @test_atomicrmw_fadd_v2bf16_local_agent( +; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(3) [[PTR:%.*]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], 
[[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX940-LABEL: @test_atomicrmw_fadd_v2bf16_local_agent( +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(3) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]] syncscope("agent") seq_cst, align 4 +; GFX940-NEXT: ret <2 x bfloat> [[RES]] +; +; GFX11-LABEL: @test_atomicrmw_fadd_v2bf16_local_agent( +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(3) [[PTR:%.*]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: 
ret <2 x bfloat> [[TMP5]] +; + %res = atomicrmw fadd ptr addrspace(3) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst + ret <2 x bfloat> %res +} + +define void @test_atomicrmw_fadd_v2bf16_flat_local_noret(ptr addrspace(3) %ptr, <2 x bfloat> %value) { +; CI-LABEL: @test_atomicrmw_fadd_v2bf16_flat_local_noret( +; CI-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(3) [[PTR:%.*]], align 4 +; CI-NEXT: br label [[ATOMICRMW_START:%.*]] +; CI: atomicrmw.start: +; CI-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; CI-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] +; CI-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; CI-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; CI: atomicrmw.end: +; CI-NEXT: ret void +; +; GFX9-LABEL: @test_atomicrmw_fadd_v2bf16_flat_local_noret( +; GFX9-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(3) [[PTR:%.*]], align 4 +; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX9: atomicrmw.start: +; GFX9-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX9-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] +; GFX9-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX9-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX9-NEXT: 
[[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX9-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX9: atomicrmw.end: +; GFX9-NEXT: ret void +; +; GFX908-LABEL: @test_atomicrmw_fadd_v2bf16_flat_local_noret( +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(3) [[PTR:%.*]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret void +; +; GFX90A-LABEL: @test_atomicrmw_fadd_v2bf16_flat_local_noret( +; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(3) [[PTR:%.*]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 
4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret void +; +; GFX940-LABEL: @test_atomicrmw_fadd_v2bf16_flat_local_noret( +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(3) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]] syncscope("agent") seq_cst, align 4 +; GFX940-NEXT: ret void +; +; GFX11-LABEL: @test_atomicrmw_fadd_v2bf16_flat_local_noret( +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(3) [[PTR:%.*]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret void +; + %res = atomicrmw fadd ptr addrspace(3) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst + ret void +} + +define <2 x half> @test_atomicrmw_fadd_v2f16_flat_agent__unsafe(ptr %ptr, <2 x half> %value) #6 { +; CI-LABEL: @test_atomicrmw_fadd_v2f16_flat_agent__unsafe( +; CI-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[PTR:%.*]], align 4 +; CI-NEXT: br label 
[[ATOMICRMW_START:%.*]] +; CI: atomicrmw.start: +; CI-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; CI-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] +; CI-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; CI-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; CI: atomicrmw.end: +; CI-NEXT: ret <2 x half> [[TMP5]] +; +; GFX9-LABEL: @test_atomicrmw_fadd_v2f16_flat_agent__unsafe( +; GFX9-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[PTR:%.*]], align 4 +; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX9: atomicrmw.start: +; GFX9-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX9-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] +; GFX9-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX9-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX9-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX9: atomicrmw.end: +; GFX9-NEXT: ret <2 x half> [[TMP5]] +; +; GFX908-LABEL: @test_atomicrmw_fadd_v2f16_flat_agent__unsafe( +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[PTR:%.*]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] 
+; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret <2 x half> [[TMP5]] +; +; GFX90A-LABEL: @test_atomicrmw_fadd_v2f16_flat_agent__unsafe( +; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[PTR:%.*]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret <2 x half> [[TMP5]] +; +; GFX940-LABEL: @test_atomicrmw_fadd_v2f16_flat_agent__unsafe( +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[PTR:%.*]], <2 x 
half> [[VALUE:%.*]] syncscope("agent") seq_cst, align 4 +; GFX940-NEXT: ret <2 x half> [[RES]] +; +; GFX11-LABEL: @test_atomicrmw_fadd_v2f16_flat_agent__unsafe( +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[PTR:%.*]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret <2 x half> [[TMP5]] +; + %res = atomicrmw fadd ptr %ptr, <2 x half> %value syncscope("agent") seq_cst + ret <2 x half> %res +} + +define void @test_atomicrmw_fadd_v2f16_flat_agent_noret__unsafe(ptr %ptr, <2 x half> %value) #6 { +; CI-LABEL: @test_atomicrmw_fadd_v2f16_flat_agent_noret__unsafe( +; CI-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[PTR:%.*]], align 4 +; CI-NEXT: br label [[ATOMICRMW_START:%.*]] +; CI: atomicrmw.start: +; CI-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; CI-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] +; CI-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; CI-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } 
[[TMP4]], 1 +; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; CI: atomicrmw.end: +; CI-NEXT: ret void +; +; GFX9-LABEL: @test_atomicrmw_fadd_v2f16_flat_agent_noret__unsafe( +; GFX9-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[PTR:%.*]], align 4 +; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX9: atomicrmw.start: +; GFX9-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX9-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] +; GFX9-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX9-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX9-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX9: atomicrmw.end: +; GFX9-NEXT: ret void +; +; GFX908-LABEL: @test_atomicrmw_fadd_v2f16_flat_agent_noret__unsafe( +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[PTR:%.*]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; 
GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret void +; +; GFX90A-LABEL: @test_atomicrmw_fadd_v2f16_flat_agent_noret__unsafe( +; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[PTR:%.*]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret void +; +; GFX940-LABEL: @test_atomicrmw_fadd_v2f16_flat_agent_noret__unsafe( +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[PTR:%.*]], <2 x half> [[VALUE:%.*]] syncscope("agent") seq_cst, align 4 +; GFX940-NEXT: ret void +; +; GFX11-LABEL: @test_atomicrmw_fadd_v2f16_flat_agent_noret__unsafe( +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[PTR:%.*]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX11-NEXT: 
[[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret void +; + %res = atomicrmw fadd ptr %ptr, <2 x half> %value syncscope("agent") seq_cst + ret void +} + +define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__unsafe(ptr addrspace(1) %ptr, <2 x half> %value) #6 { +; CI-LABEL: @test_atomicrmw_fadd_v2f16_global_agent__unsafe( +; CI-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR:%.*]], align 4 +; CI-NEXT: br label [[ATOMICRMW_START:%.*]] +; CI: atomicrmw.start: +; CI-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; CI-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] +; CI-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; CI-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; CI: atomicrmw.end: +; CI-NEXT: ret <2 x half> [[TMP5]] +; +; GFX9-LABEL: @test_atomicrmw_fadd_v2f16_global_agent__unsafe( +; GFX9-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR:%.*]], align 4 +; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX9: atomicrmw.start: +; GFX9-NEXT: [[LOADED:%.*]] = phi <2 x half> [ 
[[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX9-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] +; GFX9-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX9-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX9-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX9: atomicrmw.end: +; GFX9-NEXT: ret <2 x half> [[TMP5]] +; +; GFX908-LABEL: @test_atomicrmw_fadd_v2f16_global_agent__unsafe( +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR:%.*]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret <2 x half> [[TMP5]] +; +; GFX90A-LABEL: @test_atomicrmw_fadd_v2f16_global_agent__unsafe( +; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]] syncscope("agent") 
seq_cst, align 4 +; GFX90A-NEXT: ret <2 x half> [[RES]] +; +; GFX940-LABEL: @test_atomicrmw_fadd_v2f16_global_agent__unsafe( +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]] syncscope("agent") seq_cst, align 4 +; GFX940-NEXT: ret <2 x half> [[RES]] +; +; GFX11-LABEL: @test_atomicrmw_fadd_v2f16_global_agent__unsafe( +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR:%.*]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret <2 x half> [[TMP5]] +; + %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst + ret <2 x half> %res +} + +define void @test_atomicrmw_fadd_v2f16_global_agent_noret__unsafe(ptr addrspace(1) %ptr, <2 x half> %value) #6 { +; CI-LABEL: @test_atomicrmw_fadd_v2f16_global_agent_noret__unsafe( +; CI-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR:%.*]], align 4 +; CI-NEXT: br label [[ATOMICRMW_START:%.*]] +; CI: atomicrmw.start: +; CI-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; CI-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] +; CI-NEXT: [[TMP2:%.*]] = 
bitcast <2 x half> [[NEW]] to i32 +; CI-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; CI: atomicrmw.end: +; CI-NEXT: ret void +; +; GFX9-LABEL: @test_atomicrmw_fadd_v2f16_global_agent_noret__unsafe( +; GFX9-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR:%.*]], align 4 +; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX9: atomicrmw.start: +; GFX9-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX9-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] +; GFX9-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX9-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX9-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX9: atomicrmw.end: +; GFX9-NEXT: ret void +; +; GFX908-LABEL: @test_atomicrmw_fadd_v2f16_global_agent_noret__unsafe( +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR:%.*]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] +; 
GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret void +; +; GFX90A-LABEL: @test_atomicrmw_fadd_v2f16_global_agent_noret__unsafe( +; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]] syncscope("agent") seq_cst, align 4 +; GFX90A-NEXT: ret void +; +; GFX940-LABEL: @test_atomicrmw_fadd_v2f16_global_agent_noret__unsafe( +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]] syncscope("agent") seq_cst, align 4 +; GFX940-NEXT: ret void +; +; GFX11-LABEL: @test_atomicrmw_fadd_v2f16_global_agent_noret__unsafe( +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR:%.*]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX11-NEXT: br 
i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret void +; + %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst + ret void +} + +define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_flat_agent__unsafe(ptr %ptr, <2 x bfloat> %value) #6 { +; CI-LABEL: @test_atomicrmw_fadd_v2bf16_flat_agent__unsafe( +; CI-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr [[PTR:%.*]], align 4 +; CI-NEXT: br label [[ATOMICRMW_START:%.*]] +; CI: atomicrmw.start: +; CI-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; CI-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] +; CI-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; CI-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; CI: atomicrmw.end: +; CI-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX9-LABEL: @test_atomicrmw_fadd_v2bf16_flat_agent__unsafe( +; GFX9-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr [[PTR:%.*]], align 4 +; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX9: atomicrmw.start: +; GFX9-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX9-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] +; GFX9-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX9-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX9-NEXT: [[SUCCESS:%.*]] = 
extractvalue { i32, i1 } [[TMP4]], 1 +; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX9-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX9: atomicrmw.end: +; GFX9-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX908-LABEL: @test_atomicrmw_fadd_v2bf16_flat_agent__unsafe( +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr [[PTR:%.*]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX90A-LABEL: @test_atomicrmw_fadd_v2bf16_flat_agent__unsafe( +; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr [[PTR:%.*]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] 
syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX940-LABEL: @test_atomicrmw_fadd_v2bf16_flat_agent__unsafe( +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]] syncscope("agent") seq_cst, align 4 +; GFX940-NEXT: ret <2 x bfloat> [[RES]] +; +; GFX11-LABEL: @test_atomicrmw_fadd_v2bf16_flat_agent__unsafe( +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr [[PTR:%.*]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret <2 x bfloat> [[TMP5]] +; + %res = atomicrmw fadd ptr %ptr, <2 x bfloat> %value syncscope("agent") seq_cst + ret <2 x bfloat> %res +} + +define void @test_atomicrmw_fadd_v2bf16_flat_agent_noret__unsafe(ptr %ptr, <2 x bfloat> %value) #6 { +; CI-LABEL: @test_atomicrmw_fadd_v2bf16_flat_agent_noret__unsafe( +; CI-NEXT: [[TMP1:%.*]] = load <2 x 
bfloat>, ptr [[PTR:%.*]], align 4 +; CI-NEXT: br label [[ATOMICRMW_START:%.*]] +; CI: atomicrmw.start: +; CI-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; CI-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] +; CI-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; CI-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; CI: atomicrmw.end: +; CI-NEXT: ret void +; +; GFX9-LABEL: @test_atomicrmw_fadd_v2bf16_flat_agent_noret__unsafe( +; GFX9-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr [[PTR:%.*]], align 4 +; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX9: atomicrmw.start: +; GFX9-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX9-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] +; GFX9-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX9-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX9-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX9: atomicrmw.end: +; GFX9-NEXT: ret void +; +; GFX908-LABEL: @test_atomicrmw_fadd_v2bf16_flat_agent_noret__unsafe( +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr 
[[PTR:%.*]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] ; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] ; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -4901,15 +5931,15 @@ define void @test_atomicrmw_fadd_v2bf16_flat_local_noret(ptr addrspace(3) %ptr, ; GFX908: atomicrmw.end: ; GFX908-NEXT: ret void ; -; GFX90A-LABEL: @test_atomicrmw_fadd_v2bf16_flat_local_noret( -; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(3) [[PTR:%.*]], align 4 +; GFX90A-LABEL: @test_atomicrmw_fadd_v2bf16_flat_agent_noret__unsafe( +; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr [[PTR:%.*]], align 4 ; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] ; GFX90A: atomicrmw.start: ; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] ; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] ; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -4917,19 +5947,19 @@ define void @test_atomicrmw_fadd_v2bf16_flat_local_noret(ptr addrspace(3) %ptr, ; GFX90A: atomicrmw.end: ; GFX90A-NEXT: ret void ; -; GFX940-LABEL: @test_atomicrmw_fadd_v2bf16_flat_local_noret( -; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(3) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]] syncscope("agent") seq_cst, align 4 +; GFX940-LABEL: @test_atomicrmw_fadd_v2bf16_flat_agent_noret__unsafe( +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]] syncscope("agent") seq_cst, align 4 ; GFX940-NEXT: ret void ; -; GFX11-LABEL: @test_atomicrmw_fadd_v2bf16_flat_local_noret( -; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(3) [[PTR:%.*]], align 4 +; GFX11-LABEL: @test_atomicrmw_fadd_v2bf16_flat_agent_noret__unsafe( +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr [[PTR:%.*]], align 4 ; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] ; GFX11: atomicrmw.start: ; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] ; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] ; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 ; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -4937,173 +5967,183 @@ define void 
@test_atomicrmw_fadd_v2bf16_flat_local_noret(ptr addrspace(3) %ptr, ; GFX11: atomicrmw.end: ; GFX11-NEXT: ret void ; - %res = atomicrmw fadd ptr addrspace(3) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst + %res = atomicrmw fadd ptr %ptr, <2 x bfloat> %value syncscope("agent") seq_cst ret void } -define <2 x half> @test_atomicrmw_fadd_v2f16_flat_agent__unsafe(ptr %ptr, <2 x half> %value) #6 { -; ALL-LABEL: @test_atomicrmw_fadd_v2f16_flat_agent__unsafe( -; ALL-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[PTR:%.*]], align 4 -; ALL-NEXT: br label [[ATOMICRMW_START:%.*]] -; ALL: atomicrmw.start: -; ALL-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; ALL-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] -; ALL-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 -; ALL-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 -; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; ALL-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> -; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; ALL: atomicrmw.end: -; ALL-NEXT: ret <2 x half> [[TMP5]] -; - %res = atomicrmw fadd ptr %ptr, <2 x half> %value syncscope("agent") seq_cst - ret <2 x half> %res -} - -define void @test_atomicrmw_fadd_v2f16_flat_agent_noret__unsafe(ptr %ptr, <2 x half> %value) #6 { -; ALL-LABEL: @test_atomicrmw_fadd_v2f16_flat_agent_noret__unsafe( -; ALL-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[PTR:%.*]], align 4 -; ALL-NEXT: br label [[ATOMICRMW_START:%.*]] -; ALL: atomicrmw.start: -; ALL-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; ALL-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] -; ALL-NEXT: 
[[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 -; ALL-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 -; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; ALL-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> -; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; ALL: atomicrmw.end: -; ALL-NEXT: ret void +define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__unsafe(ptr addrspace(1) %ptr, <2 x bfloat> %value) #6 { +; CI-LABEL: @test_atomicrmw_fadd_v2bf16_global_agent__unsafe( +; CI-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR:%.*]], align 4 +; CI-NEXT: br label [[ATOMICRMW_START:%.*]] +; CI: atomicrmw.start: +; CI-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; CI-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] +; CI-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; CI-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; CI: atomicrmw.end: +; CI-NEXT: ret <2 x bfloat> [[TMP5]] ; - %res = atomicrmw fadd ptr %ptr, <2 x half> %value syncscope("agent") seq_cst - ret void -} - -define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__unsafe(ptr addrspace(1) %ptr, <2 x half> %value) #6 { -; ALL-LABEL: @test_atomicrmw_fadd_v2f16_global_agent__unsafe( -; ALL-NEXT: [[TMP1:%.*]] = load 
<2 x half>, ptr addrspace(1) [[PTR:%.*]], align 4 -; ALL-NEXT: br label [[ATOMICRMW_START:%.*]] -; ALL: atomicrmw.start: -; ALL-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; ALL-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] -; ALL-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 -; ALL-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 -; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; ALL-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> -; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; ALL: atomicrmw.end: -; ALL-NEXT: ret <2 x half> [[TMP5]] +; GFX9-LABEL: @test_atomicrmw_fadd_v2bf16_global_agent__unsafe( +; GFX9-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR:%.*]], align 4 +; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX9: atomicrmw.start: +; GFX9-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX9-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] +; GFX9-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX9-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX9-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX9: atomicrmw.end: +; GFX9-NEXT: ret <2 x bfloat> [[TMP5]] ; - %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x 
half> %value syncscope("agent") seq_cst - ret <2 x half> %res -} - -define void @test_atomicrmw_fadd_v2f16_global_agent_noret__unsafe(ptr addrspace(1) %ptr, <2 x half> %value) #6 { -; ALL-LABEL: @test_atomicrmw_fadd_v2f16_global_agent_noret__unsafe( -; ALL-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR:%.*]], align 4 -; ALL-NEXT: br label [[ATOMICRMW_START:%.*]] -; ALL: atomicrmw.start: -; ALL-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; ALL-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] -; ALL-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 -; ALL-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 -; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; ALL-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> -; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; ALL: atomicrmw.end: -; ALL-NEXT: ret void +; GFX908-LABEL: @test_atomicrmw_fadd_v2bf16_global_agent__unsafe( +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR:%.*]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, 
i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret <2 x bfloat> [[TMP5]] ; - %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst - ret void -} - -define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_flat_agent__unsafe(ptr %ptr, <2 x bfloat> %value) #6 { -; ALL-LABEL: @test_atomicrmw_fadd_v2bf16_flat_agent__unsafe( -; ALL-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr [[PTR:%.*]], align 4 -; ALL-NEXT: br label [[ATOMICRMW_START:%.*]] -; ALL: atomicrmw.start: -; ALL-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; ALL-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] -; ALL-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 -; ALL-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 -; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; ALL-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> -; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; ALL: atomicrmw.end: -; ALL-NEXT: ret <2 x bfloat> [[TMP5]] +; GFX90A-LABEL: @test_atomicrmw_fadd_v2bf16_global_agent__unsafe( +; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR:%.*]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> 
[[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]] ; - %res = atomicrmw fadd ptr %ptr, <2 x bfloat> %value syncscope("agent") seq_cst - ret <2 x bfloat> %res -} - -define void @test_atomicrmw_fadd_v2bf16_flat_agent_noret__unsafe(ptr %ptr, <2 x bfloat> %value) #6 { -; ALL-LABEL: @test_atomicrmw_fadd_v2bf16_flat_agent_noret__unsafe( -; ALL-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr [[PTR:%.*]], align 4 -; ALL-NEXT: br label [[ATOMICRMW_START:%.*]] -; ALL: atomicrmw.start: -; ALL-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; ALL-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] -; ALL-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 -; ALL-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 -; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; ALL-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> -; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; ALL: atomicrmw.end: -; ALL-NEXT: ret void +; GFX940-LABEL: @test_atomicrmw_fadd_v2bf16_global_agent__unsafe( +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]] syncscope("agent") seq_cst, align 4 +; GFX940-NEXT: ret <2 x bfloat> [[RES]] ; - %res = atomicrmw fadd ptr 
%ptr, <2 x bfloat> %value syncscope("agent") seq_cst - ret void -} - -define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__unsafe(ptr addrspace(1) %ptr, <2 x bfloat> %value) #6 { -; ALL-LABEL: @test_atomicrmw_fadd_v2bf16_global_agent__unsafe( -; ALL-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR:%.*]], align 4 -; ALL-NEXT: br label [[ATOMICRMW_START:%.*]] -; ALL: atomicrmw.start: -; ALL-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; ALL-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] -; ALL-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 -; ALL-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 -; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; ALL-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> -; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; ALL: atomicrmw.end: -; ALL-NEXT: ret <2 x bfloat> [[TMP5]] +; GFX11-LABEL: @test_atomicrmw_fadd_v2bf16_global_agent__unsafe( +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR:%.*]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = 
extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret <2 x bfloat> [[TMP5]] ; %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst ret <2 x bfloat> %res } define void @test_atomicrmw_fadd_v2bf16_global_agent_noret__unsafe(ptr addrspace(1) %ptr, <2 x bfloat> %value) #6 { -; ALL-LABEL: @test_atomicrmw_fadd_v2bf16_global_agent_noret__unsafe( -; ALL-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR:%.*]], align 4 -; ALL-NEXT: br label [[ATOMICRMW_START:%.*]] -; ALL: atomicrmw.start: -; ALL-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; ALL-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] -; ALL-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 -; ALL-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 -; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; ALL-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> -; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; ALL: atomicrmw.end: -; ALL-NEXT: ret void +; CI-LABEL: @test_atomicrmw_fadd_v2bf16_global_agent_noret__unsafe( +; CI-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR:%.*]], align 4 +; CI-NEXT: br label [[ATOMICRMW_START:%.*]] +; CI: atomicrmw.start: +; CI-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; CI-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] +; CI-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; CI-NEXT: [[TMP3:%.*]] = 
bitcast <2 x bfloat> [[LOADED]] to i32 +; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; CI: atomicrmw.end: +; CI-NEXT: ret void +; +; GFX9-LABEL: @test_atomicrmw_fadd_v2bf16_global_agent_noret__unsafe( +; GFX9-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR:%.*]], align 4 +; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX9: atomicrmw.start: +; GFX9-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX9-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] +; GFX9-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX9-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX9-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX9: atomicrmw.end: +; GFX9-NEXT: ret void +; +; GFX908-LABEL: @test_atomicrmw_fadd_v2bf16_global_agent_noret__unsafe( +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR:%.*]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast 
<2 x bfloat> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret void +; +; GFX90A-LABEL: @test_atomicrmw_fadd_v2bf16_global_agent_noret__unsafe( +; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR:%.*]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret void +; +; GFX940-LABEL: @test_atomicrmw_fadd_v2bf16_global_agent_noret__unsafe( +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]] syncscope("agent") seq_cst, align 4 +; GFX940-NEXT: ret void +; +; GFX11-LABEL: @test_atomicrmw_fadd_v2bf16_global_agent_noret__unsafe( +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x 
bfloat>, ptr addrspace(1) [[PTR:%.*]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret void ; %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst ret void diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2bf16-agent.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2bf16-agent.ll index 4f053af7aed932..26b84f82524e26 100644 --- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2bf16-agent.ll +++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2bf16-agent.ll @@ -41,264 +41,1404 @@ ;--------------------------------------------------------------------- define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent(ptr addrspace(1) %ptr, <2 x bfloat> %value) { -; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] { -; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], 
[ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] -; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 -; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret <2 x bfloat> [[TMP5]] +; GFX803-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] { +; GFX803-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX906-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent( 
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] { +; GFX906-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX908-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] { +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = 
extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX90A-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX940-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4 +; GFX940-NEXT: ret <2 x bfloat> [[RES]] +; +; GFX10-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] { +; GFX10-NEXT: [[TMP1:%.*]] = load <2 x 
bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX11-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] { +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], 
label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX12-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4 +; GFX12-NEXT: ret <2 x bfloat> [[RES]] ; %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst ret <2 x bfloat> %res } define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) { -; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] -; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 -; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret <2 x bfloat> [[TMP5]] +; GFX803-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory( +; 
GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX906-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] 
= extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX908-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX90A-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], 
[[VALUE]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX940-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]] +; GFX940-NEXT: ret <2 x bfloat> [[RES]] +; +; GFX10-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = 
extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX11-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX12-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]] +; GFX12-NEXT: ret <2 x bfloat> [[RES]] ; %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x bfloat> 
%res } define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) { -; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_remote_memory( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] -; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 -; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret <2 x bfloat> [[TMP5]] +; GFX803-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_remote_memory( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; 
GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX906-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_remote_memory( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX908-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_remote_memory( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label 
[[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX90A-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_remote_memory( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 
+; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX940-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_remote_memory( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] +; GFX940-NEXT: ret <2 x bfloat> [[RES]] +; +; GFX10-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_remote_memory( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX11-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_remote_memory( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; 
GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX12-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_remote_memory( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] +; GFX12-NEXT: ret <2 x bfloat> [[RES]] ; %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 ret <2 x bfloat> %res } define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) { -; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], 
[[ATOMICRMW_START]] ] -; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] -; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 -; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret <2 x bfloat> [[TMP5]] +; GFX803-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX906-LABEL: define <2 x bfloat> 
@test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX908-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") 
seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX90A-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX940-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory 
[[META0]], !amdgpu.no.remote.memory [[META0]] +; GFX940-NEXT: ret <2 x bfloat> [[RES]] +; +; GFX10-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX11-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; 
GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX12-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] +; GFX12-NEXT: ret <2 x bfloat> [[RES]] ; %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 ret <2 x bfloat> %res } define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_daz(ptr addrspace(1) %ptr, <2 x bfloat> %value) #0 { -; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_daz( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] { -; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] -; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x 
bfloat> [[NEW]] to i32 -; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret <2 x bfloat> [[TMP5]] +; GFX803-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_daz( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] { +; GFX803-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX906-LABEL: define <2 x bfloat> 
@test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_daz( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] { +; GFX906-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX908-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_daz( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] { +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg 
ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX90A-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_daz( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX940-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_daz( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] { +; GFX940-NEXT: 
[[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] +; GFX940-NEXT: ret <2 x bfloat> [[RES]] +; +; GFX10-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_daz( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] { +; GFX10-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX11-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_daz( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] { +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], 
[[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX12-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_daz( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] +; GFX12-NEXT: ret <2 x bfloat> [[RES]] ; %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 ret <2 x bfloat> %res } define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_dynamic(ptr addrspace(1) %ptr, <2 x bfloat> %value) #1 { -; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_dynamic( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] { -; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label 
[[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] -; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 -; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret <2 x bfloat> [[TMP5]] +; GFX803-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_dynamic( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] { +; GFX803-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX803-NEXT: br i1 
[[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX906-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_dynamic( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] { +; GFX906-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX908-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_dynamic( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] { +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], 
[[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX90A-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_dynamic( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX940-LABEL: define <2 x bfloat> 
@test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_dynamic( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] +; GFX940-NEXT: ret <2 x bfloat> [[RES]] +; +; GFX10-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_dynamic( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] { +; GFX10-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX11-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_dynamic( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] { +; GFX11-NEXT: 
[[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX12-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_dynamic( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] +; GFX12-NEXT: ret <2 x bfloat> [[RES]] ; %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 ret <2 x bfloat> %res } define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, <2 x bfloat> %value) { -; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { -; 
COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] -; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 -; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret <2 x bfloat> [[TMP5]] +; GFX803-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] 
to <2 x bfloat> +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX906-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX908-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX908-NEXT: 
[[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX90A-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX940-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr 
addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]] +; GFX940-NEXT: ret <2 x bfloat> [[RES]] +; +; GFX10-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX11-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> 
[[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX12-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]] +; GFX12-NEXT: ret <2 x bfloat> [[RES]] ; %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, align 4, !amdgpu.ignore.denormal.mode !0 ret <2 x bfloat> %res } define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) { -; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] -; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 -; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg 
ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret <2 x bfloat> [[TMP5]] +; GFX803-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX906-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX908-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX908-NEXT: br i1 
[[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX90A-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX940-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX940-NEXT: ret <2 x bfloat> [[RES]] +; +; GFX10-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; GFX10-SAME: ptr 
addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX11-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = 
extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX12-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX12-NEXT: ret <2 x bfloat> [[RES]] ; %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 ret <2 x bfloat> %res } define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) { -; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] -; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 -; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } 
[[TMP4]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret <2 x bfloat> [[TMP5]] +; GFX803-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX906-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ 
[[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX908-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX90A-LABEL: define <2 x 
bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX940-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX940-NEXT: ret <2 x bfloat> [[RES]] +; +; GFX10-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label 
[[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX11-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label 
[[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX12-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX12-NEXT: ret <2 x bfloat> [[RES]] ; %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0 ret <2 x bfloat> %res } define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) { -; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] -; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 -; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; COMMON-NEXT: [[TMP5]] = bitcast i32 
[[NEWLOADED]] to <2 x bfloat> -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret <2 x bfloat> [[TMP5]] +; GFX803-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX906-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd <2 
x bfloat> [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX908-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX90A-LABEL: define <2 x bfloat> 
@test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX940-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX940-NEXT: ret <2 x bfloat> [[RES]] +; +; GFX10-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { 
+; GFX10-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX11-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: 
[[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX12-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX12-NEXT: ret <2 x bfloat> [[RES]] ; %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0 ret <2 x bfloat> %res } define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(ptr addrspace(1) %ptr, <2 x bfloat> %value) #0 { -; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1]] { -; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] -; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 -; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) 
[[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret <2 x bfloat> [[TMP5]] +; GFX803-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1]] { +; GFX803-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX906-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1]] { +; 
GFX906-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX908-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1]] { +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = 
extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX90A-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX940-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory 
[[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX940-NEXT: ret <2 x bfloat> [[RES]] +; +; GFX10-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1]] { +; GFX10-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX11-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1]] { +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x 
bfloat> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX12-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX12-NEXT: ret <2 x bfloat> [[RES]] ; %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0 ret <2 x bfloat> %res } define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(ptr addrspace(1) %ptr, <2 x bfloat> %value) #1 { -; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2]] { -; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] 
-; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] -; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 -; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret <2 x bfloat> [[TMP5]] +; GFX803-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2]] { +; GFX803-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX803-NEXT: br i1 [[SUCCESS]], 
label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX906-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2]] { +; GFX906-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX908-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2]] { +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> 
[[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX90A-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX940-LABEL: define <2 x bfloat> 
@test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX940-NEXT: ret <2 x bfloat> [[RES]] +; +; GFX10-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2]] { +; GFX10-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX11-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic( +; GFX11-SAME: ptr addrspace(1) 
[[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2]] { +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX12-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX12-NEXT: ret <2 x bfloat> [[RES]] ; %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0 ret <2 x bfloat> %res @@ -310,7 +1450,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent(ptr addrspace(1) 
%ptr, <2 x bfloat> %value) { ; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] { ; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 ; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] ; COMMON: atomicrmw.start: @@ -848,12 +1988,8 @@ attributes #0 = { "denormal-fp-mode"="preserve-sign,preserve-sign" } attributes #1 = { "denormal-fp-mode"="dynamic,dynamic" } !0 = !{} -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; GFX10: {{.*}} -; GFX11: {{.*}} -; GFX12: {{.*}} -; GFX803: {{.*}} -; GFX906: {{.*}} -; GFX908: {{.*}} -; GFX90A: {{.*}} -; GFX940: {{.*}} +;. +; GFX940: [[META0]] = !{} +;. +; GFX12: [[META0]] = !{} +;. diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2bf16-system.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2bf16-system.ll index 0339e885ca7700..d01cfbe035f64c 100644 --- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2bf16-system.ll +++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2bf16-system.ll @@ -41,264 +41,1404 @@ ;--------------------------------------------------------------------- define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system(ptr addrspace(1) %ptr, <2 x bfloat> %value) { -; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] { -; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] -; 
COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 -; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret <2 x bfloat> [[TMP5]] +; GFX803-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] { +; GFX803-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX906-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] { +; GFX906-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr 
addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX908-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] { +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 
+; GFX908: atomicrmw.end: +; GFX908-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX90A-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX940-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] seq_cst, align 4 +; GFX940-NEXT: ret <2 x bfloat> [[RES]] +; +; GFX10-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] { +; GFX10-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; 
GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX11-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] { +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX12-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] { +; 
GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] seq_cst, align 4 +; GFX12-NEXT: ret <2 x bfloat> [[RES]] ; %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst ret <2 x bfloat> %res } define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) { -; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] -; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 -; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret <2 x bfloat> [[TMP5]] +; GFX803-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ 
[[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX906-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX908-LABEL: define <2 x bfloat> 
@test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX90A-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } 
[[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX940-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]] +; GFX940-NEXT: ret <2 x bfloat> [[RES]] +; +; GFX10-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX11-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory( +; GFX11-SAME: 
ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX12-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]] +; GFX12-NEXT: ret <2 x bfloat> [[RES]] ; %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x bfloat> %res } define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) { -; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_remote_memory( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label 
[[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] -; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 -; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret <2 x bfloat> [[TMP5]] +; GFX803-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_remote_memory( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; 
GFX803-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX906-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_remote_memory( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX908-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_remote_memory( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 
4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX90A-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_remote_memory( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX940-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_remote_memory( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] +; GFX940-NEXT: ret <2 x bfloat> [[RES]] +; +; GFX10-LABEL: define <2 x bfloat> 
@test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_remote_memory( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX11-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_remote_memory( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: 
[[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX12-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_remote_memory( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] +; GFX12-NEXT: ret <2 x bfloat> [[RES]] ; %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, !amdgpu.no.remote.memory !0 ret <2 x bfloat> %res } define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) { -; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] -; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 -; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> -; 
COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret <2 x bfloat> [[TMP5]] +; GFX803-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX906-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; 
GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX908-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX90A-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: 
[[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX940-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] +; GFX940-NEXT: ret <2 x bfloat> [[RES]] +; +; GFX10-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX10-NEXT: 
[[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX11-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX12-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) 
#[[ATTR0]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] +; GFX12-NEXT: ret <2 x bfloat> [[RES]] ; %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 ret <2 x bfloat> %res } define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_daz(ptr addrspace(1) %ptr, <2 x bfloat> %value) #0 { -; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_daz( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] { -; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] -; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 -; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret <2 x bfloat> [[TMP5]] +; GFX803-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_daz( 
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] { +; GFX803-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX906-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_daz( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] { +; GFX906-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 
1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX908-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_daz( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] { +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX90A-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_daz( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi 
<2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX940-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_daz( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] +; GFX940-NEXT: ret <2 x bfloat> [[RES]] +; +; GFX10-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_daz( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] { +; GFX10-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] 
to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX11-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_daz( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] { +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX12-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_daz( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] { +; GFX12-NEXT: [[RES:%.*]] = 
atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] +; GFX12-NEXT: ret <2 x bfloat> [[RES]] ; %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 ret <2 x bfloat> %res } define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_dynamic(ptr addrspace(1) %ptr, <2 x bfloat> %value) #1 { -; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_dynamic( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] { -; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] -; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 -; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret <2 x bfloat> [[TMP5]] +; GFX803-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_dynamic( +; GFX803-SAME: ptr 
addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] { +; GFX803-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX906-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_dynamic( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] { +; GFX906-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; 
GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX908-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_dynamic( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] { +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX90A-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_dynamic( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = 
phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX940-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_dynamic( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] +; GFX940-NEXT: ret <2 x bfloat> [[RES]] +; +; GFX10-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_dynamic( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] { +; GFX10-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> 
[[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX11-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_dynamic( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] { +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX12-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_dynamic( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] { +; 
GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] +; GFX12-NEXT: ret <2 x bfloat> [[RES]] ; %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 ret <2 x bfloat> %res } define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, <2 x bfloat> %value) { -; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] -; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 -; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret <2 x bfloat> [[TMP5]] +; GFX803-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label 
[[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX906-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; 
GFX906-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX908-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX90A-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst 
seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX940-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]] +; GFX940-NEXT: ret <2 x bfloat> [[RES]] +; +; GFX10-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX11-LABEL: define <2 x bfloat> 
@test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX12-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]] +; GFX12-NEXT: ret <2 x bfloat> [[RES]] ; %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, align 4, !amdgpu.ignore.denormal.mode !0 ret <2 x bfloat> %res } define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) { -; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], 
<2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] -; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 -; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret <2 x bfloat> [[TMP5]] +; GFX803-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; 
GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX906-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX908-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], 
[[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX90A-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX940-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; GFX940-SAME: ptr 
addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX940-NEXT: ret <2 x bfloat> [[RES]] +; +; GFX10-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX11-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; 
GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX12-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX12-NEXT: ret <2 x bfloat> [[RES]] ; %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 ret <2 x bfloat> %res } define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) { -; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: 
[[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] -; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 -; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret <2 x bfloat> [[TMP5]] +; GFX803-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX906-LABEL: define <2 x bfloat> 
@test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX908-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: 
[[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX90A-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX940-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX940-NEXT: ret <2 x bfloat> 
[[RES]] +; +; GFX10-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX11-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, 
align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX12-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX12-NEXT: ret <2 x bfloat> [[RES]] ; %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, align 4, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0 ret <2 x bfloat> %res } define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) { -; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] -; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 -; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], 
i32 [[TMP2]] seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret <2 x bfloat> [[TMP5]] +; GFX803-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX906-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label 
[[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX908-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], 
label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX90A-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX940-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX940-NEXT: ret <2 x bfloat> [[RES]] +; +; GFX10-LABEL: define <2 x bfloat> 
@test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX11-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] 
seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX12-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX12-NEXT: ret <2 x bfloat> [[RES]] ; %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0 ret <2 x bfloat> %res } define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(ptr addrspace(1) %ptr, <2 x bfloat> %value) #0 { -; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1]] { -; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] -; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x 
bfloat> [[NEW]] to i32 -; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret <2 x bfloat> [[TMP5]] +; GFX803-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1]] { +; GFX803-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX906-LABEL: define <2 x bfloat> 
@test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1]] { +; GFX906-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX908-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1]] { +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = 
cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX90A-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX940-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1]] { +; GFX940-NEXT: 
[[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX940-NEXT: ret <2 x bfloat> [[RES]] +; +; GFX10-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1]] { +; GFX10-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX11-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1]] { +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ 
[[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX12-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX12-NEXT: ret <2 x bfloat> [[RES]] ; %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0 ret <2 x bfloat> %res } define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(ptr addrspace(1) %ptr, <2 x bfloat> %value) #1 { -; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2]] { -; COMMON-NEXT: [[TMP1:%.*]] = load <2 
x bfloat>, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] -; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 -; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret <2 x bfloat> [[TMP5]] +; GFX803-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2]] { +; GFX803-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 
[[NEWLOADED]] to <2 x bfloat> +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX906-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2]] { +; GFX906-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX908-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2]] { +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; 
GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX90A-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX940-LABEL: define <2 x bfloat> 
@test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX940-NEXT: ret <2 x bfloat> [[RES]] +; +; GFX10-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2]] { +; GFX10-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX11-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> 
[[VALUE:%.*]]) #[[ATTR2]] { +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX12-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX12-NEXT: ret <2 x bfloat> [[RES]] ; %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0 ret <2 x bfloat> %res @@ -310,7 +1450,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_system(ptr addrspace(1) %ptr, <2 x bfloat> %value) { ; COMMON-LABEL: define <2 x bfloat> 
@test_atomicrmw_fsub_v2bf16_global_system( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] { ; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 ; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] ; COMMON: atomicrmw.start: @@ -848,12 +1988,8 @@ attributes #0 = { "denormal-fp-mode"="preserve-sign,preserve-sign" } attributes #1 = { "denormal-fp-mode"="dynamic,dynamic" } !0 = !{} -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; GFX10: {{.*}} -; GFX11: {{.*}} -; GFX12: {{.*}} -; GFX803: {{.*}} -; GFX906: {{.*}} -; GFX908: {{.*}} -; GFX90A: {{.*}} -; GFX940: {{.*}} +;. +; GFX940: [[META0]] = !{} +;. +; GFX12: [[META0]] = !{} +;. diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2f16-agent.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2f16-agent.ll index 0fc6f1134ff406..3b0b7c9209588b 100644 --- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2f16-agent.ll +++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2f16-agent.ll @@ -41,264 +41,1260 @@ ;--------------------------------------------------------------------- define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent(ptr addrspace(1) %ptr, <2 x half> %value) { -; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] { -; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] -; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 -; COMMON-NEXT: [[TMP3:%.*]] = 
bitcast <2 x half> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret <2 x half> [[TMP5]] +; GFX803-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] { +; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret <2 x half> [[TMP5]] +; +; GFX906-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] { +; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: 
atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret <2 x half> [[TMP5]] +; +; GFX908-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] { +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret <2 x half> [[TMP5]] +; +; 
GFX90A-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] { +; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4 +; GFX90A-NEXT: ret <2 x half> [[RES]] +; +; GFX940-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4 +; GFX940-NEXT: ret <2 x half> [[RES]] +; +; GFX10-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] { +; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret <2 x half> [[TMP5]] +; +; GFX11-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] { +; GFX11-NEXT: [[TMP1:%.*]] 
= load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret <2 x half> [[TMP5]] +; +; GFX12-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4 +; GFX12-NEXT: ret <2 x half> [[RES]] ; %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst ret <2 x half> %res } define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %value) { -; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: 
[[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] -; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 -; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret <2 x half> [[TMP5]] +; GFX803-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret <2 x half> [[TMP5]] +; +; GFX906-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory( +; GFX906-SAME: ptr addrspace(1) 
[[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret <2 x half> [[TMP5]] +; +; GFX908-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: 
[[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret <2 x half> [[TMP5]] +; +; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]] +; GFX90A-NEXT: ret <2 x half> [[RES]] +; +; GFX940-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]] +; GFX940-NEXT: ret <2 x half> [[RES]] +; +; GFX10-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = 
bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret <2 x half> [[TMP5]] +; +; GFX11-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret <2 x half> [[TMP5]] +; +; GFX12-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]] +; GFX12-NEXT: ret <2 x half> [[RES]] ; %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x half> %res } define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_remote_memory(ptr 
addrspace(1) %ptr, <2 x half> %value) { -; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_remote_memory( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] -; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 -; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret <2 x half> [[TMP5]] +; GFX803-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_remote_memory( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret <2 x half> [[TMP5]] +; +; GFX906-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_remote_memory( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret <2 x half> [[TMP5]] +; +; GFX908-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_remote_memory( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; 
GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret <2 x half> [[TMP5]] +; +; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_remote_memory( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] +; GFX90A-NEXT: ret <2 x half> [[RES]] +; +; GFX940-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_remote_memory( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] +; GFX940-NEXT: ret <2 x half> [[RES]] +; +; GFX10-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_remote_memory( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], 
[[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret <2 x half> [[TMP5]] +; +; GFX11-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_remote_memory( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret <2 x half> [[TMP5]] +; +; GFX12-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_remote_memory( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: 
[[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] +; GFX12-NEXT: ret <2 x half> [[RES]] ; %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 ret <2 x half> %res } define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %value) { -; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] -; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 -; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret <2 x half> [[TMP5]] +; GFX803-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br 
label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret <2 x half> [[TMP5]] +; +; GFX906-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label 
[[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret <2 x half> [[TMP5]] +; +; GFX908-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret <2 x half> [[TMP5]] +; +; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] +; GFX90A-NEXT: ret <2 x half> [[RES]] +; +; GFX940-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = 
atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] +; GFX940-NEXT: ret <2 x half> [[RES]] +; +; GFX10-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret <2 x half> [[TMP5]] +; +; GFX11-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> 
[[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret <2 x half> [[TMP5]] +; +; GFX12-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] +; GFX12-NEXT: ret <2 x half> [[RES]] ; %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 ret <2 x half> %res } define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_daz(ptr addrspace(1) %ptr, <2 x half> %value) #0 { -; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_daz( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] { -; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], 
[[VALUE]] -; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 -; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret <2 x half> [[TMP5]] +; GFX803-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_daz( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] { +; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret <2 x half> [[TMP5]] +; +; GFX906-LABEL: define <2 x half> 
@test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_daz( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] { +; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret <2 x half> [[TMP5]] +; +; GFX908-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_daz( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] { +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 
[[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret <2 x half> [[TMP5]] +; +; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_daz( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] { +; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] +; GFX90A-NEXT: ret <2 x half> [[RES]] +; +; GFX940-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_daz( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] +; GFX940-NEXT: ret <2 x half> [[RES]] +; +; GFX10-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_daz( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] { +; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x 
half> [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret <2 x half> [[TMP5]] +; +; GFX11-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_daz( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] { +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret <2 x half> [[TMP5]] +; +; GFX12-LABEL: define <2 x half> 
@test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_daz( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] +; GFX12-NEXT: ret <2 x half> [[RES]] ; %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 ret <2 x half> %res } define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_dynamic(ptr addrspace(1) %ptr, <2 x half> %value) #1 { -; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_dynamic( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] { -; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] -; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 -; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 
-; COMMON: atomicrmw.end: -; COMMON-NEXT: ret <2 x half> [[TMP5]] +; GFX803-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_dynamic( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] { +; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret <2 x half> [[TMP5]] +; +; GFX906-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_dynamic( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] { +; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = 
bitcast <2 x half> [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret <2 x half> [[TMP5]] +; +; GFX908-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_dynamic( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] { +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret <2 x half> [[TMP5]] +; +; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_dynamic( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x 
half> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] { +; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] +; GFX90A-NEXT: ret <2 x half> [[RES]] +; +; GFX940-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_dynamic( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] +; GFX940-NEXT: ret <2 x half> [[RES]] +; +; GFX10-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_dynamic( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] { +; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret <2 x 
half> [[TMP5]] +; +; GFX11-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_dynamic( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] { +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret <2 x half> [[TMP5]] +; +; GFX12-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_dynamic( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] +; GFX12-NEXT: ret <2 x half> [[RES]] ; %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 ret <2 x half> %res } define <2 x half> 
@test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, <2 x half> %value) { -; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] -; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 -; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret <2 x half> [[TMP5]] +; GFX803-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) 
[[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret <2 x half> [[TMP5]] +; +; GFX906-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret <2 x half> [[TMP5]] +; +; GFX908-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] 
= phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret <2 x half> [[TMP5]] +; +; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]] +; GFX90A-NEXT: ret <2 x half> [[RES]] +; +; GFX940-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]] +; GFX940-NEXT: ret <2 x half> [[RES]] +; +; GFX10-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], 
[[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret <2 x half> [[TMP5]] +; +; GFX11-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret <2 x half> [[TMP5]] +; +; GFX12-LABEL: define <2 x half> 
@test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]] +; GFX12-NEXT: ret <2 x half> [[RES]] ; %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, align 4, !amdgpu.ignore.denormal.mode !0 ret <2 x half> %res } define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %value) { -; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] -; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 -; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret <2 x half> [[TMP5]] +; GFX803-LABEL: define <2 x half> 
@test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret <2 x half> [[TMP5]] +; +; GFX906-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; 
GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret <2 x half> [[TMP5]] +; +; GFX908-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret <2 x half> [[TMP5]] +; +; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] 
+; GFX90A-NEXT: ret <2 x half> [[RES]] +; +; GFX940-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX940-NEXT: ret <2 x half> [[RES]] +; +; GFX10-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret <2 x half> [[TMP5]] +; +; GFX11-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret <2 x half> [[TMP5]] +; +; GFX12-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX12-NEXT: ret <2 x half> [[RES]] ; %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 ret <2 x half> %res } define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %value) { -; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 -; 
COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] -; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 -; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret <2 x half> [[TMP5]] +; GFX803-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label 
[[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret <2 x half> [[TMP5]] +; +; GFX906-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret <2 x half> [[TMP5]] +; +; GFX908-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX908-NEXT: 
[[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret <2 x half> [[TMP5]] +; +; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX90A-NEXT: ret <2 x half> [[RES]] +; +; GFX940-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX940-NEXT: ret <2 x half> [[RES]] +; +; GFX10-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast 
<2 x half> [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret <2 x half> [[TMP5]] +; +; GFX11-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret <2 x half> [[TMP5]] +; +; GFX12-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: 
[[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX12-NEXT: ret <2 x half> [[RES]] ; %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0 ret <2 x half> %res } define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %value) { -; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] -; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 -; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret <2 x half> [[TMP5]] +; GFX803-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX803-SAME: 
ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret <2 x half> [[TMP5]] +; +; GFX906-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: 
[[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret <2 x half> [[TMP5]] +; +; GFX908-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret <2 x half> [[TMP5]] +; +; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode 
[[META0]] +; GFX90A-NEXT: ret <2 x half> [[RES]] +; +; GFX940-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX940-NEXT: ret <2 x half> [[RES]] +; +; GFX10-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret <2 x half> [[TMP5]] +; +; GFX11-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX11-SAME: ptr addrspace(1) 
[[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret <2 x half> [[TMP5]] +; +; GFX12-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX12-NEXT: ret <2 x half> [[RES]] ; %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0 ret <2 x half> %res } define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(ptr addrspace(1) %ptr, <2 x half> %value) #0 { -; COMMON-LABEL: define 
<2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1]] { -; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] -; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 -; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret <2 x half> [[TMP5]] +; GFX803-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1]] { +; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr 
addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret <2 x half> [[TMP5]] +; +; GFX906-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1]] { +; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret <2 x half> [[TMP5]] +; +; GFX908-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1]] { +; GFX908-NEXT: 
[[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret <2 x half> [[TMP5]] +; +; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1]] { +; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX90A-NEXT: ret <2 x half> [[RES]] +; +; GFX940-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], 
!amdgpu.ignore.denormal.mode [[META0]] +; GFX940-NEXT: ret <2 x half> [[RES]] +; +; GFX10-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1]] { +; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret <2 x half> [[TMP5]] +; +; GFX11-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1]] { +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX11-NEXT: 
[[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret <2 x half> [[TMP5]] +; +; GFX12-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX12-NEXT: ret <2 x half> [[RES]] ; %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0 ret <2 x half> %res } define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(ptr addrspace(1) %ptr, <2 x half> %value) #1 { -; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2]] { -; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x 
half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] -; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 -; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret <2 x half> [[TMP5]] +; GFX803-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2]] { +; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: 
ret <2 x half> [[TMP5]] +; +; GFX906-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2]] { +; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret <2 x half> [[TMP5]] +; +; GFX908-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2]] { +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x 
half> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret <2 x half> [[TMP5]] +; +; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2]] { +; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX90A-NEXT: ret <2 x half> [[RES]] +; +; GFX940-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX940-NEXT: ret <2 x half> [[RES]] +; +; GFX10-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2]] { +; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; 
GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret <2 x half> [[TMP5]] +; +; GFX11-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2]] { +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX11-NEXT: br i1 [[SUCCESS]], label 
[[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret <2 x half> [[TMP5]] +; +; GFX12-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX12-NEXT: ret <2 x half> [[RES]] ; %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0 ret <2 x half> %res @@ -310,7 +1306,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent(ptr addrspace(1) %ptr, <2 x half> %value) { ; COMMON-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] { ; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 ; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] ; COMMON: atomicrmw.start: @@ -848,12 +1844,10 @@ attributes #0 = { "denormal-fp-mode"="preserve-sign,preserve-sign" } attributes #1 = { "denormal-fp-mode"="dynamic,dynamic" } !0 = !{} -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; GFX10: {{.*}} -; GFX11: {{.*}} -; GFX12: {{.*}} -; GFX803: {{.*}} -; GFX906: {{.*}} -; GFX908: {{.*}} -; GFX90A: {{.*}} -; GFX940: {{.*}} +;. +; GFX90A: [[META0]] = !{} +;. +; GFX940: [[META0]] = !{} +;. +; GFX12: [[META0]] = !{} +;. 
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2f16-system.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2f16-system.ll index 20f39b5ed0c860..3478f23e0b200d 100644 --- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2f16-system.ll +++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2f16-system.ll @@ -41,264 +41,1260 @@ ;--------------------------------------------------------------------- define <2 x half> @test_atomicrmw_fadd_v2f16_global_system(ptr addrspace(1) %ptr, <2 x half> %value) { -; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] { -; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] -; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 -; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret <2 x half> [[TMP5]] +; GFX803-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] { +; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: 
atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret <2 x half> [[TMP5]] +; +; GFX906-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] { +; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret <2 x half> [[TMP5]] +; +; GFX908-LABEL: define <2 x half> 
@test_atomicrmw_fadd_v2f16_global_system( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] { +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret <2 x half> [[TMP5]] +; +; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] { +; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4 +; GFX90A-NEXT: ret <2 x half> [[RES]] +; +; GFX940-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4 +; GFX940-NEXT: ret <2 x half> [[RES]] +; +; GFX10-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] { +; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: 
br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret <2 x half> [[TMP5]] +; +; GFX11-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] { +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret <2 x half> [[TMP5]] +; +; GFX12-LABEL: define <2 x half> 
@test_atomicrmw_fadd_v2f16_global_system( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4 +; GFX12-NEXT: ret <2 x half> [[RES]] ; %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value seq_cst ret <2 x half> %res } define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %value) { -; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] -; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 -; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret <2 x half> [[TMP5]] +; GFX803-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: 
atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret <2 x half> [[TMP5]] +; +; GFX906-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret <2 x half> [[TMP5]] +; +; GFX908-LABEL: define <2 
x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret <2 x half> [[TMP5]] +; +; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]] +; GFX90A-NEXT: ret <2 x half> [[RES]] +; +; GFX940-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]] +; GFX940-NEXT: ret <2 x half> [[RES]] +; +; GFX10-LABEL: define <2 x half> 
@test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret <2 x half> [[TMP5]] +; +; GFX11-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { 
i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret <2 x half> [[TMP5]] +; +; GFX12-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]] +; GFX12-NEXT: ret <2 x half> [[RES]] ; %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x half> %res } define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %value) { -; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_remote_memory( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] -; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 -; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; 
COMMON-NEXT: ret <2 x half> [[TMP5]] +; GFX803-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_remote_memory( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret <2 x half> [[TMP5]] +; +; GFX906-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_remote_memory( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = 
extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret <2 x half> [[TMP5]] +; +; GFX908-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_remote_memory( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret <2 x half> [[TMP5]] +; +; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_remote_memory( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] +; GFX90A-NEXT: ret <2 x half> [[RES]] +; +; GFX940-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_remote_memory( +; GFX940-SAME: ptr addrspace(1) 
[[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] +; GFX940-NEXT: ret <2 x half> [[RES]] +; +; GFX10-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_remote_memory( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret <2 x half> [[TMP5]] +; +; GFX11-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_remote_memory( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x 
half> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret <2 x half> [[TMP5]] +; +; GFX12-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_remote_memory( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] +; GFX12-NEXT: ret <2 x half> [[RES]] ; %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value seq_cst, !amdgpu.no.remote.memory !0 ret <2 x half> %res } define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %value) { -; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] -; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 -; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = 
extractvalue { i32, i1 } [[TMP4]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret <2 x half> [[TMP5]] +; GFX803-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret <2 x half> [[TMP5]] +; +; GFX906-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], 
[[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret <2 x half> [[TMP5]] +; +; GFX908-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret <2 x half> [[TMP5]] +; +; GFX90A-LABEL: define <2 x half> 
@test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] +; GFX90A-NEXT: ret <2 x half> [[RES]] +; +; GFX940-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] +; GFX940-NEXT: ret <2 x half> [[RES]] +; +; GFX10-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret 
<2 x half> [[TMP5]] +; +; GFX11-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret <2 x half> [[TMP5]] +; +; GFX12-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] +; GFX12-NEXT: ret <2 x half> [[RES]] ; %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 ret <2 x half> %res } define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_daz(ptr addrspace(1) %ptr, <2 x half> %value) #0 { -; COMMON-LABEL: define 
<2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_daz( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] { -; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] -; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 -; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret <2 x half> [[TMP5]] +; GFX803-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_daz( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] { +; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 
[[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret <2 x half> [[TMP5]] +; +; GFX906-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_daz( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] { +; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret <2 x half> [[TMP5]] +; +; GFX908-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_daz( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] { +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label 
[[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret <2 x half> [[TMP5]] +; +; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_daz( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] { +; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] +; GFX90A-NEXT: ret <2 x half> [[RES]] +; +; GFX940-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_daz( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] +; GFX940-NEXT: ret <2 x half> [[RES]] +; +; GFX10-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_daz( +; GFX10-SAME: ptr 
addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] { +; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret <2 x half> [[TMP5]] +; +; GFX11-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_daz( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] { +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; 
GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret <2 x half> [[TMP5]] +; +; GFX12-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_daz( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] +; GFX12-NEXT: ret <2 x half> [[RES]] ; %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 ret <2 x half> %res } define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_dynamic(ptr addrspace(1) %ptr, <2 x half> %value) #1 { -; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_dynamic( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] { -; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] -; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 -; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; COMMON-NEXT: 
[[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret <2 x half> [[TMP5]] +; GFX803-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_dynamic( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] { +; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret <2 x half> [[TMP5]] +; +; GFX906-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_dynamic( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] { +; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ 
[[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret <2 x half> [[TMP5]] +; +; GFX908-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_dynamic( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] { +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret <2 x half> [[TMP5]] +; +; GFX90A-LABEL: define <2 x half> 
@test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_dynamic( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] { +; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] +; GFX90A-NEXT: ret <2 x half> [[RES]] +; +; GFX940-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_dynamic( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] +; GFX940-NEXT: ret <2 x half> [[RES]] +; +; GFX10-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_dynamic( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] { +; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; 
GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret <2 x half> [[TMP5]] +; +; GFX11-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_dynamic( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] { +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret <2 x half> [[TMP5]] +; +; GFX12-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_dynamic( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] +; GFX12-NEXT: ret <2 x half> [[RES]] ; %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 ret <2 x half> %res } 
define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, <2 x half> %value) { -; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] -; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 -; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret <2 x half> [[TMP5]] +; GFX803-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr 
addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret <2 x half> [[TMP5]] +; +; GFX906-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret <2 x half> [[TMP5]] +; +; GFX908-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ 
[[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret <2 x half> [[TMP5]] +; +; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]] +; GFX90A-NEXT: ret <2 x half> [[RES]] +; +; GFX940-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]] +; GFX940-NEXT: ret <2 x half> [[RES]] +; +; GFX10-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: 
[[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret <2 x half> [[TMP5]] +; +; GFX11-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret <2 x half> [[TMP5]] +; +; GFX12-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; 
GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]] +; GFX12-NEXT: ret <2 x half> [[RES]] ; %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value seq_cst, align 4, !amdgpu.ignore.denormal.mode !0 ret <2 x half> %res } define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %value) { -; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] -; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 -; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret <2 x half> [[TMP5]] +; GFX803-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label 
[[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret <2 x half> [[TMP5]] +; +; GFX906-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; 
GFX906-NEXT: ret <2 x half> [[TMP5]] +; +; GFX908-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret <2 x half> [[TMP5]] +; +; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX90A-NEXT: ret <2 x half> [[RES]] +; +; GFX940-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] 
seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX940-NEXT: ret <2 x half> [[RES]] +; +; GFX10-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret <2 x half> [[TMP5]] +; +; GFX11-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; 
GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret <2 x half> [[TMP5]] +; +; GFX12-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX12-NEXT: ret <2 x half> [[RES]] ; %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 ret <2 x half> %res } define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %value) { -; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] -; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 -; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) 
[[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret <2 x half> [[TMP5]] +; GFX803-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret <2 x half> [[TMP5]] +; +; GFX906-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: 
[[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret <2 x half> [[TMP5]] +; +; GFX908-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret <2 x half> [[TMP5]] +; +; GFX90A-LABEL: define <2 x half> 
@test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX90A-NEXT: ret <2 x half> [[RES]] +; +; GFX940-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX940-NEXT: ret <2 x half> [[RES]] +; +; GFX10-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret <2 x half> 
[[TMP5]] +; +; GFX11-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret <2 x half> [[TMP5]] +; +; GFX12-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX12-NEXT: ret <2 x half> [[RES]] ; %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value seq_cst, align 4, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0 ret <2 x half> %res } define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %value) { -; COMMON-LABEL: define <2 x half> 
@test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] -; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 -; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret <2 x half> [[TMP5]] +; GFX803-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, 
align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret <2 x half> [[TMP5]] +; +; GFX906-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret <2 x half> [[TMP5]] +; +; GFX908-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; 
GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret <2 x half> [[TMP5]] +; +; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX90A-NEXT: ret <2 x half> [[RES]] +; +; GFX940-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX940-NEXT: ret <2 x half> [[RES]] +; +; GFX10-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; 
GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret <2 x half> [[TMP5]] +; +; GFX11-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 
+; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret <2 x half> [[TMP5]] +; +; GFX12-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX12-NEXT: ret <2 x half> [[RES]] ; %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0 ret <2 x half> %res } define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(ptr addrspace(1) %ptr, <2 x half> %value) #0 { -; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1]] { -; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] -; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 -; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret <2 x half> [[TMP5]] +; GFX803-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1]] { +; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret <2 x half> [[TMP5]] +; +; GFX906-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1]] { +; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; 
GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret <2 x half> [[TMP5]] +; +; GFX908-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1]] { +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: 
atomicrmw.end: +; GFX908-NEXT: ret <2 x half> [[TMP5]] +; +; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1]] { +; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX90A-NEXT: ret <2 x half> [[RES]] +; +; GFX940-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX940-NEXT: ret <2 x half> [[RES]] +; +; GFX10-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1]] { +; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: 
[[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret <2 x half> [[TMP5]] +; +; GFX11-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1]] { +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret <2 x half> [[TMP5]] +; +; GFX12-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory 
[[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX12-NEXT: ret <2 x half> [[RES]] ; %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0 ret <2 x half> %res } define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(ptr addrspace(1) %ptr, <2 x half> %value) #1 { -; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2]] { -; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] -; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 -; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret <2 x half> [[TMP5]] +; GFX803-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2]] 
{ +; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret <2 x half> [[TMP5]] +; +; GFX906-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2]] { +; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = 
bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret <2 x half> [[TMP5]] +; +; GFX908-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2]] { +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret <2 x half> [[TMP5]] +; +; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2]] { +; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX90A-NEXT: ret <2 x half> [[RES]] +; +; GFX940-LABEL: define <2 x 
half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX940-NEXT: ret <2 x half> [[RES]] +; +; GFX10-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2]] { +; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret <2 x half> [[TMP5]] +; +; GFX11-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2]] { +; 
GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret <2 x half> [[TMP5]] +; +; GFX12-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX12-NEXT: ret <2 x half> [[RES]] ; %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0 ret <2 x half> %res @@ -310,7 +1306,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denorm define <2 x half> @test_atomicrmw_fsub_v2f16_global_system(ptr addrspace(1) %ptr, <2 x half> %value) { ; COMMON-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_system( -; COMMON-SAME: ptr addrspace(1) 
[[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] { ; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 ; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] ; COMMON: atomicrmw.start: @@ -848,12 +1844,10 @@ attributes #0 = { "denormal-fp-mode"="preserve-sign,preserve-sign" } attributes #1 = { "denormal-fp-mode"="dynamic,dynamic" } !0 = !{} -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; GFX10: {{.*}} -; GFX11: {{.*}} -; GFX12: {{.*}} -; GFX803: {{.*}} -; GFX906: {{.*}} -; GFX908: {{.*}} -; GFX90A: {{.*}} -; GFX940: {{.*}} +;. +; GFX90A: [[META0]] = !{} +;. +; GFX940: [[META0]] = !{} +;. +; GFX12: [[META0]] = !{} +;. diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomicrmw-fp-vector.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomicrmw-fp-vector.ll index 1411890e01dc59..4556ad60ccf8ca 100644 --- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomicrmw-fp-vector.ll +++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomicrmw-fp-vector.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 -; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -passes=atomic-expand -mcpu=gfx900 %s | FileCheck %s -; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -passes=atomic-expand -mcpu=gfx90a %s | FileCheck %s -; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -passes=atomic-expand -mcpu=gfx940 %s | FileCheck %s +; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -passes=atomic-expand -mcpu=gfx900 %s | FileCheck -check-prefixes=CHECK,GFX900 %s +; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -passes=atomic-expand -mcpu=gfx90a %s | FileCheck -check-prefixes=CHECK,GFX90A %s +; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -passes=atomic-expand -mcpu=gfx940 %s | FileCheck -check-prefixes=CHECK,GFX940 %s ;--------------------------------------------------------------------- ; 
atomicrmw fadd @@ -70,44 +70,76 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent_align2(ptr addrspac } define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent_align4(ptr addrspace(1) %ptr, <2 x half> %value) { -; CHECK-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent_align4( -; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 -; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] -; CHECK: atomicrmw.start: -; CHECK-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; CHECK-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; CHECK-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 -; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; CHECK-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; CHECK-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> -; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; CHECK: atomicrmw.end: -; CHECK-NEXT: ret <2 x half> [[TMP5]] +; GFX900-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent_align4( +; GFX900-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX900-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX900-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX900: atomicrmw.start: +; GFX900-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX900-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX900-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX900-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 
+; GFX900-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX900-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX900-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX900-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX900-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX900: atomicrmw.end: +; GFX900-NEXT: ret <2 x half> [[TMP5]] +; +; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent_align4( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4 +; GFX90A-NEXT: ret <2 x half> [[RES]] +; +; GFX940-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent_align4( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4 +; GFX940-NEXT: ret <2 x half> [[RES]] ; %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, align 4 ret <2 x half> %res } define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent_align4(ptr addrspace(1) %ptr, <2 x bfloat> %value) { -; CHECK-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent_align4( -; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 -; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] -; CHECK: atomicrmw.start: -; CHECK-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; CHECK-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 -; 
CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; CHECK-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 -; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; CHECK-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; CHECK-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> -; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; CHECK: atomicrmw.end: -; CHECK-NEXT: ret <2 x bfloat> [[TMP5]] +; GFX900-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent_align4( +; GFX900-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX900-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX900-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX900: atomicrmw.start: +; GFX900-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX900-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX900-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX900-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX900-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX900-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX900-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX900-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX900-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX900: atomicrmw.end: +; GFX900-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX90A-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent_align4( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; 
GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX940-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent_align4( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4 +; GFX940-NEXT: ret <2 x bfloat> [[RES]] ; %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, align 4 ret <2 x bfloat> %res diff --git a/llvm/test/Transforms/InstCombine/icmp-add.ll b/llvm/test/Transforms/InstCombine/icmp-add.ll index 6b4e5a5372c523..baa6f3d51a40ef 100644 --- a/llvm/test/Transforms/InstCombine/icmp-add.ll +++ b/llvm/test/Transforms/InstCombine/icmp-add.ll @@ -3023,4 +3023,82 @@ define i1 @icmp_addnuw_nonzero_fail_multiuse(i32 %x, i32 %y) { ret i1 %c } +define i1 @ult_add_C2_pow2_C_neg(i8 %x) { +; CHECK-LABEL: @ult_add_C2_pow2_C_neg( +; CHECK-NEXT: [[TMP1:%.*]] = and i8 [[X:%.*]], -32 +; CHECK-NEXT: [[C:%.*]] = icmp ne i8 [[TMP1]], -64 +; CHECK-NEXT: ret i1 [[C]] +; + %i = add i8 %x, 32 + %c = icmp ult i8 %i, -32 + ret i1 
%c +} + +define i1 @ult_add_nsw_C2_pow2_C_neg(i8 %x) { +; CHECK-LABEL: @ult_add_nsw_C2_pow2_C_neg( +; CHECK-NEXT: [[TMP1:%.*]] = and i8 [[X:%.*]], -32 +; CHECK-NEXT: [[C:%.*]] = icmp ne i8 [[TMP1]], -64 +; CHECK-NEXT: ret i1 [[C]] +; + %i = add nsw i8 %x, 32 + %c = icmp ult i8 %i, -32 + ret i1 %c +} + +define i1 @ult_add_nuw_nsw_C2_pow2_C_neg(i8 %x) { +; CHECK-LABEL: @ult_add_nuw_nsw_C2_pow2_C_neg( +; CHECK-NEXT: [[C:%.*]] = icmp ult i8 [[X:%.*]], -64 +; CHECK-NEXT: ret i1 [[C]] +; + %i = add nuw nsw i8 %x, 32 + %c = icmp ult i8 %i, -32 + ret i1 %c +} + +define i1 @ult_add_C2_neg_C_pow2(i8 %x) { +; CHECK-LABEL: @ult_add_C2_neg_C_pow2( +; CHECK-NEXT: [[TMP1:%.*]] = and i8 [[X:%.*]], -32 +; CHECK-NEXT: [[C:%.*]] = icmp eq i8 [[TMP1]], 32 +; CHECK-NEXT: ret i1 [[C]] +; + %i = add i8 %x, -32 + %c = icmp ult i8 %i, 32 + ret i1 %c +} + +define <2 x i1> @ult_add_C2_pow2_C_neg_vec(<2 x i8> %x) { +; CHECK-LABEL: @ult_add_C2_pow2_C_neg_vec( +; CHECK-NEXT: [[TMP1:%.*]] = and <2 x i8> [[X:%.*]], +; CHECK-NEXT: [[C:%.*]] = icmp ne <2 x i8> [[TMP1]], +; CHECK-NEXT: ret <2 x i1> [[C]] +; + %i = add <2 x i8> %x, + %c = icmp ult <2 x i8> %i, + ret <2 x i1> %c +} + +define i1 @ult_add_C2_pow2_C_neg_multiuse(i8 %x) { +; CHECK-LABEL: @ult_add_C2_pow2_C_neg_multiuse( +; CHECK-NEXT: [[I:%.*]] = add i8 [[X:%.*]], 32 +; CHECK-NEXT: [[C:%.*]] = icmp ult i8 [[I]], -32 +; CHECK-NEXT: call void @use(i8 [[I]]) +; CHECK-NEXT: ret i1 [[C]] +; + %i = add i8 %x, 32 + %c = icmp ult i8 %i, -32 + call void @use(i8 %i) + ret i1 %c +} + +define i1 @uge_add_C2_pow2_C_neg(i8 %x) { +; CHECK-LABEL: @uge_add_C2_pow2_C_neg( +; CHECK-NEXT: [[TMP1:%.*]] = and i8 [[X:%.*]], -32 +; CHECK-NEXT: [[C:%.*]] = icmp eq i8 [[TMP1]], -64 +; CHECK-NEXT: ret i1 [[C]] +; + %i = add i8 %x, 32 + %c = icmp uge i8 %i, -32 + ret i1 %c +} + declare void @llvm.assume(i1) diff --git a/llvm/test/Transforms/InstCombine/trunc.ll b/llvm/test/Transforms/InstCombine/trunc.ll index a180c3d52f6ab1..0956fc655ec9a7 100644 --- 
a/llvm/test/Transforms/InstCombine/trunc.ll +++ b/llvm/test/Transforms/InstCombine/trunc.ll @@ -5,6 +5,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3 ; Instcombine should be able to eliminate all of these ext casts. declare void @use(i32) +declare void @use.i8(i8) declare void @use_vec(<2 x i32>) define i64 @test1(i64 %a) { @@ -217,8 +218,8 @@ define i16 @ashr_mul(i8 %X, i8 %Y) { define i32 @trunc_ashr(i32 %X) { ; CHECK-LABEL: @trunc_ashr( ; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[X:%.*]], 8 -; CHECK-NEXT: [[C:%.*]] = or i32 [[TMP1]], -8388608 -; CHECK-NEXT: ret i32 [[C]] +; CHECK-NEXT: [[TMP2:%.*]] = or i32 [[TMP1]], -8388608 +; CHECK-NEXT: ret i32 [[TMP2]] ; %A = zext i32 %X to i36 %B = or i36 %A, -2147483648 ; 0xF80000000 @@ -230,8 +231,8 @@ define i32 @trunc_ashr(i32 %X) { define <2 x i32> @trunc_ashr_vec(<2 x i32> %X) { ; CHECK-LABEL: @trunc_ashr_vec( ; CHECK-NEXT: [[TMP1:%.*]] = lshr <2 x i32> [[X:%.*]], -; CHECK-NEXT: [[C:%.*]] = or <2 x i32> [[TMP1]], -; CHECK-NEXT: ret <2 x i32> [[C]] +; CHECK-NEXT: [[TMP2:%.*]] = or <2 x i32> [[TMP1]], +; CHECK-NEXT: ret <2 x i32> [[TMP2]] ; %A = zext <2 x i32> %X to <2 x i36> %B = or <2 x i36> %A, ; 0xF80000000 @@ -305,8 +306,8 @@ define <2 x i64> @test8_vec_poison(<2 x i32> %A, <2 x i32> %B) { ; CHECK-NEXT: [[C:%.*]] = zext <2 x i32> [[A:%.*]] to <2 x i64> ; CHECK-NEXT: [[D:%.*]] = zext <2 x i32> [[B:%.*]] to <2 x i64> ; CHECK-NEXT: [[E:%.*]] = shl nuw <2 x i64> [[D]], -; CHECK-NEXT: [[G:%.*]] = or disjoint <2 x i64> [[E]], [[C]] -; CHECK-NEXT: ret <2 x i64> [[G]] +; CHECK-NEXT: [[F:%.*]] = or disjoint <2 x i64> [[E]], [[C]] +; CHECK-NEXT: ret <2 x i64> [[F]] ; %C = zext <2 x i32> %A to <2 x i128> %D = zext <2 x i32> %B to <2 x i128> @@ -392,8 +393,8 @@ define <2 x i64> @test11_vec_poison(<2 x i32> %A, <2 x i32> %B) { ; CHECK-NEXT: [[C:%.*]] = zext <2 x i32> [[A:%.*]] to <2 x i64> ; CHECK-NEXT: [[TMP1:%.*]] = and <2 x i32> [[B:%.*]], ; CHECK-NEXT: [[E:%.*]] = zext nneg <2 x i32> 
[[TMP1]] to <2 x i64> -; CHECK-NEXT: [[G:%.*]] = shl nuw nsw <2 x i64> [[C]], [[E]] -; CHECK-NEXT: ret <2 x i64> [[G]] +; CHECK-NEXT: [[F:%.*]] = shl nuw nsw <2 x i64> [[C]], [[E]] +; CHECK-NEXT: ret <2 x i64> [[F]] ; %C = zext <2 x i32> %A to <2 x i128> %D = zext <2 x i32> %B to <2 x i128> @@ -456,8 +457,8 @@ define <2 x i64> @test12_vec_poison(<2 x i32> %A, <2 x i32> %B) { ; CHECK-NEXT: [[C:%.*]] = zext <2 x i32> [[A:%.*]] to <2 x i64> ; CHECK-NEXT: [[TMP1:%.*]] = and <2 x i32> [[B:%.*]], ; CHECK-NEXT: [[E:%.*]] = zext nneg <2 x i32> [[TMP1]] to <2 x i64> -; CHECK-NEXT: [[G:%.*]] = lshr <2 x i64> [[C]], [[E]] -; CHECK-NEXT: ret <2 x i64> [[G]] +; CHECK-NEXT: [[F:%.*]] = lshr <2 x i64> [[C]], [[E]] +; CHECK-NEXT: ret <2 x i64> [[F]] ; %C = zext <2 x i32> %A to <2 x i128> %D = zext <2 x i32> %B to <2 x i128> @@ -520,8 +521,8 @@ define <2 x i64> @test13_vec_poison(<2 x i32> %A, <2 x i32> %B) { ; CHECK-NEXT: [[C:%.*]] = sext <2 x i32> [[A:%.*]] to <2 x i64> ; CHECK-NEXT: [[TMP1:%.*]] = and <2 x i32> [[B:%.*]], ; CHECK-NEXT: [[E:%.*]] = zext nneg <2 x i32> [[TMP1]] to <2 x i64> -; CHECK-NEXT: [[G:%.*]] = ashr <2 x i64> [[C]], [[E]] -; CHECK-NEXT: ret <2 x i64> [[G]] +; CHECK-NEXT: [[F:%.*]] = ashr <2 x i64> [[C]], [[E]] +; CHECK-NEXT: ret <2 x i64> [[F]] ; %C = sext <2 x i32> %A to <2 x i128> %D = zext <2 x i32> %B to <2 x i128> @@ -1034,8 +1035,8 @@ define i8 @drop_nsw_trunc(i16 %x, i16 %y) { define i8 @drop_nuw_trunc(i16 %x, i16 %y) { ; CHECK-LABEL: @drop_nuw_trunc( ; CHECK-NEXT: [[AND2:%.*]] = and i16 [[X:%.*]], [[Y:%.*]] -; CHECK-NEXT: [[B:%.*]] = trunc i16 [[AND2]] to i8 -; CHECK-NEXT: ret i8 [[B]] +; CHECK-NEXT: [[RES:%.*]] = trunc i16 [[AND2]] to i8 +; CHECK-NEXT: ret i8 [[RES]] ; %and = and i16 %x, 255 %and2 = and i16 %and, %y @@ -1095,3 +1096,31 @@ define <2 x i1> @trunc_nuw_xor_vector(<2 x i8> %x, <2 x i8> %y) { %r = trunc nuw <2 x i8> %xor to <2 x i1> ret <2 x i1> %r } + +define void @pr95547(i32 %x) { +; CHECK-LABEL: @pr95547( +; CHECK-NEXT: 
[[X_TRUNC:%.*]] = trunc i32 [[X:%.*]] to i16 +; CHECK-NEXT: [[DIV:%.*]] = udiv i16 11, [[X_TRUNC]] +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[X]], 256 +; CHECK-NEXT: br i1 [[CMP]], label [[LOOP:%.*]], label [[EXIT:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[TRUNC:%.*]] = trunc nuw nsw i16 [[DIV]] to i8 +; CHECK-NEXT: call void @use.i8(i8 [[TRUNC]]) +; CHECK-NEXT: br label [[LOOP]] +; CHECK: exit: +; CHECK-NEXT: ret void +; + %x.trunc = trunc i32 %x to i16 + %div = udiv i16 11, %x.trunc + %cmp = icmp ult i32 %x, 256 + br i1 %cmp, label %loop, label %exit + +loop: + ; The loop is just here to prevent sinking. + %trunc = trunc i16 %div to i8 + call void @use.i8(i8 %trunc) + br label %loop + +exit: + ret void +} diff --git a/llvm/test/Transforms/SampleProfile/pseudo-probe-stale-profile-matching-LCS.ll b/llvm/test/Transforms/SampleProfile/pseudo-probe-stale-profile-matching-LCS.ll index ecf8484d98e59d..4b8cd853301ed1 100644 --- a/llvm/test/Transforms/SampleProfile/pseudo-probe-stale-profile-matching-LCS.ll +++ b/llvm/test/Transforms/SampleProfile/pseudo-probe-stale-profile-matching-LCS.ll @@ -1,6 +1,7 @@ ; REQUIRES: x86_64-linux ; REQUIRES: asserts ; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/pseudo-probe-stale-profile-matching-LCS.prof --salvage-stale-profile -S --debug-only=sample-profile,sample-profile-matcher,sample-profile-impl 2>&1 | FileCheck %s +; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/pseudo-probe-stale-profile-matching-LCS.prof --salvage-stale-profile -S --debug-only=sample-profile,sample-profile-matcher,sample-profile-impl --salvage-stale-profile-max-callsites=6 2>&1 | FileCheck %s -check-prefix=CHECK-MAX-CALLSITES ; CHECK: Run stale profile matching for test_direct_call ; CHECK: Location is matched from 1 to 1 @@ -27,6 +28,8 @@ ; CHECK: Callsite with callee:unknown.indirect.callee is matched from 9 to 6 ; CHECK: Callsite with callee:C is matched from 10 to 7 +; CHECK-MAX-CALLSITES: Skip stale profile matching 
for test_direct_call +; CHECK-MAX-CALLSITES-NOT: Skip stale profile matching for test_indirect_call target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll b/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll index c020aeb6483f5c..9ad042c112b4c0 100644 --- a/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll +++ b/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll @@ -202,6 +202,19 @@ define <8 x i8> @abs_different(<8 x i8> %a) { ret <8 x i8> %r } +define <4 x i32> @poison_intrinsic(<2 x i16> %l256) { +; CHECK-LABEL: @poison_intrinsic( +; CHECK-NEXT: [[L266:%.*]] = call <2 x i16> @llvm.abs.v2i16(<2 x i16> [[L256:%.*]], i1 false) +; CHECK-NEXT: [[L267:%.*]] = zext <2 x i16> [[L266]] to <2 x i32> +; CHECK-NEXT: [[L271:%.*]] = shufflevector <2 x i32> [[L267]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: ret <4 x i32> [[L271]] +; + %l266 = call <2 x i16> @llvm.abs.v2i16(<2 x i16> %l256, i1 false) + %l267 = zext <2 x i16> %l266 to <2 x i32> + %l271 = shufflevector <2 x i32> %l267, <2 x i32> poison, <4 x i32> + ret <4 x i32> %l271 +} + define <8 x half> @splat0(<8 x half> %a, <8 x half> %b) { ; CHECK-LABEL: @splat0( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x half> [[B:%.*]], <8 x half> poison, <8 x i32> zeroinitializer diff --git a/llvm/utils/release/build_llvm_release.bat b/llvm/utils/release/build_llvm_release.bat index dc63fdac1e4adc..cef7c8661b7167 100755 --- a/llvm/utils/release/build_llvm_release.bat +++ b/llvm/utils/release/build_llvm_release.bat @@ -194,7 +194,7 @@ set cmake_flags=^ -DLLDB_TEST_COMPILER=%stage0_bin_dir%/clang.exe ^ -DPYTHON_HOME=%PYTHONHOME% ^ -DPython3_ROOT_DIR=%PYTHONHOME% ^ - -DLIBXML2_INCLUDE_DIRS=%libxmldir%/include/libxml2 ^ + -DLIBXML2_INCLUDE_DIR=%libxmldir%/include/libxml2 ^ -DLIBXML2_LIBRARIES=%libxmldir%/lib/libxml2s.lib cmake 
-GNinja %cmake_flags% %llvm_src%\llvm || exit /b 1 @@ -250,7 +250,7 @@ set cmake_flags=^ -DLLDB_TEST_COMPILER=%stage0_bin_dir%/clang.exe ^ -DPYTHON_HOME=%PYTHONHOME% ^ -DPython3_ROOT_DIR=%PYTHONHOME% ^ - -DLIBXML2_INCLUDE_DIRS=%libxmldir%/include/libxml2 ^ + -DLIBXML2_INCLUDE_DIR=%libxmldir%/include/libxml2 ^ -DLIBXML2_LIBRARIES=%libxmldir%/lib/libxml2s.lib cmake -GNinja %cmake_flags% %llvm_src%\llvm || exit /b 1 @@ -317,7 +317,7 @@ set "stage0_bin_dir=%build_dir%/build_arm64_stage0/bin" set cmake_flags=^ %common_cmake_flags% ^ -DCLANG_DEFAULT_LINKER=lld ^ - -DLIBXML2_INCLUDE_DIRS=%libxmldir%/include/libxml2 ^ + -DLIBXML2_INCLUDE_DIR=%libxmldir%/include/libxml2 ^ -DLIBXML2_LIBRARIES=%libxmldir%/lib/libxml2s.lib ^ -DPython3_ROOT_DIR=%PYTHONHOME% ^ -DCOMPILER_RT_BUILD_PROFILE=OFF ^ diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseIterationToScf.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseIterationToScf.cpp index 62887c75c872b4..f57be49f21b8c8 100644 --- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseIterationToScf.cpp +++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseIterationToScf.cpp @@ -34,6 +34,20 @@ convertIterSpaceType(IterSpaceType itSp, SmallVectorImpl &fields) { return success(); } +static std::optional +convertIteratorType(IteratorType itTp, SmallVectorImpl &fields) { + // The actually Iterator Values (that are updated every iteration). + auto idxTp = IndexType::get(itTp.getContext()); + // TODO: handle batch dimension. + assert(itTp.getEncoding().getBatchLvlRank() == 0); + if (!itTp.isUnique()) { + // Segment high for non-unique iterator. + fields.push_back(idxTp); + } + fields.push_back(idxTp); + return success(); +} + namespace { /// Sparse codegen rule for number of entries operator. 
@@ -57,10 +71,114 @@ class ExtractIterSpaceConverter } }; +class SparseIterateOpConverter : public OneToNOpConversionPattern { +public: + using OneToNOpConversionPattern::OneToNOpConversionPattern; + LogicalResult + matchAndRewrite(IterateOp op, OpAdaptor adaptor, + OneToNPatternRewriter &rewriter) const override { + if (!op.getCrdUsedLvls().empty()) + return rewriter.notifyMatchFailure( + op, "non-empty coordinates list not implemented."); + + Location loc = op.getLoc(); + + auto iterSpace = SparseIterationSpace::fromValues( + op.getIterSpace().getType(), adaptor.getIterSpace(), 0); + + std::unique_ptr it = + iterSpace.extractIterator(rewriter, loc); + + if (it->iteratableByFor()) { + auto [lo, hi] = it->genForCond(rewriter, loc); + Value step = constantIndex(rewriter, loc, 1); + SmallVector ivs; + for (ValueRange inits : adaptor.getInitArgs()) + llvm::append_range(ivs, inits); + scf::ForOp forOp = rewriter.create(loc, lo, hi, step, ivs); + + Block *loopBody = op.getBody(); + OneToNTypeMapping bodyTypeMapping(loopBody->getArgumentTypes()); + if (failed(typeConverter->convertSignatureArgs( + loopBody->getArgumentTypes(), bodyTypeMapping))) + return failure(); + rewriter.applySignatureConversion(loopBody, bodyTypeMapping); + + forOp.getBody()->erase(); + Region &dstRegion = forOp.getRegion(); + rewriter.inlineRegionBefore(op.getRegion(), dstRegion, dstRegion.end()); + + auto yieldOp = + llvm::cast(forOp.getBody()->getTerminator()); + + rewriter.setInsertionPointToEnd(forOp.getBody()); + // replace sparse_tensor.yield with scf.yield. 
+ rewriter.create(loc, yieldOp.getResults()); + yieldOp.erase(); + + const OneToNTypeMapping &resultMapping = adaptor.getResultMapping(); + rewriter.replaceOp(op, forOp.getResults(), resultMapping); + } else { + SmallVector ivs; + llvm::append_range(ivs, it->getCursor()); + for (ValueRange inits : adaptor.getInitArgs()) + llvm::append_range(ivs, inits); + + assert(llvm::all_of(ivs, [](Value v) { return v != nullptr; })); + + TypeRange types = ValueRange(ivs).getTypes(); + auto whileOp = rewriter.create(loc, types, ivs); + SmallVector l(types.size(), op.getIterator().getLoc()); + + // Generates loop conditions. + Block *before = rewriter.createBlock(&whileOp.getBefore(), {}, types, l); + rewriter.setInsertionPointToStart(before); + ValueRange bArgs = before->getArguments(); + auto [whileCond, remArgs] = it->genWhileCond(rewriter, loc, bArgs); + assert(remArgs.size() == adaptor.getInitArgs().size()); + rewriter.create(loc, whileCond, before->getArguments()); + + // Generates loop body. + Block *loopBody = op.getBody(); + OneToNTypeMapping bodyTypeMapping(loopBody->getArgumentTypes()); + if (failed(typeConverter->convertSignatureArgs( + loopBody->getArgumentTypes(), bodyTypeMapping))) + return failure(); + rewriter.applySignatureConversion(loopBody, bodyTypeMapping); + + Region &dstRegion = whileOp.getAfter(); + // TODO: handle uses of coordinate! + rewriter.inlineRegionBefore(op.getRegion(), dstRegion, dstRegion.end()); + ValueRange aArgs = whileOp.getAfterArguments(); + auto yieldOp = llvm::cast( + whileOp.getAfterBody()->getTerminator()); + + rewriter.setInsertionPointToEnd(whileOp.getAfterBody()); + + aArgs = it->linkNewScope(aArgs); + ValueRange nx = it->forward(rewriter, loc); + SmallVector yields; + llvm::append_range(yields, nx); + llvm::append_range(yields, yieldOp.getResults()); + + // replace sparse_tensor.yield with scf.yield. 
+ yieldOp->erase(); + rewriter.create(loc, yields); + + const OneToNTypeMapping &resultMapping = adaptor.getResultMapping(); + rewriter.replaceOp( + op, whileOp.getResults().drop_front(it->getCursor().size()), + resultMapping); + } + return success(); + } +}; + } // namespace mlir::SparseIterationTypeConverter::SparseIterationTypeConverter() { addConversion([](Type type) { return type; }); + addConversion(convertIteratorType); addConversion(convertIterSpaceType); addSourceMaterialization([](OpBuilder &builder, IterSpaceType spTp, @@ -74,5 +192,6 @@ mlir::SparseIterationTypeConverter::SparseIterationTypeConverter() { void mlir::populateLowerSparseIterationToSCFPatterns( TypeConverter &converter, RewritePatternSet &patterns) { - patterns.add(converter, patterns.getContext()); + patterns.add( + converter, patterns.getContext()); } diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorIterator.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorIterator.cpp index be8e15d6ae6f4e..ef95fcc84bd907 100644 --- a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorIterator.cpp +++ b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorIterator.cpp @@ -331,6 +331,13 @@ class TrivialIterator : public ConcreteIterator { TrivialIterator(const SparseTensorLevel &stl) : ConcreteIterator(stl, IterKind::kTrivial, /*itValCnt=*/1) {} + TrivialIterator(OpBuilder &b, Location l, const SparseTensorLevel &stl, + Value posLo, Value posHi) + : ConcreteIterator(stl, IterKind::kTrivial, /*itValCnt=*/1), posLo(posLo), + posHi(posHi) { + seek(posLo); + } + std::string getDebugInterfacePrefix() const override { return std::string("trivial<") + stl.toString() + ">"; } @@ -420,6 +427,14 @@ class DedupIterator : public ConcreteIterator { : ConcreteIterator(stl, IterKind::kDedup, /*itValCnt=*/2) { assert(!stl.isUnique()); } + + DedupIterator(OpBuilder &b, Location l, const SparseTensorLevel &stl, + Value posLo, Value posHi) + : ConcreteIterator(stl, 
IterKind::kDedup, /*itValCnt=*/2), posHi(posHi) { + assert(!stl.isUnique()); + seek({posLo, genSegmentHigh(b, l, posLo)}); + } + // For LLVM-style RTTI. static bool classof(const SparseIterator *from) { return from->kind == IterKind::kDedup; @@ -1532,6 +1547,11 @@ SparseIterationSpace mlir::sparse_tensor::SparseIterationSpace::fromValues( return space; } +std::unique_ptr +SparseIterationSpace::extractIterator(OpBuilder &b, Location l) const { + return makeSimpleIterator(b, l, *this); +} + //===----------------------------------------------------------------------===// // SparseIterator factory functions. //===----------------------------------------------------------------------===// @@ -1590,6 +1610,26 @@ sparse_tensor::makeSynLevelAndIterator(Value sz, unsigned tid, unsigned lvl, return std::make_pair(std::move(stl), std::move(it)); } +std::unique_ptr +sparse_tensor::makeSimpleIterator(OpBuilder &b, Location l, + const SparseIterationSpace &iterSpace) { + // assert(iterSpace.getSpaceDim() == 1); + std::unique_ptr ret; + if (!iterSpace.isUnique()) { + // We always dedupliate the non-unique level, but we should optimize it away + // if possible. 
+ ret = std::make_unique(b, l, iterSpace.getLastLvl(), + iterSpace.getBoundLo(), + iterSpace.getBoundHi()); + } else { + ret = std::make_unique(b, l, iterSpace.getLastLvl(), + iterSpace.getBoundLo(), + iterSpace.getBoundHi()); + } + ret->setSparseEmitStrategy(SparseEmitStrategy::kFunctional); + return ret; +} + std::unique_ptr sparse_tensor::makeSimpleIterator(const SparseTensorLevel &stl, SparseEmitStrategy strategy) { diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorIterator.h b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorIterator.h index 17636af2b2f9d9..91f363db93f1df 100644 --- a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorIterator.h +++ b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorIterator.h @@ -132,6 +132,10 @@ class SparseIterationSpace { Value getBoundLo() const { return bound.first; } Value getBoundHi() const { return bound.second; } + // Extract an iterator to iterate over the sparse iteration space. + std::unique_ptr extractIterator(OpBuilder &b, + Location l) const; + private: SmallVector> lvls; std::pair bound; @@ -192,6 +196,13 @@ class SparseIterator { crd = nullptr; } + // Reconstructs a iteration space directly from the provided ValueRange. + static std::unique_ptr + fromValues(IteratorType dstTp, ValueRange values, unsigned tid); + + // The inverse operation of `fromValues`. + SmallVector toValues() const { llvm_unreachable("Not implemented"); } + // // Iterator properties. // @@ -345,12 +356,21 @@ std::unique_ptr makeSparseTensorLevel(OpBuilder &b, unsigned tid, Level lvl); -/// Helper function to create a TensorLevel object from given `tensor`. +/// Helper function to create a TensorLevel object from given ValueRange. std::unique_ptr makeSparseTensorLevel(LevelType lt, Value sz, ValueRange buffers, unsigned tid, Level l); -/// Helper function to create a simple SparseIterator object that iterates -/// over the SparseTensorLevel. 
+ +/// Helper function to create a simple SparseIterator object that iterate +/// over the entire iteration space. +std::unique_ptr +makeSimpleIterator(OpBuilder &b, Location l, + const SparseIterationSpace &iterSpace); + +/// Helper function to create a simple SparseIterator object that iterate +/// over the sparse tensor level. +/// TODO: switch to `SparseIterationSpace` (which support N-D iterator) when +/// feature complete. std::unique_ptr makeSimpleIterator( const SparseTensorLevel &stl, SparseEmitStrategy strategy = SparseEmitStrategy::kFunctional); diff --git a/mlir/test/Dialect/SparseTensor/sparse_iteration_to_scf.mlir b/mlir/test/Dialect/SparseTensor/sparse_iteration_to_scf.mlir index 5fcd661bb69b24..77a0e89dc7c81a 100644 --- a/mlir/test/Dialect/SparseTensor/sparse_iteration_to_scf.mlir +++ b/mlir/test/Dialect/SparseTensor/sparse_iteration_to_scf.mlir @@ -1,4 +1,5 @@ // RUN: mlir-opt %s --lower-sparse-iteration-to-scf | FileCheck %s +// RUN: mlir-opt %s --sparse-space-collapse --lower-sparse-iteration-to-scf | FileCheck %s --check-prefix COLLAPSED #COO = #sparse_tensor.encoding<{ map = (i, j) -> ( @@ -7,17 +8,44 @@ ) }> -// CHECK-LABEL: func.func @sparse_1D_space( -// CHECK-SAME: %[[VAL_0:.*]]: tensor) -> !sparse_tensor.iter_space<#sparse{{[0-9]*}}, lvls = 0> { -// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index -// CHECK-DAG: %[[LVL_SIZE:.*]] = sparse_tensor.lvl %[[VAL_0]], %[[C0]] : tensor -// CHECK: %[[POS_MEM:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 0 : index} : tensor to memref -// CHECK: %[[CRD_MEM:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 0 : index} : tensor to memref -// CHECK: %[[POS_LO:.*]] = memref.load %[[POS_MEM]]{{\[}}%[[C0]]] : memref -// CHECK: %[[POS_HI:.*]] = memref.load %[[POS_MEM]]{{\[}}%[[C1]]] : memref -// CHECK: %[[ITER_SPACE:.*]] = builtin.unrealized_conversion_cast %[[POS_MEM]], %[[CRD_MEM]], %[[LVL_SIZE]], %[[POS_LO]], %[[POS_HI]] -func.func 
@sparse_1D_space(%sp : tensor) -> !sparse_tensor.iter_space<#COO, lvls = 0> { - %l1 = sparse_tensor.extract_iteration_space %sp lvls = 0 : tensor -> !sparse_tensor.iter_space<#COO, lvls = 0> - return %l1 : !sparse_tensor.iter_space<#COO, lvls = 0> +// CHECK-LABEL: @sparse_iteration_to_scf +// // deduplication +// CHECK: scf.while {{.*}} { +// CHECK: } do { +// CHECK: } +// CHECK: scf.while {{.*}} { +// CHECK: } do { +// // actual computation +// CHECK: scf.for {{.*}} { +// CHECK: arith.addi +// CHECK: } +// // deduplication +// CHECK: scf.while {{.*}} { +// CHECK: } do { +// CHECK: } +// CHECK: scf.yield +// CHECK: } +// CHECK: return + +// COLLAPSED-LABEL: @sparse_iteration_to_scf +// COLLAPSED: %[[RET:.*]] = scf.for {{.*}} { +// COLLAPSED: %[[VAL:.*]] = arith.addi +// COLLAPSED: scf.yield %[[VAL]] : index +// COLLAPSED: } +// COLLAPSED: return %[[RET]] : index +func.func @sparse_iteration_to_scf(%sp : tensor<4x8xf32, #COO>) -> index { + %i = arith.constant 0 : index + %c1 = arith.constant 1 : index + %l1 = sparse_tensor.extract_iteration_space %sp lvls = 0 + : tensor<4x8xf32, #COO> -> !sparse_tensor.iter_space<#COO, lvls = 0> + %r1 = sparse_tensor.iterate %it1 in %l1 iter_args(%outer = %i): !sparse_tensor.iter_space<#COO, lvls = 0 to 1> -> index { + %l2 = sparse_tensor.extract_iteration_space %sp at %it1 lvls = 1 + : tensor<4x8xf32, #COO>, !sparse_tensor.iterator<#COO, lvls = 0 to 1> -> !sparse_tensor.iter_space<#COO, lvls = 1> + %r2 = sparse_tensor.iterate %it2 in %l2 iter_args(%inner = %outer): !sparse_tensor.iter_space<#COO, lvls = 1 to 2> -> index { + %k = arith.addi %inner, %c1 : index + sparse_tensor.yield %k : index + } + sparse_tensor.yield %r2 : index + } + return %r1 : index } diff --git a/mlir/test/mlir-vulkan-runner/vector-deinterleave.mlir b/mlir/test/mlir-vulkan-runner/vector-deinterleave.mlir new file mode 100644 index 00000000000000..b4d4b9aa64b243 --- /dev/null +++ b/mlir/test/mlir-vulkan-runner/vector-deinterleave.mlir @@ -0,0 +1,81 @@ +// RUN: 
mlir-vulkan-runner %s \ +// RUN: --shared-libs=%vulkan-runtime-wrappers,%mlir_runner_utils \ +// RUN: --entry-point-result=void --debug-only=dialect-conversion | FileCheck %s + +// CHECK: [0, 2] +// CHECK: [1, 3] +module attributes { + gpu.container_module, + spirv.target_env = #spirv.target_env< + #spirv.vce, #spirv.resource_limits<>> +} { + gpu.module @kernels { + gpu.func @kernel_vector_deinterleave(%arg0 : memref<4xi32>, %arg1 : memref<2xi32>, %arg2 : memref<2xi32>) + kernel attributes { spirv.entry_point_abi = #spirv.entry_point_abi} { + + %idx0 = arith.constant 0 : index + %idx1 = arith.constant 1 : index + %idx2 = arith.constant 2 : index + %idx3 = arith.constant 3 : index + + %src = arith.constant dense<[0, 0, 0, 0]> : vector<4xi32> + + %val0 = memref.load %arg0[%idx0] : memref<4xi32> + %val1 = memref.load %arg0[%idx1] : memref<4xi32> + %val2 = memref.load %arg0[%idx2] : memref<4xi32> + %val3 = memref.load %arg0[%idx3] : memref<4xi32> + + %src0 = vector.insert %val0, %src[0] : i32 into vector<4xi32> + %src1 = vector.insert %val1, %src0[1] : i32 into vector<4xi32> + %src2 = vector.insert %val2, %src1[2] : i32 into vector<4xi32> + %src3 = vector.insert %val3, %src2[3] : i32 into vector<4xi32> + + %res0, %res1 = vector.deinterleave %src3 : vector<4xi32> -> vector<2xi32> + + %res0_0 = vector.extract %res0[0] : i32 from vector<2xi32> + %res0_1 = vector.extract %res0[1] : i32 from vector<2xi32> + %res1_0 = vector.extract %res1[0] : i32 from vector<2xi32> + %res1_1 = vector.extract %res1[1] : i32 from vector<2xi32> + + memref.store %res0_0, %arg1[%idx0]: memref<2xi32> + memref.store %res0_1, %arg1[%idx1]: memref<2xi32> + memref.store %res1_0, %arg2[%idx0]: memref<2xi32> + memref.store %res1_1, %arg2[%idx1]: memref<2xi32> + + gpu.return + } + } + + func.func @main() { + %idx0 = arith.constant 0 : index + %idx1 = arith.constant 1 : index + %idx4 = arith.constant 4 : index + + // Allocate 3 buffers. 
+ %buf0 = memref.alloc() : memref<4xi32> + %buf1 = memref.alloc() : memref<2xi32> + %buf2 = memref.alloc() : memref<2xi32> + + // Initialize input buffer. + %buf0_vals = arith.constant dense<[0, 1, 2, 3]> : vector<4xi32> + vector.store %buf0_vals, %buf0[%idx0] : memref<4xi32>, vector<4xi32> + + // Initialize output buffers. + %value0 = arith.constant 0 : i32 + %buf3 = memref.cast %buf1 : memref<2xi32> to memref + %buf4 = memref.cast %buf2 : memref<2xi32> to memref + call @fillResource1DInt(%buf3, %value0) : (memref, i32) -> () + call @fillResource1DInt(%buf4, %value0) : (memref, i32) -> () + + gpu.launch_func @kernels::@kernel_vector_deinterleave + blocks in (%idx4, %idx1, %idx1) threads in (%idx1, %idx1, %idx1) + args(%buf0 : memref<4xi32>, %buf1 : memref<2xi32>, %buf2 : memref<2xi32>) + %buf5 = memref.cast %buf3 : memref to memref<*xi32> + %buf6 = memref.cast %buf4 : memref to memref<*xi32> + call @printMemrefI32(%buf5) : (memref<*xi32>) -> () + call @printMemrefI32(%buf6) : (memref<*xi32>) -> () + return + } + func.func private @fillResource1DInt(%0 : memref, %1 : i32) + func.func private @printMemrefI32(%ptr : memref<*xi32>) +}