Skip to content

Commit

Permalink
[AArch64][GlobalISel] Legalize fp128 types as libcalls for G_FCMP (ll…
Browse files Browse the repository at this point in the history
…vm#98452)

- Generate libcall for supported predicates.
- Generate unsupported predicates as combinations of supported
predicates.
- Vectors are scalarized; however, some cases such as `v3f128_fp128` still fail because G_OR cannot yet be legalized for these types.

GISel now generates the same code as SDAG; note, however, the difference
in the `one` case.
  • Loading branch information
Him188 authored Jul 25, 2024
1 parent 6f37d42 commit ba461f8
Show file tree
Hide file tree
Showing 6 changed files with 1,153 additions and 255 deletions.
3 changes: 3 additions & 0 deletions llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
Original file line number Diff line number Diff line change
Expand Up @@ -280,6 +280,9 @@ class LegalizerHelper {
LegalizeResult createResetStateLibcall(MachineIRBuilder &MIRBuilder,
MachineInstr &MI,
LostDebugLocObserver &LocObserver);
LegalizeResult createFCMPLibcall(MachineIRBuilder &MIRBuilder,
MachineInstr &MI,
LostDebugLocObserver &LocObserver);

MachineInstrBuilder
getNeutralElementForVecReduce(unsigned Opcode, MachineIRBuilder &MIRBuilder,
Expand Down
154 changes: 152 additions & 2 deletions llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -735,8 +735,7 @@ static RTLIB::Libcall getOutlineAtomicLibcall(MachineInstr &MI) {
if (MemType.isVector())
return RTLIB::UNKNOWN_LIBCALL;

#define LCALLS(A, B) \
{ A##B##_RELAX, A##B##_ACQ, A##B##_REL, A##B##_ACQ_REL }
#define LCALLS(A, B) {A##B##_RELAX, A##B##_ACQ, A##B##_REL, A##B##_ACQ_REL}
#define LCALL5(A) \
LCALLS(A, 1), LCALLS(A, 2), LCALLS(A, 4), LCALLS(A, 8), LCALLS(A, 16)
switch (Opc) {
Expand Down Expand Up @@ -992,6 +991,150 @@ LegalizerHelper::createSetStateLibcall(MachineIRBuilder &MIRBuilder,
LocObserver, nullptr);
}

/// Looks up how a floating-point compare predicate is lowered to an fp128
/// runtime call.
///
/// \param Pred the G_FCMP predicate being legalized.
/// \returns a pair of (libcall to emit, integer predicate used to compare the
/// libcall's result against #0). Predicates with no direct libcall yield
/// {RTLIB::UNKNOWN_LIBCALL, CmpInst::BAD_ICMP_PREDICATE}.
static std::pair<RTLIB::Libcall, CmpInst::Predicate>
getFCMPLibcallDesc(const CmpInst::Predicate Pred) {
  // One row per predicate that maps directly onto a single libcall.
  struct FCmpLowering {
    CmpInst::Predicate FPPred;
    RTLIB::Libcall Call;
    CmpInst::Predicate IntPred;
  };
  static const FCmpLowering DirectLowerings[] = {
      {CmpInst::FCMP_OEQ, RTLIB::OEQ_F128, CmpInst::ICMP_EQ},
      {CmpInst::FCMP_UNE, RTLIB::UNE_F128, CmpInst::ICMP_NE},
      {CmpInst::FCMP_OGE, RTLIB::OGE_F128, CmpInst::ICMP_SGE},
      {CmpInst::FCMP_OLT, RTLIB::OLT_F128, CmpInst::ICMP_SLT},
      {CmpInst::FCMP_OLE, RTLIB::OLE_F128, CmpInst::ICMP_SLE},
      {CmpInst::FCMP_OGT, RTLIB::OGT_F128, CmpInst::ICMP_SGT},
      {CmpInst::FCMP_UNO, RTLIB::UO_F128, CmpInst::ICMP_NE},
  };

  for (const FCmpLowering &Row : DirectLowerings)
    if (Row.FPPred == Pred)
      return {Row.Call, Row.IntPred};

  // Everything else has to be synthesized by the caller.
  return {RTLIB::UNKNOWN_LIBCALL, CmpInst::BAD_ICMP_PREDICATE};
}

/// Legalizes a G_FCMP on 128-bit scalar operands by emitting soft-float
/// comparison libcalls.
///
/// Predicates that getFCMPLibcallDesc maps directly become one libcall
/// followed by a G_ICMP of the call result against #0. The remaining
/// predicates are built from those:
///   - FCMP_UEQ  => (OEQ || UNO)
///   - FCMP_ONE  => (!OEQ && !UNO)
///   - FCMP_ULT/UGE/UGT/ULE/ORD => the inverse predicate's libcall with the
///     integer predicate inverted.
///
/// \param MIRBuilder builder used to emit the replacement instructions.
/// \param MI the G_FCMP being legalized; it is not erased here.
/// \param LocObserver forwarded to createLibcall.
/// \returns Legalized on success. UnableToLegalize if the operands are not
/// both s128, if the predicate is not handled, or if a libcall could not be
/// created.
LegalizerHelper::LegalizeResult
LegalizerHelper::createFCMPLibcall(MachineIRBuilder &MIRBuilder,
                                   MachineInstr &MI,
                                   LostDebugLocObserver &LocObserver) {
  auto &MF = MIRBuilder.getMF();
  auto &Ctx = MF.getFunction().getContext();
  const GFCmp *Cmp = cast<GFCmp>(&MI);

  // Only scalar 128-bit operands of matching type are handled here.
  LLT OpLLT = MRI.getType(Cmp->getLHSReg());
  if (OpLLT != LLT::scalar(128) || OpLLT != MRI.getType(Cmp->getRHSReg()))
    return UnableToLegalize;

  Type *OpType = getFloatTypeForLLT(Ctx, OpLLT);

  // DstReg type is s32
  const Register DstReg = Cmp->getReg(0);
  const auto Cond = Cmp->getCond();

  // Reference:
  // https://gcc.gnu.org/onlinedocs/gccint/Soft-float-library-routines.html#Comparison-functions-1
  // Generates a libcall followed by ICMP.
  // Returns the register holding the ICMP result, or an invalid (default)
  // Register if the libcall could not be created.
  const auto BuildLibcall =
      [&](const RTLIB::Libcall Libcall, const CmpInst::Predicate ICmpPred,
          const DstOp &Res = LLT::scalar(32)) -> Register {
    // FCMP libcall always returns an i32, and needs an ICMP with #0.
    constexpr LLT TempLLT = LLT::scalar(32);
    Register Temp = MRI.createGenericVirtualRegister(TempLLT);
    // Generate libcall, holding result in Temp
    const auto Status = createLibcall(
        MIRBuilder, Libcall, {Temp, Type::getInt32Ty(Ctx), 0},
        {{Cmp->getLHSReg(), OpType, 0}, {Cmp->getRHSReg(), OpType, 1}},
        LocObserver, &MI);
    // LegalizeResult is an enum, not a success flag: a bare `!Status` only
    // catches the enumerator whose value is zero. Test against Legalized
    // explicitly so a failed libcall never falls through to the ICMP below
    // with Temp left undefined.
    if (Status != Legalized)
      return {};

    // Compare temp with #0 to get the final result.
    return MIRBuilder
        .buildICmp(ICmpPred, Res, Temp, MIRBuilder.buildConstant(TempLLT, 0))
        .getReg(0);
  };

  // Simple case if we have a direct mapping from predicate to libcall
  if (const auto [Libcall, ICmpPred] = getFCMPLibcallDesc(Cond);
      Libcall != RTLIB::UNKNOWN_LIBCALL &&
      ICmpPred != CmpInst::BAD_ICMP_PREDICATE) {
    if (BuildLibcall(Libcall, ICmpPred, DstReg)) {
      return Legalized;
    }
    return UnableToLegalize;
  }

  // No direct mapping found, should be generated as combination of libcalls.

  switch (Cond) {
  case CmpInst::FCMP_UEQ: {
    // FCMP_UEQ: unordered or equal
    // Convert into (FCMP_OEQ || FCMP_UNO).

    const auto [OeqLibcall, OeqPred] = getFCMPLibcallDesc(CmpInst::FCMP_OEQ);
    const auto Oeq = BuildLibcall(OeqLibcall, OeqPred);

    const auto [UnoLibcall, UnoPred] = getFCMPLibcallDesc(CmpInst::FCMP_UNO);
    const auto Uno = BuildLibcall(UnoLibcall, UnoPred);
    if (!Oeq || !Uno)
      return UnableToLegalize;

    MIRBuilder.buildOr(DstReg, Oeq, Uno);
    break;
  }
  case CmpInst::FCMP_ONE: {
    // FCMP_ONE: ordered and operands are unequal
    // Convert into (!FCMP_OEQ && !FCMP_UNO).

    // We inverse the predicate instead of generating a NOT
    // to save one instruction.
    // On AArch64 isel can even select two cmp into a single ccmp.
    const auto [OeqLibcall, OeqPred] = getFCMPLibcallDesc(CmpInst::FCMP_OEQ);
    const auto NotOeq =
        BuildLibcall(OeqLibcall, CmpInst::getInversePredicate(OeqPred));

    const auto [UnoLibcall, UnoPred] = getFCMPLibcallDesc(CmpInst::FCMP_UNO);
    const auto NotUno =
        BuildLibcall(UnoLibcall, CmpInst::getInversePredicate(UnoPred));

    if (!NotOeq || !NotUno)
      return UnableToLegalize;

    MIRBuilder.buildAnd(DstReg, NotOeq, NotUno);
    break;
  }
  case CmpInst::FCMP_ULT:
  case CmpInst::FCMP_UGE:
  case CmpInst::FCMP_UGT:
  case CmpInst::FCMP_ULE:
  case CmpInst::FCMP_ORD: {
    // Convert into: !(inverse(Cond))
    // E.g. FCMP_ULT becomes !FCMP_OGE
    // This is equivalent to building a G_FCMP with the inverse predicate on
    // the same operands and then a NOT of its result, but saves some
    // instructions: the negation is folded into the integer predicate that
    // compares the libcall result with #0.
    const auto [InversedLibcall, InversedPred] =
        getFCMPLibcallDesc(CmpInst::getInversePredicate(Cond));
    if (!BuildLibcall(InversedLibcall,
                      CmpInst::getInversePredicate(InversedPred), DstReg))
      return UnableToLegalize;
    break;
  }
  default:
    return UnableToLegalize;
  }

  return Legalized;
}

// The function is used to legalize operations that set default environment
// state. In C library a call like `fesetmode(FE_DFL_MODE)` is used for that.
// On most targets supported in glibc FE_DFL_MODE is defined as
Expand Down Expand Up @@ -1138,6 +1281,13 @@ LegalizerHelper::libcall(MachineInstr &MI, LostDebugLocObserver &LocObserver) {
return Status;
break;
}
case TargetOpcode::G_FCMP: {
LegalizeResult Status = createFCMPLibcall(MIRBuilder, MI, LocObserver);
if (Status != Legalized)
return Status;
MI.eraseFromParent();
return Status;
}
case TargetOpcode::G_FPTOSI:
case TargetOpcode::G_FPTOUI: {
// FIXME: Support other types
Expand Down
6 changes: 4 additions & 2 deletions llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -561,7 +561,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
})
.widenScalarOrEltToNextPow2(1)
.clampScalar(0, s32, s32)
.clampScalarOrElt(1, MinFPScalar, s64)
.minScalarOrElt(1, MinFPScalar)
.scalarizeIf(scalarOrEltWiderThan(1, 64), 1)
.minScalarEltSameAsIf(
[=](const LegalityQuery &Query) {
const LLT &Ty = Query.Types[0];
Expand All @@ -573,7 +574,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
.clampNumElements(1, v4s16, v8s16)
.clampNumElements(1, v2s32, v4s32)
.clampMaxNumElements(1, s64, 2)
.moreElementsToNextPow2(1);
.moreElementsToNextPow2(1)
.libcallFor({{s32, s128}});

// Extensions
auto ExtLegalFunc = [=](const LegalityQuery &Query) {
Expand Down
70 changes: 46 additions & 24 deletions llvm/test/CodeGen/AArch64/arm64-ccmp.ll
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -debugify-and-strip-all-safe -mcpu=cyclone -verify-machineinstrs -aarch64-enable-ccmp -aarch64-stress-ccmp | FileCheck %s --check-prefixes=CHECK,SDISEL
; RUN: llc < %s -debugify-and-strip-all-safe -mcpu=cyclone -verify-machineinstrs -aarch64-enable-ccmp -aarch64-stress-ccmp -global-isel -global-isel-abort=2 | FileCheck %s --check-prefixes=CHECK,GISEL
; RUN: llc < %s -debugify-and-strip-all-safe -mcpu=cyclone -verify-machineinstrs -aarch64-enable-ccmp -aarch64-stress-ccmp -global-isel | FileCheck %s --check-prefixes=CHECK,GISEL
target triple = "arm64-apple-ios"

define i32 @single_same(i32 %a, i32 %b) nounwind ssp {
Expand Down Expand Up @@ -950,29 +950,51 @@ define i32 @half_select_and_olt_one(half %v0, half %v1, half %v2, half %v3, i32
; Also verify that we don't try to generate f128 FCCMPs, using RT calls instead.

define i32 @f128_select_and_olt_oge(fp128 %v0, fp128 %v1, fp128 %v2, fp128 %v3, i32 %a, i32 %b) #0 {
; CHECK-LABEL: f128_select_and_olt_oge:
; CHECK: ; %bb.0:
; CHECK-NEXT: sub sp, sp, #80
; CHECK-NEXT: stp x22, x21, [sp, #32] ; 16-byte Folded Spill
; CHECK-NEXT: stp x20, x19, [sp, #48] ; 16-byte Folded Spill
; CHECK-NEXT: stp x29, x30, [sp, #64] ; 16-byte Folded Spill
; CHECK-NEXT: mov x19, x1
; CHECK-NEXT: mov x20, x0
; CHECK-NEXT: stp q2, q3, [sp] ; 32-byte Folded Spill
; CHECK-NEXT: bl ___lttf2
; CHECK-NEXT: cmp w0, #0
; CHECK-NEXT: cset w21, lt
; CHECK-NEXT: ldp q0, q1, [sp] ; 32-byte Folded Reload
; CHECK-NEXT: bl ___getf2
; CHECK-NEXT: cmp w0, #0
; CHECK-NEXT: cset w8, ge
; CHECK-NEXT: tst w8, w21
; CHECK-NEXT: csel w0, w20, w19, ne
; CHECK-NEXT: ldp x29, x30, [sp, #64] ; 16-byte Folded Reload
; CHECK-NEXT: ldp x20, x19, [sp, #48] ; 16-byte Folded Reload
; CHECK-NEXT: ldp x22, x21, [sp, #32] ; 16-byte Folded Reload
; CHECK-NEXT: add sp, sp, #80
; CHECK-NEXT: ret
; SDISEL-LABEL: f128_select_and_olt_oge:
; SDISEL: ; %bb.0:
; SDISEL-NEXT: sub sp, sp, #80
; SDISEL-NEXT: stp x22, x21, [sp, #32] ; 16-byte Folded Spill
; SDISEL-NEXT: stp x20, x19, [sp, #48] ; 16-byte Folded Spill
; SDISEL-NEXT: stp x29, x30, [sp, #64] ; 16-byte Folded Spill
; SDISEL-NEXT: mov x19, x1
; SDISEL-NEXT: mov x20, x0
; SDISEL-NEXT: stp q2, q3, [sp] ; 32-byte Folded Spill
; SDISEL-NEXT: bl ___lttf2
; SDISEL-NEXT: cmp w0, #0
; SDISEL-NEXT: cset w21, lt
; SDISEL-NEXT: ldp q0, q1, [sp] ; 32-byte Folded Reload
; SDISEL-NEXT: bl ___getf2
; SDISEL-NEXT: cmp w0, #0
; SDISEL-NEXT: cset w8, ge
; SDISEL-NEXT: tst w8, w21
; SDISEL-NEXT: csel w0, w20, w19, ne
; SDISEL-NEXT: ldp x29, x30, [sp, #64] ; 16-byte Folded Reload
; SDISEL-NEXT: ldp x20, x19, [sp, #48] ; 16-byte Folded Reload
; SDISEL-NEXT: ldp x22, x21, [sp, #32] ; 16-byte Folded Reload
; SDISEL-NEXT: add sp, sp, #80
; SDISEL-NEXT: ret
;
; GISEL-LABEL: f128_select_and_olt_oge:
; GISEL: ; %bb.0:
; GISEL-NEXT: sub sp, sp, #80
; GISEL-NEXT: stp x22, x21, [sp, #32] ; 16-byte Folded Spill
; GISEL-NEXT: stp x20, x19, [sp, #48] ; 16-byte Folded Spill
; GISEL-NEXT: stp x29, x30, [sp, #64] ; 16-byte Folded Spill
; GISEL-NEXT: stp q3, q2, [sp] ; 32-byte Folded Spill
; GISEL-NEXT: mov x19, x0
; GISEL-NEXT: mov x20, x1
; GISEL-NEXT: bl ___lttf2
; GISEL-NEXT: mov x21, x0
; GISEL-NEXT: ldp q1, q0, [sp] ; 32-byte Folded Reload
; GISEL-NEXT: bl ___getf2
; GISEL-NEXT: cmp w21, #0
; GISEL-NEXT: ccmp w0, #0, #8, lt
; GISEL-NEXT: csel w0, w19, w20, ge
; GISEL-NEXT: ldp x29, x30, [sp, #64] ; 16-byte Folded Reload
; GISEL-NEXT: ldp x20, x19, [sp, #48] ; 16-byte Folded Reload
; GISEL-NEXT: ldp x22, x21, [sp, #32] ; 16-byte Folded Reload
; GISEL-NEXT: add sp, sp, #80
; GISEL-NEXT: ret
%c0 = fcmp olt fp128 %v0, %v1
%c1 = fcmp oge fp128 %v2, %v3
%cr = and i1 %c1, %c0
Expand Down
Loading

0 comments on commit ba461f8

Please sign in to comment.