[RISCV] Set a barrier between mask producer and user of V0 #114012
Conversation
@llvm/pr-subscribers-backend-risc-v

Author: Pengcheng Wang (wangpc-pp)

Changes

Here we add a scheduling mutation in pre-ra scheduling, which will add an artificial dependency edge between a mask producer and its previous nearest instruction that uses the V0 register.

This prevents making live intervals of mask registers longer and as a consequence we can reduce some spills/moves.

From the test changes, we can see some improvements and also some regressions (more vtype toggles).

Partially fixes #113489.

Patch is 435.60 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/114012.diff

33 Files Affected:
diff --git a/llvm/lib/Target/RISCV/CMakeLists.txt b/llvm/lib/Target/RISCV/CMakeLists.txt
index fd049d1a57860e..b95ad9dd428cc9 100644
--- a/llvm/lib/Target/RISCV/CMakeLists.txt
+++ b/llvm/lib/Target/RISCV/CMakeLists.txt
@@ -58,6 +58,7 @@ add_llvm_target(RISCVCodeGen
RISCVTargetMachine.cpp
RISCVTargetObjectFile.cpp
RISCVTargetTransformInfo.cpp
+ RISCVVectorMaskDAGMutation.cpp
RISCVVectorPeephole.cpp
RISCVVLOptimizer.cpp
RISCVZacasABIFix.cpp
diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
index 089dc6c529193d..b88bd18e7c8585 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
@@ -360,6 +360,12 @@ class RISCVPassConfig : public TargetPassConfig {
DAG->addMutation(createStoreClusterDAGMutation(
DAG->TII, DAG->TRI, /*ReorderWhileClustering=*/true));
}
+
+ const RISCVSubtarget &ST = C->MF->getSubtarget<RISCVSubtarget>();
+ if (ST.hasVInstructions()) {
+ DAG = DAG ? DAG : createGenericSchedLive(C);
+ DAG->addMutation(createRISCVVectorMaskDAGMutation(DAG->TRI));
+ }
return DAG;
}
diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.h b/llvm/lib/Target/RISCV/RISCVTargetMachine.h
index ce7b7907e1f3af..1a37891f847ae6 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetMachine.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.h
@@ -61,6 +61,10 @@ class RISCVTargetMachine : public LLVMTargetMachine {
SMRange &SourceRange) const override;
void registerPassBuilderCallbacks(PassBuilder &PB) override;
};
+
+std::unique_ptr<ScheduleDAGMutation>
+createRISCVVectorMaskDAGMutation(const TargetRegisterInfo *TRI);
+
} // namespace llvm
#endif
diff --git a/llvm/lib/Target/RISCV/RISCVVectorMaskDAGMutation.cpp b/llvm/lib/Target/RISCV/RISCVVectorMaskDAGMutation.cpp
new file mode 100644
index 00000000000000..5bdfdd696dd627
--- /dev/null
+++ b/llvm/lib/Target/RISCV/RISCVVectorMaskDAGMutation.cpp
@@ -0,0 +1,102 @@
+//===- RISCVVectorMaskDAGMutation.cpp - RISCV Vector Mask DAGMutation -----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// A schedule mutation that adds a dependency between mask-producing
+// instructions and masked instructions, so that we will extend the live
+// interval of the mask register.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/RISCVMCTargetDesc.h"
+#include "RISCVTargetMachine.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/ScheduleDAGInstrs.h"
+#include "llvm/CodeGen/ScheduleDAGMutation.h"
+
+#define DEBUG_TYPE "machine-scheduler"
+
+namespace llvm {
+
+static inline bool isVectorMaskProducer(const MachineInstr *MI) {
+ switch (RISCV::getRVVMCOpcode(MI->getOpcode())) {
+ // Vector Mask Instructions
+ case RISCV::VMAND_MM:
+ case RISCV::VMNAND_MM:
+ case RISCV::VMANDN_MM:
+ case RISCV::VMXOR_MM:
+ case RISCV::VMOR_MM:
+ case RISCV::VMNOR_MM:
+ case RISCV::VMORN_MM:
+ case RISCV::VMXNOR_MM:
+ case RISCV::VMSBF_M:
+ case RISCV::VMSIF_M:
+ case RISCV::VMSOF_M:
+ case RISCV::VIOTA_M:
+ // Vector Integer Compare Instructions
+ case RISCV::VMSEQ_VV:
+ case RISCV::VMSEQ_VX:
+ case RISCV::VMSEQ_VI:
+ case RISCV::VMSNE_VV:
+ case RISCV::VMSNE_VX:
+ case RISCV::VMSNE_VI:
+ case RISCV::VMSLT_VV:
+ case RISCV::VMSLT_VX:
+ case RISCV::VMSLTU_VV:
+ case RISCV::VMSLTU_VX:
+ case RISCV::VMSLE_VV:
+ case RISCV::VMSLE_VX:
+ case RISCV::VMSLE_VI:
+ case RISCV::VMSLEU_VV:
+ case RISCV::VMSLEU_VX:
+ case RISCV::VMSLEU_VI:
+ case RISCV::VMSGTU_VX:
+ case RISCV::VMSGTU_VI:
+ case RISCV::VMSGT_VX:
+ case RISCV::VMSGT_VI:
+ // Vector Floating-Point Compare Instructions
+ case RISCV::VMFEQ_VV:
+ case RISCV::VMFEQ_VF:
+ case RISCV::VMFNE_VV:
+ case RISCV::VMFNE_VF:
+ case RISCV::VMFLT_VV:
+ case RISCV::VMFLT_VF:
+ case RISCV::VMFLE_VV:
+ case RISCV::VMFLE_VF:
+ case RISCV::VMFGT_VF:
+ case RISCV::VMFGE_VF:
+ return true;
+ }
+ return false;
+}
+
+class RISCVVectorMaskDAGMutation : public ScheduleDAGMutation {
+private:
+ const TargetRegisterInfo *TRI;
+
+public:
+ RISCVVectorMaskDAGMutation(const TargetRegisterInfo *TRI) : TRI(TRI) {}
+
+ void apply(ScheduleDAGInstrs *DAG) override {
+ SUnit *NearestUseV0SU = nullptr;
+ for (SUnit &SU : DAG->SUnits) {
+ const MachineInstr *MI = SU.getInstr();
+ if (MI->findRegisterUseOperand(RISCV::V0, TRI))
+ NearestUseV0SU = &SU;
+
+ if (NearestUseV0SU && NearestUseV0SU != &SU && isVectorMaskProducer(MI))
+ DAG->addEdge(&SU, SDep(NearestUseV0SU, SDep::Artificial));
+ }
+ }
+};
+
+std::unique_ptr<ScheduleDAGMutation>
+createRISCVVectorMaskDAGMutation(const TargetRegisterInfo *TRI) {
+ return std::make_unique<RISCVVectorMaskDAGMutation>(TRI);
+}
+
+} // namespace llvm
diff --git a/llvm/test/CodeGen/RISCV/rvv/constant-folding-crash.ll b/llvm/test/CodeGen/RISCV/rvv/constant-folding-crash.ll
index 7839b602706db1..113154c0f9855b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/constant-folding-crash.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/constant-folding-crash.ll
@@ -19,19 +19,18 @@ define void @constant_folding_crash(ptr %v54, <4 x ptr> %lanes.a, <4 x ptr> %lan
; RV32-LABEL: constant_folding_crash:
; RV32: # %bb.0: # %entry
; RV32-NEXT: lw a0, 8(a0)
+; RV32-NEXT: vmv1r.v v10, v0
; RV32-NEXT: andi a0, a0, 1
; RV32-NEXT: seqz a0, a0
; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
-; RV32-NEXT: vmv.v.x v10, a0
-; RV32-NEXT: vmsne.vi v10, v10, 0
-; RV32-NEXT: vmv1r.v v11, v0
-; RV32-NEXT: vmv1r.v v0, v10
+; RV32-NEXT: vmv.v.x v11, a0
+; RV32-NEXT: vmsne.vi v0, v11, 0
; RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma
; RV32-NEXT: vmerge.vvm v8, v9, v8, v0
; RV32-NEXT: vmv.x.s a0, v8
; RV32-NEXT: vsetvli zero, zero, e8, mf4, ta, ma
; RV32-NEXT: vmv.v.i v8, 0
-; RV32-NEXT: vmv1r.v v0, v11
+; RV32-NEXT: vmv1r.v v0, v10
; RV32-NEXT: vmerge.vim v8, v8, 1, v0
; RV32-NEXT: vrgather.vi v9, v8, 0
; RV32-NEXT: vmsne.vi v0, v9, 0
@@ -43,19 +42,18 @@ define void @constant_folding_crash(ptr %v54, <4 x ptr> %lanes.a, <4 x ptr> %lan
; RV64-LABEL: constant_folding_crash:
; RV64: # %bb.0: # %entry
; RV64-NEXT: ld a0, 8(a0)
+; RV64-NEXT: vmv1r.v v12, v0
; RV64-NEXT: andi a0, a0, 1
; RV64-NEXT: seqz a0, a0
; RV64-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
-; RV64-NEXT: vmv.v.x v12, a0
-; RV64-NEXT: vmsne.vi v12, v12, 0
-; RV64-NEXT: vmv1r.v v13, v0
-; RV64-NEXT: vmv1r.v v0, v12
+; RV64-NEXT: vmv.v.x v13, a0
+; RV64-NEXT: vmsne.vi v0, v13, 0
; RV64-NEXT: vsetvli zero, zero, e64, m2, ta, ma
; RV64-NEXT: vmerge.vvm v8, v10, v8, v0
; RV64-NEXT: vmv.x.s a0, v8
; RV64-NEXT: vsetvli zero, zero, e8, mf4, ta, ma
; RV64-NEXT: vmv.v.i v8, 0
-; RV64-NEXT: vmv1r.v v0, v13
+; RV64-NEXT: vmv1r.v v0, v12
; RV64-NEXT: vmerge.vim v8, v8, 1, v0
; RV64-NEXT: vrgather.vi v9, v8, 0
; RV64-NEXT: vmsne.vi v0, v9, 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum-vp.ll
index 51eb63f5f92212..216300b23f4524 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum-vp.ll
@@ -52,11 +52,10 @@ define <2 x half> @vfmax_vv_v2f16_unmasked(<2 x half> %va, <2 x half> %vb, i32 z
; ZVFH: # %bb.0:
; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
; ZVFH-NEXT: vmfeq.vv v0, v8, v8
-; ZVFH-NEXT: vmfeq.vv v10, v9, v9
-; ZVFH-NEXT: vmerge.vvm v11, v8, v9, v0
-; ZVFH-NEXT: vmv1r.v v0, v10
+; ZVFH-NEXT: vmerge.vvm v10, v8, v9, v0
+; ZVFH-NEXT: vmfeq.vv v0, v9, v9
; ZVFH-NEXT: vmerge.vvm v8, v9, v8, v0
-; ZVFH-NEXT: vfmax.vv v8, v8, v11
+; ZVFH-NEXT: vfmax.vv v8, v8, v10
; ZVFH-NEXT: ret
;
; ZVFHMIN-LABEL: vfmax_vv_v2f16_unmasked:
@@ -66,12 +65,11 @@ define <2 x half> @vfmax_vv_v2f16_unmasked(<2 x half> %va, <2 x half> %vb, i32 z
; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
; ZVFHMIN-NEXT: vmfeq.vv v0, v10, v10
; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v9
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9
; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
-; ZVFHMIN-NEXT: vmfeq.vv v8, v11, v11
-; ZVFHMIN-NEXT: vmerge.vvm v9, v10, v11, v0
-; ZVFHMIN-NEXT: vmv1r.v v0, v8
-; ZVFHMIN-NEXT: vmerge.vvm v8, v11, v10, v0
+; ZVFHMIN-NEXT: vmerge.vvm v9, v10, v8, v0
+; ZVFHMIN-NEXT: vmfeq.vv v0, v8, v8
+; ZVFHMIN-NEXT: vmerge.vvm v8, v8, v10, v0
; ZVFHMIN-NEXT: vfmax.vv v9, v8, v9
; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9
@@ -124,11 +122,10 @@ define <4 x half> @vfmax_vv_v4f16_unmasked(<4 x half> %va, <4 x half> %vb, i32 z
; ZVFH: # %bb.0:
; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
; ZVFH-NEXT: vmfeq.vv v0, v8, v8
-; ZVFH-NEXT: vmfeq.vv v10, v9, v9
-; ZVFH-NEXT: vmerge.vvm v11, v8, v9, v0
-; ZVFH-NEXT: vmv1r.v v0, v10
+; ZVFH-NEXT: vmerge.vvm v10, v8, v9, v0
+; ZVFH-NEXT: vmfeq.vv v0, v9, v9
; ZVFH-NEXT: vmerge.vvm v8, v9, v8, v0
-; ZVFH-NEXT: vfmax.vv v8, v8, v11
+; ZVFH-NEXT: vfmax.vv v8, v8, v10
; ZVFH-NEXT: ret
;
; ZVFHMIN-LABEL: vfmax_vv_v4f16_unmasked:
@@ -138,12 +135,11 @@ define <4 x half> @vfmax_vv_v4f16_unmasked(<4 x half> %va, <4 x half> %vb, i32 z
; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma
; ZVFHMIN-NEXT: vmfeq.vv v0, v10, v10
; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v9
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9
; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma
-; ZVFHMIN-NEXT: vmfeq.vv v8, v11, v11
-; ZVFHMIN-NEXT: vmerge.vvm v9, v10, v11, v0
-; ZVFHMIN-NEXT: vmv.v.v v0, v8
-; ZVFHMIN-NEXT: vmerge.vvm v8, v11, v10, v0
+; ZVFHMIN-NEXT: vmerge.vvm v9, v10, v8, v0
+; ZVFHMIN-NEXT: vmfeq.vv v0, v8, v8
+; ZVFHMIN-NEXT: vmerge.vvm v8, v8, v10, v0
; ZVFHMIN-NEXT: vfmax.vv v9, v8, v9
; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9
@@ -198,11 +194,10 @@ define <8 x half> @vfmax_vv_v8f16_unmasked(<8 x half> %va, <8 x half> %vb, i32 z
; ZVFH: # %bb.0:
; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma
; ZVFH-NEXT: vmfeq.vv v0, v8, v8
-; ZVFH-NEXT: vmfeq.vv v10, v9, v9
-; ZVFH-NEXT: vmerge.vvm v11, v8, v9, v0
-; ZVFH-NEXT: vmv.v.v v0, v10
+; ZVFH-NEXT: vmerge.vvm v10, v8, v9, v0
+; ZVFH-NEXT: vmfeq.vv v0, v9, v9
; ZVFH-NEXT: vmerge.vvm v8, v9, v8, v0
-; ZVFH-NEXT: vfmax.vv v8, v8, v11
+; ZVFH-NEXT: vfmax.vv v8, v8, v10
; ZVFH-NEXT: ret
;
; ZVFHMIN-LABEL: vfmax_vv_v8f16_unmasked:
@@ -214,11 +209,10 @@ define <8 x half> @vfmax_vv_v8f16_unmasked(<8 x half> %va, <8 x half> %vb, i32 z
; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9
; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; ZVFHMIN-NEXT: vmfeq.vv v8, v12, v12
-; ZVFHMIN-NEXT: vmerge.vvm v14, v10, v12, v0
-; ZVFHMIN-NEXT: vmv1r.v v0, v8
-; ZVFHMIN-NEXT: vmerge.vvm v8, v12, v10, v0
-; ZVFHMIN-NEXT: vfmax.vv v10, v8, v14
+; ZVFHMIN-NEXT: vmerge.vvm v8, v10, v12, v0
+; ZVFHMIN-NEXT: vmfeq.vv v0, v12, v12
+; ZVFHMIN-NEXT: vmerge.vvm v10, v12, v10, v0
+; ZVFHMIN-NEXT: vfmax.vv v10, v10, v8
; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10
; ZVFHMIN-NEXT: ret
@@ -274,11 +268,10 @@ define <16 x half> @vfmax_vv_v16f16_unmasked(<16 x half> %va, <16 x half> %vb, i
; ZVFH: # %bb.0:
; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma
; ZVFH-NEXT: vmfeq.vv v0, v8, v8
-; ZVFH-NEXT: vmfeq.vv v12, v10, v10
-; ZVFH-NEXT: vmerge.vvm v14, v8, v10, v0
-; ZVFH-NEXT: vmv1r.v v0, v12
+; ZVFH-NEXT: vmerge.vvm v12, v8, v10, v0
+; ZVFH-NEXT: vmfeq.vv v0, v10, v10
; ZVFH-NEXT: vmerge.vvm v8, v10, v8, v0
-; ZVFH-NEXT: vfmax.vv v8, v8, v14
+; ZVFH-NEXT: vfmax.vv v8, v8, v12
; ZVFH-NEXT: ret
;
; ZVFHMIN-LABEL: vfmax_vv_v16f16_unmasked:
@@ -290,11 +283,10 @@ define <16 x half> @vfmax_vv_v16f16_unmasked(<16 x half> %va, <16 x half> %vb, i
; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10
; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma
-; ZVFHMIN-NEXT: vmfeq.vv v8, v16, v16
-; ZVFHMIN-NEXT: vmerge.vvm v20, v12, v16, v0
-; ZVFHMIN-NEXT: vmv1r.v v0, v8
-; ZVFHMIN-NEXT: vmerge.vvm v8, v16, v12, v0
-; ZVFHMIN-NEXT: vfmax.vv v12, v8, v20
+; ZVFHMIN-NEXT: vmerge.vvm v8, v12, v16, v0
+; ZVFHMIN-NEXT: vmfeq.vv v0, v16, v16
+; ZVFHMIN-NEXT: vmerge.vvm v12, v16, v12, v0
+; ZVFHMIN-NEXT: vfmax.vv v12, v12, v8
; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12
; ZVFHMIN-NEXT: ret
@@ -326,11 +318,10 @@ define <2 x float> @vfmax_vv_v2f32_unmasked(<2 x float> %va, <2 x float> %vb, i3
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
; CHECK-NEXT: vmfeq.vv v0, v8, v8
-; CHECK-NEXT: vmfeq.vv v10, v9, v9
-; CHECK-NEXT: vmerge.vvm v11, v8, v9, v0
-; CHECK-NEXT: vmv1r.v v0, v10
+; CHECK-NEXT: vmerge.vvm v10, v8, v9, v0
+; CHECK-NEXT: vmfeq.vv v0, v9, v9
; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0
-; CHECK-NEXT: vfmax.vv v8, v8, v11
+; CHECK-NEXT: vfmax.vv v8, v8, v10
; CHECK-NEXT: ret
%v = call <2 x float> @llvm.vp.maximum.v2f32(<2 x float> %va, <2 x float> %vb, <2 x i1> splat (i1 true), i32 %evl)
ret <2 x float> %v
@@ -360,11 +351,10 @@ define <4 x float> @vfmax_vv_v4f32_unmasked(<4 x float> %va, <4 x float> %vb, i3
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
; CHECK-NEXT: vmfeq.vv v0, v8, v8
-; CHECK-NEXT: vmfeq.vv v10, v9, v9
-; CHECK-NEXT: vmerge.vvm v11, v8, v9, v0
-; CHECK-NEXT: vmv.v.v v0, v10
+; CHECK-NEXT: vmerge.vvm v10, v8, v9, v0
+; CHECK-NEXT: vmfeq.vv v0, v9, v9
; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0
-; CHECK-NEXT: vfmax.vv v8, v8, v11
+; CHECK-NEXT: vfmax.vv v8, v8, v10
; CHECK-NEXT: ret
%v = call <4 x float> @llvm.vp.maximum.v4f32(<4 x float> %va, <4 x float> %vb, <4 x i1> splat (i1 true), i32 %evl)
ret <4 x float> %v
@@ -396,11 +386,10 @@ define <8 x float> @vfmax_vv_v8f32_unmasked(<8 x float> %va, <8 x float> %vb, i3
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
; CHECK-NEXT: vmfeq.vv v0, v8, v8
-; CHECK-NEXT: vmfeq.vv v12, v10, v10
-; CHECK-NEXT: vmerge.vvm v14, v8, v10, v0
-; CHECK-NEXT: vmv1r.v v0, v12
+; CHECK-NEXT: vmerge.vvm v12, v8, v10, v0
+; CHECK-NEXT: vmfeq.vv v0, v10, v10
; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0
-; CHECK-NEXT: vfmax.vv v8, v8, v14
+; CHECK-NEXT: vfmax.vv v8, v8, v12
; CHECK-NEXT: ret
%v = call <8 x float> @llvm.vp.maximum.v8f32(<8 x float> %va, <8 x float> %vb, <8 x i1> splat (i1 true), i32 %evl)
ret <8 x float> %v
@@ -432,11 +421,10 @@ define <16 x float> @vfmax_vv_v16f32_unmasked(<16 x float> %va, <16 x float> %vb
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
; CHECK-NEXT: vmfeq.vv v0, v8, v8
-; CHECK-NEXT: vmfeq.vv v16, v12, v12
-; CHECK-NEXT: vmerge.vvm v20, v8, v12, v0
-; CHECK-NEXT: vmv1r.v v0, v16
+; CHECK-NEXT: vmerge.vvm v16, v8, v12, v0
+; CHECK-NEXT: vmfeq.vv v0, v12, v12
; CHECK-NEXT: vmerge.vvm v8, v12, v8, v0
-; CHECK-NEXT: vfmax.vv v8, v8, v20
+; CHECK-NEXT: vfmax.vv v8, v8, v16
; CHECK-NEXT: ret
%v = call <16 x float> @llvm.vp.maximum.v16f32(<16 x float> %va, <16 x float> %vb, <16 x i1> splat (i1 true), i32 %evl)
ret <16 x float> %v
@@ -466,11 +454,10 @@ define <2 x double> @vfmax_vv_v2f64_unmasked(<2 x double> %va, <2 x double> %vb,
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
; CHECK-NEXT: vmfeq.vv v0, v8, v8
-; CHECK-NEXT: vmfeq.vv v10, v9, v9
-; CHECK-NEXT: vmerge.vvm v11, v8, v9, v0
-; CHECK-NEXT: vmv.v.v v0, v10
+; CHECK-NEXT: vmerge.vvm v10, v8, v9, v0
+; CHECK-NEXT: vmfeq.vv v0, v9, v9
; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0
-; CHECK-NEXT: vfmax.vv v8, v8, v11
+; CHECK-NEXT: vfmax.vv v8, v8, v10
; CHECK-NEXT: ret
%v = call <2 x double> @llvm.vp.maximum.v2f64(<2 x double> %va, <2 x double> %vb, <2 x i1> splat (i1 true), i32 %evl)
ret <2 x double> %v
@@ -502,11 +489,10 @@ define <4 x double> @vfmax_vv_v4f64_unmasked(<4 x double> %va, <4 x double> %vb,
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma
; CHECK-NEXT: vmfeq.vv v0, v8, v8
-; CHECK-NEXT: vmfeq.vv v12, v10, v10
-; CHECK-NEXT: vmerge.vvm v14, v8, v10, v0
-; CHECK-NEXT: vmv1r.v v0, v12
+; CHECK-NEXT: vmerge.vvm v12, v8, v10, v0
+; CHECK-NEXT: vmfeq.vv v0, v10, v10
; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0
-; CHECK-NEXT: vfmax.vv v8, v8, v14
+; CHECK-NEXT: vfmax.vv v8, v8, v12
; CHECK-NEXT: ret
%v = call <4 x double> @llvm.vp.maximum.v4f64(<4 x double> %va, <4 x double> %vb, <4 x i1> splat (i1 true), i32 %evl)
ret <4 x double> %v
@@ -538,11 +524,10 @@ define <8 x double> @vfmax_vv_v8f64_unmasked(<8 x double> %va, <8 x double> %vb,
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma
; CHECK-NEXT: vmfeq.vv v0, v8, v8
-; CHECK-NEXT: vmfeq.vv v16, v12, v12
-; CHECK-NEXT: vmerge.vvm v20, v8, v12, v0
-; CHECK-NEXT: vmv1r.v v0, v16
+; CHECK-NEXT: vmerge.vvm v16, v8, v12, v0
+; CHECK-NEXT: vmfeq.vv v0, v12, v12
; CHECK-NEXT: vmerge.vvm v8, v12, v8, v0
-; CHECK-NEXT: vfmax.vv v8, v8, v20
+; CHECK-NEXT: vfmax.vv v8, v8, v16
; CHECK-NEXT: ret
%v = call <8 x double> @llvm.vp.maximum.v8f64(<8 x double> %va, <8 x double> %vb, <8 x i1> splat (i1 true), i32 %evl)
ret <8 x double> %v
@@ -587,9 +572,8 @@ define <16 x double> @vfmax_vv_v16f64_unmasked(<16 x double> %va, <16 x double>
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vmfeq.vv v0, v8, v8
-; CHECK-NEXT: vmfeq.vv v7, v16, v16
; CHECK-NEXT: vmerge.vvm v24, v8, v16, v0
-; CHECK-NEXT: vmv1r.v v0, v7
+; CHECK-NEXT: vmfeq.vv v0, v16, v16
; CHECK-NEXT: vmerge.vvm v8, v16, v8, v0
; CHECK-NEXT: vfmax.vv v8, v8, v24
; CHECK-NEXT: ret
@@ -710,21 +694,25 @@ define <32 x double> @vfmax_vv_v32f64_unmasked(<32 x double> %va, <32 x double>
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: li a3, 24
+; CHECK-NEXT: li a3, 40
; CHECK-NEXT: mul a1, a1, a3
; CHECK-NEXT: sub sp, sp, a1
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb
; CHECK-NEXT: addi a1, a0, 128
; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; CHECK-NEXT: vle64.v v24, (a1)
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 4
+; CHECK-NEXT: li a3, 24
+; CHECK-NEXT: mul a1, a1, a3
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill
; CHECK-NEXT: vle64.v v24, (a0)
; CHECK-NEXT: li a1, 16
-; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 5
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
; CHECK-NEXT: mv a0, a2
; CHECK-NEXT: bltu a2, a1, .LBB25_2
@@ -733,52 +721,66 @@ define <32 x double> @vfmax_vv_v32f64_unmasked(<32 x double> %va, <32 x double>
; CHECK-NEXT: .LBB25_2:
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vmfeq.vv v0, v8, v8
-; CHECK-NEXT: vmfeq.vv v7, v24, v24
; CHECK-NEXT: vmv8r.v v16, v24
; CHECK-NEXT: vmerge.vvm v24, v8, v24, v0
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK...
[truncated]
Force-pushed from b888e06 to 327d658.
; ZVFHMIN-NEXT: vmerge.vvm v8, v10, v9, v0
; ZVFHMIN-NEXT: vfmax.vv v9, v8, v11
; ZVFHMIN-NEXT: vmfeq.vv v0, v10, v10
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
An example that causes more vtype toggles.
I want to make sure I understand what is going on in this patch.
Prior to this patch, we should have data dependency edges between an instruction that consumes the mask and the instruction that produces it. This patch on the other hand adds artificial edges between v0 mask producer instruction and the previous consumer of a v0 mask.
We might have a program like this:
livein: v0
v0_a = produce v0
v0_b = consume_and_produce v0
c = consume v0
Without this patch, we have two data dependency edges: (v0_b, v0_a) and (c, v0_b). With this patch, I think we are adding an artificial edge from (v0_a, v0_b).
Can you help me understand how this leads to making live intervals of mask registers shorter?
I think the idea is that the artificial edge constrains the scheduler from reordering the mask producer with the earlier mask use, thus preserving a non-overlapping live range if one already exists.

My question is why we need this. Shouldn't register pressure on the mask register class (which only has one register) achieve this result? It clearly doesn't, but why? Is there something else we could tweak here?

Note that this code doesn't appear to consider the case where the original schedule already has V0 live ranges which overlap. That's probably fixable with some one-use checks.
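Concretely, the ZVFH diff for vfmax_vv_v2f16_unmasked earlier in this patch illustrates the effect. Before, the second vmfeq is scheduled while v0 is still live, so its result lands in v10 and later needs a vmv1r.v copy into v0; after, the artificial edge keeps the producer below the earlier v0 use, so it can write v0 directly:

Before:
  vmfeq.vv v0, v8, v8
  vmfeq.vv v10, v9, v9
  vmerge.vvm v11, v8, v9, v0
  vmv1r.v v0, v10
  vmerge.vvm v8, v9, v8, v0
  vfmax.vv v8, v8, v11

After:
  vmfeq.vv v0, v8, v8
  vmerge.vvm v10, v8, v9, v0
  vmfeq.vv v0, v9, v9
  vmerge.vvm v8, v9, v8, v0
  vfmax.vv v8, v8, v10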
@michaelmaitland I think @preames has answered your question perfectly; the aim of this patch is to reduce the live range overlap (yeah, I should use this term) of mask registers.
I'll try to answer these questions with my rough understanding. There are some problems with the current implementation in LLVM.

Yes, as far as I can see, I have to admit that this patch seems to be a compromise.
I'm fine with this patch as long as we know what we're working around. :)
LGTM - Seems like a reasonable workaround for a real issue and a few days have gone by with no other suggestions made.
Force-pushed from f21427c to 75b04ec.
I will land this in a few days if there are no more comments.
LGTM
Here we add a scheduling mutation in pre-ra scheduling, which will add an artificial dependency edge between a mask producer and its previous nearest instruction that uses the V0 register. This prevents the overlap of live intervals of mask registers and as a consequence we can reduce some spills/moves. From the test changes, we can see some improvements and also some regressions (more vtype toggles). Partially fixes llvm#113489.
Force-pushed from 56712da to 055f429.
LLVM Buildbot has detected a new failure on a builder. Full details are available at: https://lab.llvm.org/buildbot/#/builders/38/builds/1023. Here is the relevant piece of the build log for reference:
Here we add a scheduling mutation in pre-ra scheduling, which will add an artificial dependency edge between a mask producer and its previous nearest instruction that uses the V0 register.

This prevents the overlap of live intervals of mask registers and as a consequence we can reduce some spills/moves.

From the test changes, we can see some improvements and also some regressions (more vtype toggles).

Partially fixes #113489.
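For a self-contained reproducer of the pattern this mutation targets, the unmasked vp.maximum tests above boil down to IR like the following (the define is copied from the test file; the declare is added here for completeness). The splat-true mask is what makes the call "unmasked", and lowering materializes two vmfeq masks feeding vmerge, which is exactly where the V0 live range overlap arises:

declare <2 x float> @llvm.vp.maximum.v2f32(<2 x float>, <2 x float>, <2 x i1>, i32)

define <2 x float> @vfmax_vv_v2f32_unmasked(<2 x float> %va, <2 x float> %vb, i32 zeroext %evl) {
  %v = call <2 x float> @llvm.vp.maximum.v2f32(<2 x float> %va, <2 x float> %vb, <2 x i1> splat (i1 true), i32 %evl)
  ret <2 x float> %v
}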