diff --git a/llvm/lib/Target/AIE/AIEBaseSubtarget.cpp b/llvm/lib/Target/AIE/AIEBaseSubtarget.cpp index b86ea0e0b65b..30374101303e 100644 --- a/llvm/lib/Target/AIE/AIEBaseSubtarget.cpp +++ b/llvm/lib/Target/AIE/AIEBaseSubtarget.cpp @@ -275,27 +275,14 @@ class RegionEndEdges : public ScheduleDAGMutation { assert(EdgeLatency < DelaySlots); EdgeLatency = DelaySlots + 1; } + // Between writing Registers (lc, le, ls) and the end of the loop, // there must be a distance of 112 bytes in terms of PM addresses. // 112 bytes correspond to 7 fully-expanded 128-bit instructions and // hence adding a latency of 8 from LoopStart to the ExitSU. - // We can subtract the number of bundles that interblock pushed into - // BottomInsert - // FIXME: this holds as long as we insert them unconditionally. If we - // integrate them with the bottom region, we just need to keep 8 away - // from ExitSU if (TII->isZeroOverheadLoopSetupInstr(MI)) { - unsigned PatchCycles = 8; - if (DAG->getBB()) { - auto *Scheduler = - static_cast(DAG)->getSchedImpl(); - auto &InterBlock = Scheduler->getInterBlock(); - unsigned InsertedCycles = - InterBlock.getBlockState(DAG->getBB()).BottomInsert.size(); - PatchCycles = - PatchCycles >= InsertedCycles ? 
PatchCycles - InsertedCycles : 0; - } - EdgeLatency = std::max(EdgeLatency, PatchCycles); + const unsigned ZOLDistance = 8; + EdgeLatency = std::max(EdgeLatency, ZOLDistance); } ExitDep.setLatency(EdgeLatency); @@ -318,6 +305,49 @@ class RegionEndEdges : public ScheduleDAGMutation { }; }; +class EmitFixedSUnits : public ScheduleDAGMutation { +public: + void apply(ScheduleDAGInstrs *DAG) override { + AIEPostRASchedStrategy *Scheduler = + static_cast(DAG)->getSchedImpl(); + auto *TII = static_cast(DAG->TII); + auto *ItinData = DAG->MF.getSubtarget().getInstrItineraryData(); + const BlockState &BS = + Scheduler->getInterBlock().getBlockState(DAG->getBB()); + const Region &CurRegion = BS.getCurrentRegion(); + + // First, create SUnits for all "fixed" instructions + unsigned DistToExitSU = 0; + for (MachineInstr &MI : reverse(CurRegion.bot_fixed_instrs())) { + Scheduler->addFixedSUnit(MI, /*IsTop=*/false, DistToExitSU); + ++DistToExitSU; + } + DAG->makeMaps(); + + // Then, create dependencies between "free" and "fixed" instructions + auto IsFreeSU = [Scheduler](const SUnit &SU) { + return Scheduler->isFreeSU(SU); + }; + ArrayRef BotFixedBundles = + CurRegion.getBotFixedBundles(); + for (SUnit &FreeSU : make_filter_range(DAG->SUnits, IsFreeSU)) { + const MachineInstr &MI = *FreeSU.getInstr(); + MachineInstr *FixedDepMI = + AIE::findEarliestRef(MI, BotFixedBundles, BotFixedBundles.size()).MI; + if (!FixedDepMI) + continue; + + SUnit *FixedDepSU = + DAG->getSUnit(&*getBundleStart(FixedDepMI->getIterator())); + assert(FixedDepSU && "Fixed Bundle has no corresponding SU."); + SDep Dep(&FreeSU, SDep::Artificial); + Dep.setLatency( + AIE::maxLatency(&MI, *TII, *ItinData, /*IncludeStages=*/true)); + FixedDepSU->addPred(Dep, /*Required=*/true); + } + } +}; + /// Collect all "weak" edges in a separate vector. This allows modifying /// \p SU.Preds without invalidating iterators. 
SmallVector getWeakPreds(SUnit &SU) { @@ -664,6 +694,7 @@ AIEBaseSubtarget::getPostRAMutationsImpl(const Triple &TT) { Mutations.emplace_back(std::make_unique()); Mutations.emplace_back(std::make_unique()); Mutations.emplace_back(std::make_unique()); + Mutations.emplace_back(std::make_unique()); } return Mutations; } diff --git a/llvm/lib/Target/AIE/AIEInterBlockScheduling.cpp b/llvm/lib/Target/AIE/AIEInterBlockScheduling.cpp index fd30592b855a..e67b338bf081 100644 --- a/llvm/lib/Target/AIE/AIEInterBlockScheduling.cpp +++ b/llvm/lib/Target/AIE/AIEInterBlockScheduling.cpp @@ -717,7 +717,9 @@ void InterBlockScheduling::enterRegion(MachineBasicBlock *BB, if (BS.Kind != BlockType::Loop || BS.FixPoint.Stage == SchedulingStage::GatheringRegions) { ArrayRef TopFixedBundles; - ArrayRef BotFixedBundles; + ArrayRef BotFixedBundles = + RegionEnd == BB->end() ? ArrayRef(BS.BottomInsert) + : ArrayRef(); BS.addRegion(BB, RegionBegin, RegionEnd, TopFixedBundles, BotFixedBundles); } } diff --git a/llvm/lib/Target/AIE/AIEMachineScheduler.cpp b/llvm/lib/Target/AIE/AIEMachineScheduler.cpp index 15429b362ef5..aca0146851c1 100644 --- a/llvm/lib/Target/AIE/AIEMachineScheduler.cpp +++ b/llvm/lib/Target/AIE/AIEMachineScheduler.cpp @@ -270,6 +270,7 @@ void AIEPostRASchedStrategy::initializeBotScoreBoard(ScoreboardTrust Trust) { assert(!doMBBSchedRegionsTopDown()); AIEHazardRecognizer *BotHazardRec = getAIEHazardRecognizer(Bot); const int Depth = BotHazardRec->getMaxLookAhead(); + assert(unsigned(Depth) >= BotHazardRec->getPipelineDepth()); /// These lambdas are an abstraction of the scoreboard manipulations, /// hiding the details of the implementation. 
In particular, we need to @@ -479,27 +480,78 @@ SUnit *AIEPostRASchedStrategy::pickNodeAndCycle( } int AIEPostRASchedStrategy::getMaxDeltaCycles(const SchedBoundary &Zone) const { - assert(!Zone.isTop()); - if (Zone.getCurrCycle() >= RegionBottomUpCycles - 1) + // Top-down scheduling does not support DeltaCycles + if (Zone.isTop() || Zone.getCurrCycle() >= RegionBottomUpCycles - 1) return 0; return std::min({int(RegionBottomUpCycles - 1 - Zone.getCurrCycle()), int(getAIEHazardRecognizer(Zone)->getMaxLookAhead()), BottomUpDelta.getValue()}); } +unsigned getNumEmittedInstrs(ScheduleDAGMI *DAG, bool IsTop) { + if (IsTop) + return DAG->top().isValid() ? std::distance(DAG->begin(), DAG->top()) : 0; + return DAG->bottom().isValid() ? std::distance(DAG->bottom(), DAG->end()) : 0; +} + +SUnit *AIEPostRASchedStrategy::getNextUnscheduledFixedInstr( + const SchedBoundary &Zone) const { + if (Zone.isTop()) + return nullptr; + const Region &Reg = InterBlock.getBlockState(CurMBB).getCurrentRegion(); + unsigned NumEmitted = getNumEmittedInstrs(DAG, /*IsTop=*/false); + if (NumEmitted < Reg.getNumBotFixedInstructions()) { + MachineInstr &NextMI = + *std::prev(DAG->bottom().isValid() ? 
DAG->bottom() : DAG->end()); + SUnit *NextSU = DAG->getSUnit(&NextMI); + assert(NextSU); + assert(NextSU->BotReadyCycle == NextSU->getHeight() && + "Fixed instruction won't be placed at the correct cycle"); + assert(Zone.getCurrCycle() <= NextSU->BotReadyCycle); + return NextSU; + } + return nullptr; +} + +bool AIEPostRASchedStrategy::isFixedSU(const SUnit &SU, bool IsTop) const { + if (IsTop) { + return FirstTopFixedSU && SU.NodeNum >= *FirstTopFixedSU && + SU.NodeNum < FirstBotFixedSU.value_or(DAG->SUnits.size()); + } + return FirstBotFixedSU && SU.NodeNum >= *FirstBotFixedSU && + SU.NodeNum <= LastBotFixedSU.value(); +} + +bool AIEPostRASchedStrategy::isFreeSU(const SUnit &SU) const { + const unsigned NumUpperBound = DAG->SUnits.size(); + return SU.NodeNum < FirstTopFixedSU.value_or(NumUpperBound) && + SU.NodeNum < FirstBotFixedSU.value_or(NumUpperBound); +} + bool AIEPostRASchedStrategy::isAvailableNode(SUnit &SU, SchedBoundary &Zone, bool /*VerifyReadyCycle*/) { + // Note we use signed integers to avoid wrap-around behavior. + const int MinDelta = -getMaxDeltaCycles(Zone); + const int ReadyCycle = std::max(Zone.getCurrCycle(), SU.BotReadyCycle); + const int CurrCycle = Zone.getCurrCycle(); + + // If the Zone has remaining fixed instructions, only one SU is available. + if (SUnit *FixedSU = getNextUnscheduledFixedInstr(Zone)) { + assert(!Zone.isTop() && "Fixed instructions only expected in Bot zone"); + const int DeltaCycles = CurrCycle - ReadyCycle; + return FixedSU == &SU && DeltaCycles >= MinDelta; + } + + // If SU is a fixed instruction in the other zone, it isn't available + if (isFixedSU(SU, !Zone.isTop())) + return false; + // Whether or not the zone is Top or Bot, verify if SU is ready to be // scheduled in terms of cycle. if (Zone.isTop()) return MachineSchedStrategy::isAvailableNode(SU, Zone, /*VerifyReadyCycle=*/true); - // Note we use signed integers to avoid wrap-around behavior. 
- const int MinDelta = -getMaxDeltaCycles(Zone); - const int ReadyCycle = std::max(Zone.getCurrCycle(), SU.BotReadyCycle); - const int CurrCycle = Zone.getCurrCycle(); - for (int DeltaCycles = CurrCycle - ReadyCycle; DeltaCycles >= MinDelta; --DeltaCycles) { // ReadyCycle is always greater or equal to the current cycle, @@ -542,6 +594,10 @@ void AIEPostRASchedStrategy::enterMBB(MachineBasicBlock *MBB) { // from a block is the bottom one. We reset this when leaving any // region IsBottomRegion = true; + + // The block may have a timed region, append its instructions. + auto &BS = InterBlock.getBlockState(MBB); + InterBlock.emitInterBlockBottom(BS); } void AIEPostRASchedStrategy::commitBlockSchedule(MachineBasicBlock *BB) { @@ -551,10 +607,11 @@ void AIEPostRASchedStrategy::commitBlockSchedule(MachineBasicBlock *BB) { // scheduling region. assert(BS.getRegions().empty() || 0 == BS.getTop().getNumTopFixedInstructions()); - assert(BS.getRegions().empty() || - 0 == BS.getBottom().getNumBotFixedInstructions()); + assert(BS.BottomInsert.empty() || + BS.BottomInsert.size() == BS.getBottom().getNumBotFixedInstructions()); // Safety margin, swp epilogue + // Note that the prologue is handled in a different way. See enterMBB. 
InterBlock.emitInterBlockTop(BS); if (BS.isPipelined()) { @@ -582,8 +639,6 @@ void AIEPostRASchedStrategy::commitBlockSchedule(MachineBasicBlock *BB) { AIEHazardRecognizer::applyBundles(Region.Bundles, BS.TheBlock); } } - // swp prologue - InterBlock.emitInterBlockBottom(BS); } void AIEPostRASchedStrategy::leaveMBB() { @@ -634,6 +689,9 @@ void AIEPostRASchedStrategy::leaveRegion(const SUnit &ExitSU) { RegionBegin = nullptr; RegionEnd = nullptr; IsBottomRegion = false; + FirstTopFixedSU = {}; + FirstBotFixedSU = {}; + LastBotFixedSU = {}; BS.advanceRegion(); DEBUG_BLOCKS(dbgs() << " << leaveRegion\n"); } @@ -758,6 +816,11 @@ bool AIEPostRASchedStrategy::tryCandidate(SchedCandidate &Cand, return true; } + SchedBoundary &Zone = getSchedZone(); + assert(!getNextUnscheduledFixedInstr(Zone) && + "More than one available SUnit while not all fixed instructions have " + "been emitted."); + // Instructions with delay slots are critical and should be scheduled // as soon as they are ready. if (TryCand.SU->getInstr()->hasDelaySlot()) { @@ -770,8 +833,6 @@ bool AIEPostRASchedStrategy::tryCandidate(SchedCandidate &Cand, return false; } - SchedBoundary &Zone = getSchedZone(); - // Avoid serializing long latency dependence chains. if (Cand.Policy.ReduceLatency && Zone.isTop() && tryLatency(TryCand, Cand, Zone)) { @@ -1240,6 +1301,11 @@ void llvm::AIEPostRASchedStrategy::buildGraph(ScheduleDAGMI &DAG, AAResults *AA, PressureDiffs *PDiffs, LiveIntervals *LIS, bool TrackLaneMasks) { + + // Let's save the DAG already instead of waiting for initialize(). + // Some DAG mutators might require a DAG to be set. + this->DAG = &DAG; + /// We are called after enterRegion, which will have recorded the semantic /// order. 
We can't use the basic block order, since this may have changed /// in earlier iterations of scheduling @@ -1270,6 +1336,30 @@ void llvm::AIEPostRASchedStrategy::buildGraph(ScheduleDAGMI &DAG, AAResults *AA, static_cast(DAG).recordDbgInstrs(Region); } +void AIEPostRASchedStrategy::addFixedSUnit(MachineInstr &MI, bool IsTop, + unsigned Dist) { + assert(!(IsTop && FirstBotFixedSU) && "Top-fixed SUnits must be added first"); + assert(DAG->SUnits.size() < DAG->SUnits.capacity() && + "SUnits need to be re-allocated."); + unsigned SUNum = DAG->initSUnit(MI).value(); + SUnit &SU = DAG->SUnits[SUNum]; + + if (IsTop) { + SDep Dep(&DAG->EntrySU, SDep::Artificial); + Dep.setLatency(Dist); + SU.addPred(Dep); + if (!FirstTopFixedSU) + FirstTopFixedSU = SUNum; + } else { + SDep Dep(&SU, SDep::Artificial); + Dep.setLatency(Dist); + DAG->ExitSU.addPred(Dep); + if (!FirstBotFixedSU) + FirstBotFixedSU = SUNum; + LastBotFixedSU = SUNum; + } +} + bool AIEScheduleDAGMI::mayAlias(SUnit *SUa, SUnit *SUb, bool UseTBAA) { BlockState &BS = getSchedImpl()->getInterBlock().getBlockState(getBB()); if (BS.FixPoint.Stage == SchedulingStage::Pipelining) { diff --git a/llvm/lib/Target/AIE/AIEMachineScheduler.h b/llvm/lib/Target/AIE/AIEMachineScheduler.h index f60bec3df061..17068c1afe99 100644 --- a/llvm/lib/Target/AIE/AIEMachineScheduler.h +++ b/llvm/lib/Target/AIE/AIEMachineScheduler.h @@ -72,6 +72,18 @@ class AIEPostRASchedStrategy : public PostGenericScheduler { RegPressureTracker *RPTracker, PressureDiffs *PDiffs, LiveIntervals *LIS, bool TrackLaneMasks) override; + /// Adds a SUnit for the given fixed instruction + /// \param IsTop Whether MI is fixed at the top or bottom of the region + /// \param Dist Distance from the top or bottom + void addFixedSUnit(MachineInstr &MI, bool IsTop, unsigned Dist); + + /// Whether \p SU is fixed in a specific cycle of the given zone. + bool isFixedSU(const SUnit &SU, bool IsTop) const; + + /// Whether \p SU is free to be scheduled anywhere in the region. 
+ /// (modulo dependencies and resource conflicts) + bool isFreeSU(const SUnit &SU) const; + /// Explicitly process regions backwards. The first scheduled region in /// a block connects with successors. bool doMBBSchedRegionsTopDown() const override { return false; } @@ -119,6 +131,16 @@ class AIEPostRASchedStrategy : public PostGenericScheduler { /// cycle of instructions to be scheduled. int getMaxDeltaCycles(const SchedBoundary &Zone) const; + /// Return the next "fixed" instruction to place down. + SUnit *getNextUnscheduledFixedInstr(const SchedBoundary &Zone) const; + + /// SU numbers for fixed instructions. + /// "top" fixed SUnits belong in [FirstTopFixedSU,FirstBotFixedSU) + /// "bot" fixed SUnits belong in [FirstBotFixedSU,LastBotFixedSU] + std::optional FirstTopFixedSU; + std::optional FirstBotFixedSU; + std::optional LastBotFixedSU; + /// Keeps track of the current zone used for scheduling. See getSchedZone(). bool IsTopDown = true; diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/bitwisenot.mir b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/bitwisenot.mir index ea25f83b89a5..a0b6a1c2738f 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/bitwisenot.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/bitwisenot.mir @@ -13,19 +13,16 @@ define dso_local void @bitNot(ptr addrspace(5) noalias nocapture writeonly %d, ptr addrspace(6) noalias nocapture readonly %s, i32 noundef %n) local_unnamed_addr #0 { ; CHECK-LABEL: bitNot: ; CHECK: // %bb.0: - ; CHECK-NEXT: add.nc lc, r0, #-5 - ; CHECK-NEXT: movxm ls, #.LBB0_2 - ; CHECK-NEXT: movxm le, #.L_LEnd0 - ; CHECK-NEXT: vldb wh0, [p0, #32]; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: nopa ; vldb wh0, [p0, #32]; nopx ; add.nc lc, r0, #-5 + ; CHECK-NEXT: vldb wl0, [p0], #64; movxm ls, #.LBB0_2 + ; CHECK-NEXT: vldb wh0, [p0, #32]; movxm le, #.L_LEnd0 ; CHECK-NEXT: vldb wl0, [p0], #64; nopa ; nops ; nopxm ; nopv ; CHECK-NEXT: vldb wh0, [p0, #32]; nopa ; nops ; nopxm ; nopv ; 
CHECK-NEXT: vldb wl0, [p0], #64; nopa ; nops ; nopxm ; nopv ; CHECK-NEXT: vldb wh0, [p0, #32]; nopa ; nops ; nopxm ; nopv ; CHECK-NEXT: vldb wl0, [p0], #64; nopa ; nops ; nopxm ; nopv - ; CHECK-NEXT: vldb wh0, [p0, #32]; nopa ; nops ; nopxm ; nopv - ; CHECK-NEXT: vldb wl0, [p0], #64; nopx - ; CHECK-NEXT: vldb wh0, [p0, #32]; vbneg_ltz.s16 x1, r21, x0 - ; CHECK-NEXT: vldb wl0, [p0], #64 + ; CHECK-NEXT: vldb wh0, [p0, #32]; nopa ; nops ; nopx ; vbneg_ltz.s16 x1, r21, x0; nopv + ; CHECK-NEXT: vldb wl0, [p0], #64; nopa ; nops ; nopxm ; nopv ; CHECK-NEXT: .LBB0_1: // %for.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldb wh0, [p0, #32]; nopa ; vst wh1, [p1, #32]; nopx ; vbneg_ltz.s16 x1, r21, x0; nopv diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/bitwisexor.mir b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/bitwisexor.mir index d3a786e215fa..2990b88b8dec 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/bitwisexor.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/bitwisexor.mir @@ -26,24 +26,21 @@ ; CHECK-NEXT: nop // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 ; CHECK-NEXT: // %bb.1: // %for.body.preheader - ; CHECK-NEXT: add.nc lc, r0, #-3 - ; CHECK-NEXT: movxm ls, #.LBB0_2 - ; CHECK-NEXT: movxm le, #.L_LEnd0 - ; CHECK-NEXT: vldb wh0, [p1, #32]; vlda wh1, [p0, #32]; nops ; nopxm ; nopv - ; CHECK-NEXT: vldb wl0, [p1], #64; nopa ; nops ; nopxm ; nopv - ; CHECK-NEXT: vldb wl1, [p0], #64; nopa ; nops ; nopxm ; nopv - ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv - ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv - ; CHECK-NEXT: vldb wh0, [p1, #32]; vlda wh1, [p0, #32]; nops ; nopxm ; nopv - ; CHECK-NEXT: vldb wl0, [p1], #64; nopa ; nops ; nopxm ; nopv - ; CHECK-NEXT: vldb wl1, [p0], #64; nopx - ; CHECK-NEXT: vbneg_ltz.s8 x2, r25:r24, x0 - ; CHECK-NEXT: vbneg_ltz.s8 x3, r25:r24, x1 - ; CHECK-NEXT: vlda wh1, [p0, #32]; vldb wh0, [p1, #32]; vband x4, x0, x3 - ; CHECK-NEXT: vldb wl0, [p1], #64; vband 
x5, x1, x2 - ; CHECK-NEXT: vldb wl1, [p0], #64; vbor x6, x4, x5 - ; CHECK-NEXT: vbneg_ltz.s8 x2, r25:r24, x0 - ; CHECK-NEXT: vst wh6, [p2, #32]; vbneg_ltz.s8 x3, r25:r24, x1 + ; CHECK-NEXT: vlda wh1, [p0, #32]; vldb wh0, [p1, #32] + ; CHECK-NEXT: vldb wl0, [p1], #64 + ; CHECK-NEXT: vldb wl1, [p0], #64 + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: vlda wh1, [p0, #32]; vldb wh0, [p1, #32]; add.nc lc, r0, #-3 + ; CHECK-NEXT: vldb wl0, [p1], #64; movxm ls, #.LBB0_2 + ; CHECK-NEXT: vldb wl1, [p0], #64; movxm le, #.L_LEnd0 + ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vbneg_ltz.s8 x2, r25:r24, x0; nopv + ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vbneg_ltz.s8 x3, r25:r24, x1; nopv + ; CHECK-NEXT: vldb wh0, [p1, #32]; vlda wh1, [p0, #32]; nops ; nopx ; vband x4, x0, x3; nopv + ; CHECK-NEXT: vldb wl0, [p1], #64; nopa ; nops ; nopx ; vband x5, x1, x2; nopv + ; CHECK-NEXT: vldb wl1, [p0], #64; nopa ; nops ; nopx ; vbor x6, x4, x5; nopv + ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vbneg_ltz.s8 x2, r25:r24, x0; nopv + ; CHECK-NEXT: nopb ; nopa ; vst wh6, [p2, #32]; nopx ; vbneg_ltz.s8 x3, r25:r24, x1; nopv ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_2: // %for.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/conv2d.mir b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/conv2d.mir index 3ac1a241beac..06aa9aeb6418 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/conv2d.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/conv2d.mir @@ -71,7 +71,7 @@ ; CHECK-NEXT: .LBB0_1: // %outer.loop.header ; CHECK-NEXT: // =>This Loop Header: Depth=1 ; CHECK-NEXT: // Child Loop BB0_2 Depth 2 - ; CHECK-NEXT: vlda.ups.s32.s16 bmh1, s0, [p2, #32]; mov m1, p4 + ; CHECK-NEXT: nopb ; vlda.ups.s32.s16 bmh1, s0, [p2, #32]; nops ; nopx ; mov m1, p4; nopv ; CHECK-NEXT: vlda.ups.s32.s16 bml1, s0, [p2], m1 ; CHECK-NEXT: vlda.ups.s32.s16 bmh2, s0, [p2, #32]; mov m2, p5 ; CHECK-NEXT: vlda.ups.s32.s16 bml2, 
s0, [p2], m2 @@ -87,24 +87,20 @@ ; CHECK-NEXT: vlda.ups.s32.s16 bml7, s0, [p2], m1 ; CHECK-NEXT: vlda.ups.s32.s16 bmh0, s0, [p2, #32] ; CHECK-NEXT: vlda.ups.s32.s16 bml0, s0, [p2, #0]; mov r0, p0 - ; CHECK-NEXT: movxm ls, #.LBB0_2 - ; CHECK-NEXT: movxm le, #.L_LEnd0 - ; CHECK-NEXT: and r0, r0, r9 - ; CHECK-NEXT: add r0, r0, #33; add.nc lc, r5, #-2 - ; CHECK-NEXT: vldb wl6, [p0], m6; nopa ; nops ; nopxm ; nopv - ; CHECK-NEXT: vldb wh6, [p0], m6; nopa ; nops ; nopxm ; nopv - ; CHECK-NEXT: vldb wl8, [p0], m6; nopa ; nops ; nopxm ; nopv - ; CHECK-NEXT: vldb.3d wh8, [p0], d0; nopa ; nops ; nopxm ; nopv - ; CHECK-NEXT: vldb wl10, [p1], #32; nopa ; nops ; nopxm ; nopv - ; CHECK-NEXT: vldb wh10, [p1], #32; nopa ; nops ; nopxm ; nopv - ; CHECK-NEXT: nopa ; vldb wl7, [p1], #32; nopx - ; CHECK-NEXT: vldb wh7, [p1], #32; mov r6, p0 - ; CHECK-NEXT: vldb wl6, [p0], m6; and r5, r6, r9; vshift.align x4, x4, s1, x6, r0 + ; CHECK-NEXT: vldb wl6, [p0], m6 ; CHECK-NEXT: vldb wh6, [p0], m6 - ; CHECK-NEXT: vldb wl8, [p0], m6; add r0, r5, #33; vshift.align x2, x2, s1, x8, r0 - ; CHECK-NEXT: vldb.3d wh8, [p0], d0; vshuffle x9, x4, x2, r2 - ; CHECK-NEXT: vldb wl10, [p1], #32; vshuffle x1, x9, x0, r8 - ; CHECK-NEXT: vldb wh10, [p1], #32; vshuffle x3, x4, x2, r3; vmac cm1, cm1, x9, x10, r4 + ; CHECK-NEXT: vldb wl8, [p0], m6 + ; CHECK-NEXT: vldb.3d wh8, [p0], d0 + ; CHECK-NEXT: vldb wl10, [p1], #32; movxm ls, #.LBB0_2 + ; CHECK-NEXT: vldb wh10, [p1], #32; movxm le, #.L_LEnd0 + ; CHECK-NEXT: vldb wl7, [p1], #32; and r0, r0, r9; add.nc lc, r5, #-2 + ; CHECK-NEXT: vldb wh7, [p1], #32; nopa ; nops ; add r0, r0, #33; mov r6, p0; nopv + ; CHECK-NEXT: vldb wl6, [p0], m6; nopa ; nops ; and r5, r6, r9; vshift.align x4, x4, s1, x6, r0; nopv + ; CHECK-NEXT: vldb wh6, [p0], m6; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: vldb wl8, [p0], m6; nopa ; nops ; add r0, r5, #33; vshift.align x2, x2, s1, x8, r0; nopv + ; CHECK-NEXT: vldb.3d wh8, [p0], d0; nopa ; nops ; nopx ; vshuffle x9, x4, x2, r2; nopv + ; 
CHECK-NEXT: vldb wl10, [p1], #32; nopa ; nops ; nopx ; vshuffle x1, x9, x0, r8; nopv + ; CHECK-NEXT: vldb wh10, [p1], #32; nopx ; vshuffle x3, x4, x2, r3; vmac cm1, cm1, x9, x10, r4 ; CHECK-NEXT: vldb wl7, [p1], #32; vshuffle x5, x3, x0, r8; vmac cm2, cm2, x1, x10, r4 ; CHECK-NEXT: vldb wh7, [p1], #32; mov r6, p0; vmac cm3, cm3, x3, x10, r4 ; CHECK-NEXT: .p2align 4 diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/crash.mir b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/crash.mir index 3a3c9bc4b772..655cdee89a7a 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/crash.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/crash.mir @@ -8,7 +8,7 @@ # RUN: llc --mtriple=aie2 -O2 --start-before=postmisched %s -o - | FileCheck %s # This crashed the postpipeliner because it reaches NCopies=1 which causes an out of -# bound access when setting up LCD heuristics. +# bound access when setting up LCD heuristics. # The filecheck reference is the unpipelined loop --- | diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/hardsigmoid-templated.mir b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/hardsigmoid-templated.mir index b25fae85313e..abd58919e69f 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/hardsigmoid-templated.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/hardsigmoid-templated.mir @@ -13,23 +13,20 @@ define dso_local void @kernel(ptr addrspace(5) noalias nocapture writeonly %d, ptr addrspace(6) noalias nocapture readonly %s, i32 noundef %n) local_unnamed_addr #0 { ; CHECK-LABEL: kernel: ; CHECK: // %bb.0: - ; CHECK-NEXT: add.nc lc, r0, #-7 - ; CHECK-NEXT: movxm ls, #.LBB0_2 - ; CHECK-NEXT: movxm le, #.L_LEnd0 ; CHECK-NEXT: vldb.unpack.s16.s8 x6, [p0], m0; nopa ; nops ; nopxm ; nopv - ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv - ; CHECK-NEXT: vldb.unpack.s16.s8 x6, [p0], m0; nopa ; nops ; nopxm ; nopv - ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv - ; CHECK-NEXT: vldb.unpack.s16.s8 
x6, [p0], m0; nopa ; nops ; nopxm ; nopv - ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv - ; CHECK-NEXT: vldb.unpack.s16.s8 x6, [p0], m0; nopx - ; CHECK-NEXT: vmin_ge.s16 x8, r16, x6, x0 - ; CHECK-NEXT: vldb.unpack.s16.s8 x6, [p0], m0; vmax_lt.s16 x10, r16, x8, x2 - ; CHECK-NEXT: vmin_ge.s16 x8, r16, x6, x0 - ; CHECK-NEXT: vldb.unpack.s16.s8 x6, [p0], m0; vmax_lt.s16 x10, r16, x8, x2; vmac cm1, cm0, x10, x4, r0 - ; CHECK-NEXT: vmin_ge.s16 x8, r16, x6, x0 - ; CHECK-NEXT: vldb.unpack.s16.s8 x6, [p0], m0; vmax_lt.s16 x10, r16, x8, x2; vmac cm1, cm0, x10, x4, r0 - ; CHECK-NEXT: vmin_ge.s16 x8, r16, x6, x0 + ; CHECK-NEXT: nop + ; CHECK-NEXT: vldb.unpack.s16.s8 x6, [p0], m0 + ; CHECK-NEXT: nop + ; CHECK-NEXT: vldb.unpack.s16.s8 x6, [p0], m0; add.nc lc, r0, #-7 + ; CHECK-NEXT: nop ; movxm ls, #.LBB0_2 + ; CHECK-NEXT: vldb.unpack.s16.s8 x6, [p0], m0; movxm le, #.L_LEnd0 + ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vmin_ge.s16 x8, r16, x6, x0; nopv + ; CHECK-NEXT: vldb.unpack.s16.s8 x6, [p0], m0; nopa ; nops ; nopx ; vmax_lt.s16 x10, r16, x8, x2; nopv + ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vmin_ge.s16 x8, r16, x6, x0; nopv + ; CHECK-NEXT: vldb.unpack.s16.s8 x6, [p0], m0; nopa ; nops ; nopx ; vmax_lt.s16 x10, r16, x8, x2; vmac cm1, cm0, x10, x4, r0 + ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vmin_ge.s16 x8, r16, x6, x0; nopv + ; CHECK-NEXT: vldb.unpack.s16.s8 x6, [p0], m0; nopa ; nops ; nopx ; vmax_lt.s16 x10, r16, x8, x2; vmac cm1, cm0, x10, x4, r0 + ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vmin_ge.s16 x8, r16, x6, x0; nopv ; CHECK-NEXT: .LBB0_1: // %for.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldb.unpack.s16.s8 x6, [p0], m0; nopa ; nops ; nopx ; vmax_lt.s16 x10, r16, x8, x2; vmac cm1, cm0, x10, x4, r0 diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/interleave-prologue.mir b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/interleave-prologue.mir index d6741b2de8e5..766c693067ce 100644 --- 
a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/interleave-prologue.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/interleave-prologue.mir @@ -20,18 +20,20 @@ body: | ; CHECK-NEXT: liveins: $p0, $p1, $r0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: $lc = ADD_NC $r0, -7 - ; CHECK-NEXT: BUNDLE implicit-def $r1, implicit-def $ls { - ; CHECK-NEXT: $r1 = MOVA_lda_cg 0 - ; CHECK-NEXT: $ls = MOVXM_lng_cg %bb.2 - ; CHECK-NEXT: } - ; CHECK-NEXT: $x0 = VBCST_16 killed $r1 + ; CHECK-NEXT: $ls = MOVXM_lng_cg %bb.2 ; CHECK-NEXT: $le = MOVXM_lng_cg ; CHECK-NEXT: $x6, $p0 = VLDB_UNPACK_S16_S8_ag_pstm_nrm killed $p0, $m0 ; CHECK-NEXT: $x6, $p0 = VLDB_UNPACK_S16_S8_ag_pstm_nrm killed $p0, $m0 ; CHECK-NEXT: $x6, $p0 = VLDB_UNPACK_S16_S8_ag_pstm_nrm killed $p0, $m0 ; CHECK-NEXT: $x6, $p0 = VLDB_UNPACK_S16_S8_ag_pstm_nrm killed $p0, $m0 - ; CHECK-NEXT: $x6, $p0 = VLDB_UNPACK_S16_S8_ag_pstm_nrm killed $p0, $m0 - ; CHECK-NEXT: $x6, $p0 = VLDB_UNPACK_S16_S8_ag_pstm_nrm killed $p0, $m0 + ; CHECK-NEXT: BUNDLE implicit-def $r1, implicit-def $x6, implicit-def $wl6, implicit-def $wh6, implicit-def $p0, implicit killed $p0, implicit $m0 { + ; CHECK-NEXT: $r1 = MOVA_lda_cg 0 + ; CHECK-NEXT: $x6, $p0 = VLDB_UNPACK_S16_S8_ag_pstm_nrm killed $p0, $m0 + ; CHECK-NEXT: } + ; CHECK-NEXT: BUNDLE implicit-def $x6, implicit-def $wl6, implicit-def $wh6, implicit-def $p0, implicit-def $x0, implicit-def $wl0, implicit-def $wh0, implicit killed $p0, implicit $m0, implicit killed $r1 { + ; CHECK-NEXT: $x6, $p0 = VLDB_UNPACK_S16_S8_ag_pstm_nrm killed $p0, $m0 + ; CHECK-NEXT: $x0 = VBCST_16 killed $r1 + ; CHECK-NEXT: } ; CHECK-NEXT: $x6, $p0 = VLDB_UNPACK_S16_S8_ag_pstm_nrm killed $p0, $m0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: @@ -100,13 +102,11 @@ body: | ; CHECK-NEXT: liveins: $p0, $p1, $r0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: $lc = ADD_NC $r0, -8 - ; CHECK-NEXT: BUNDLE implicit-def $r1, implicit-def $ls { - ; CHECK-NEXT: $r1 = MOVA_lda_cg 0 - ; CHECK-NEXT: $ls = MOVXM_lng_cg %bb.2 + ; CHECK-NEXT: $ls 
= MOVXM_lng_cg %bb.2 + ; CHECK-NEXT: BUNDLE implicit-def $wl4, implicit-def $p0, implicit-def $le, implicit killed $p0, implicit $m0 { + ; CHECK-NEXT: $wl4, $p0 = VLDA_dmw_lda_w_ag_pstm_nrm_imm killed $p0, $m0 + ; CHECK-NEXT: $le = MOVXM_lng_cg ; CHECK-NEXT: } - ; CHECK-NEXT: $x0 = VBCST_16 killed $r1 - ; CHECK-NEXT: $le = MOVXM_lng_cg - ; CHECK-NEXT: $wl4, $p0 = VLDA_dmw_lda_w_ag_pstm_nrm_imm killed $p0, $m0 ; CHECK-NEXT: BUNDLE implicit-def $wl4, implicit-def $p0, implicit-def $x6, implicit-def $wl6, implicit-def $wh6, implicit killed $p0, implicit $m0 { ; CHECK-NEXT: $wl4, $p0 = VLDA_dmw_lda_w_ag_pstm_nrm_imm killed $p0, $m0 ; CHECK-NEXT: $x6 = VUNPACK_S16_S8 internal $wl4 @@ -123,13 +123,15 @@ body: | ; CHECK-NEXT: $wl4, $p0 = VLDA_dmw_lda_w_ag_pstm_nrm_imm killed $p0, $m0 ; CHECK-NEXT: $x6 = VUNPACK_S16_S8 internal $wl4 ; CHECK-NEXT: } - ; CHECK-NEXT: BUNDLE implicit-def $wl4, implicit-def $p0, implicit-def $x6, implicit-def $wl6, implicit-def $wh6, implicit killed $p0, implicit $m0 { + ; CHECK-NEXT: BUNDLE implicit-def $wl4, implicit-def $p0, implicit-def $x6, implicit-def $wl6, implicit-def $wh6, implicit-def $r1, implicit killed $p0, implicit $m0 { ; CHECK-NEXT: $wl4, $p0 = VLDA_dmw_lda_w_ag_pstm_nrm_imm killed $p0, $m0 ; CHECK-NEXT: $x6 = VUNPACK_S16_S8 internal $wl4 + ; CHECK-NEXT: $r1 = MOVX_alu_cg 0 ; CHECK-NEXT: } - ; CHECK-NEXT: BUNDLE implicit-def $wl4, implicit-def $p0, implicit-def $x6, implicit-def $wl6, implicit-def $wh6, implicit killed $p0, implicit $m0 { + ; CHECK-NEXT: BUNDLE implicit-def $wl4, implicit-def $p0, implicit-def $x6, implicit-def $wl6, implicit-def $wh6, implicit-def $x0, implicit-def $wl0, implicit-def $wh0, implicit killed $p0, implicit $m0, implicit killed $r1 { ; CHECK-NEXT: $wl4, $p0 = VLDA_dmw_lda_w_ag_pstm_nrm_imm killed $p0, $m0 ; CHECK-NEXT: $x6 = VUNPACK_S16_S8 internal $wl4 + ; CHECK-NEXT: $x0 = VBCST_16 killed $r1 ; CHECK-NEXT: } ; CHECK-NEXT: BUNDLE implicit-def $wl4, implicit-def $p0, implicit-def $x6, implicit-def 
$wl6, implicit-def $wh6, implicit killed $p0, implicit $m0 { ; CHECK-NEXT: $wl4, $p0 = VLDA_dmw_lda_w_ag_pstm_nrm_imm killed $p0, $m0 @@ -208,11 +210,7 @@ body: | ; CHECK-NEXT: liveins: $p0, $p1, $r0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: $lc = ADD_NC $r0, -7 - ; CHECK-NEXT: BUNDLE implicit-def $r1, implicit-def $ls { - ; CHECK-NEXT: $r1 = MOVA_lda_cg 0 - ; CHECK-NEXT: $ls = MOVXM_lng_cg %bb.2 - ; CHECK-NEXT: } - ; CHECK-NEXT: $x0 = VBCST_16 killed $r1 + ; CHECK-NEXT: $ls = MOVXM_lng_cg %bb.2 ; CHECK-NEXT: $le = MOVXM_lng_cg ; CHECK-NEXT: BUNDLE implicit-def $wh6, implicit-def $wl6, implicit-def $p0, implicit killed $p0 { ; CHECK-NEXT: $wh6 = VLDA_dmw_lda_w_ag_idx_imm $p0, 32 :: (load (<16 x s16>)) @@ -230,13 +228,15 @@ body: | ; CHECK-NEXT: $wh6 = VLDA_dmw_lda_w_ag_idx_imm $p0, 32 :: (load (<16 x s16>)) ; CHECK-NEXT: $wl6, $p0 = VLDB_dmw_ldb_ag_pstm_nrm_imm killed $p0, 64 :: (load (<16 x s16>)) ; CHECK-NEXT: } - ; CHECK-NEXT: BUNDLE implicit-def $wh6, implicit-def $wl6, implicit-def $p0, implicit killed $p0 { + ; CHECK-NEXT: BUNDLE implicit-def $wh6, implicit-def $wl6, implicit-def $p0, implicit-def $r1, implicit killed $p0 { ; CHECK-NEXT: $wh6 = VLDA_dmw_lda_w_ag_idx_imm $p0, 32 :: (load (<16 x s16>)) ; CHECK-NEXT: $wl6, $p0 = VLDB_dmw_ldb_ag_pstm_nrm_imm killed $p0, 64 :: (load (<16 x s16>)) + ; CHECK-NEXT: $r1 = MOVX_alu_cg 0 ; CHECK-NEXT: } - ; CHECK-NEXT: BUNDLE implicit-def $wh6, implicit-def $wl6, implicit-def $p0, implicit killed $p0 { + ; CHECK-NEXT: BUNDLE implicit-def $wh6, implicit-def $wl6, implicit-def $p0, implicit-def $x0, implicit-def $wl0, implicit-def $wh0, implicit killed $p0, implicit killed $r1 { ; CHECK-NEXT: $wh6 = VLDA_dmw_lda_w_ag_idx_imm $p0, 32 :: (load (<16 x s16>)) ; CHECK-NEXT: $wl6, $p0 = VLDB_dmw_ldb_ag_pstm_nrm_imm killed $p0, 64 :: (load (<16 x s16>)) + ; CHECK-NEXT: $x0 = VBCST_16 killed $r1 ; CHECK-NEXT: } ; CHECK-NEXT: BUNDLE implicit-def $wh6, implicit-def $wl6, implicit-def $p0, implicit killed $p0 { ; CHECK-NEXT: $wh6 = 
VLDA_dmw_lda_w_ag_idx_imm $p0, 32 :: (load (<16 x s16>)) @@ -314,30 +314,36 @@ body: | ; CHECK-NEXT: $r8 = LDA_dms_lda_idx_imm killed $p3, 0 ; CHECK-NEXT: $bmh2 = VLDA_UPS_S32_S16_ag_idx_imm $s0, $p2, 32, implicit-def $srups_of, implicit $crsat ; CHECK-NEXT: $bml2 = VLDA_UPS_S32_S16_ag_idx_imm $s0, killed $p2, 0, implicit-def $srups_of, implicit $crsat - ; CHECK-NEXT: NOP - ; CHECK-NEXT: NOP - ; CHECK-NEXT: $lc = ADD_NC killed $r0, -5 - ; CHECK-NEXT: $ls = MOVXM_lng_cg %bb.2 - ; CHECK-NEXT: $le = MOVXM_lng_cg - ; CHECK-NEXT: $x1 = VBCST_16 killed $r1 - ; CHECK-NEXT: BUNDLE implicit-def $r4, implicit-def $x2, implicit-def $wl2, implicit-def $wh2, implicit killed $r2 { - ; CHECK-NEXT: $r4 = MOVA_lda_cg 0 - ; CHECK-NEXT: $x2 = VBCST_16 killed $r2 + ; CHECK-NEXT: BUNDLE implicit-def $wh6, implicit-def $lc, implicit $p0, implicit killed $r0 { + ; CHECK-NEXT: $wh6 = VLDA_dmw_lda_w_ag_idx_imm $p0, 32 + ; CHECK-NEXT: $lc = ADD_NC killed $r0, -5 + ; CHECK-NEXT: } + ; CHECK-NEXT: BUNDLE implicit-def $wl6, implicit-def $p0, implicit-def $ls, implicit killed $p0, implicit $m0 { + ; CHECK-NEXT: $wl6, $p0 = VLDA_dmw_lda_w_ag_pstm_nrm_imm killed $p0, $m0 + ; CHECK-NEXT: $ls = MOVXM_lng_cg %bb.2 + ; CHECK-NEXT: } + ; CHECK-NEXT: BUNDLE implicit-def $wh6, implicit-def $le, implicit $p0 { + ; CHECK-NEXT: $wh6 = VLDA_dmw_lda_w_ag_idx_imm $p0, 32 + ; CHECK-NEXT: $le = MOVXM_lng_cg ; CHECK-NEXT: } - ; CHECK-NEXT: $wh6 = VLDA_dmw_lda_w_ag_idx_imm $p0, 32 - ; CHECK-NEXT: $wl6, $p0 = VLDA_dmw_lda_w_ag_pstm_nrm_imm killed $p0, $m0 - ; CHECK-NEXT: $wh6 = VLDA_dmw_lda_w_ag_idx_imm $p0, 32 - ; CHECK-NEXT: $wl6, $p0 = VLDA_dmw_lda_w_ag_pstm_nrm_imm killed $p0, $m0 - ; CHECK-NEXT: $wh6 = VLDA_dmw_lda_w_ag_idx_imm $p0, 32 ; CHECK-NEXT: $wl6, $p0 = VLDA_dmw_lda_w_ag_pstm_nrm_imm killed $p0, $m0 ; CHECK-NEXT: $wh6 = VLDA_dmw_lda_w_ag_idx_imm $p0, 32 ; CHECK-NEXT: $wl6, $p0 = VLDA_dmw_lda_w_ag_pstm_nrm_imm killed $p0, $m0 + ; CHECK-NEXT: BUNDLE implicit-def $wh6, implicit-def $x1, implicit-def 
$wl1, implicit-def $wh1, implicit $p0, implicit killed $r1 { + ; CHECK-NEXT: $wh6 = VLDA_dmw_lda_w_ag_idx_imm $p0, 32 + ; CHECK-NEXT: $x1 = VBCST_16 killed $r1 + ; CHECK-NEXT: } + ; CHECK-NEXT: BUNDLE implicit-def $wl6, implicit-def $p0, implicit-def $x2, implicit-def $wl2, implicit-def $wh2, implicit killed $p0, implicit $m0, implicit killed $r2 { + ; CHECK-NEXT: $wl6, $p0 = VLDA_dmw_lda_w_ag_pstm_nrm_imm killed $p0, $m0 + ; CHECK-NEXT: $x2 = VBCST_16 killed $r2 + ; CHECK-NEXT: } ; CHECK-NEXT: BUNDLE implicit-def $wh6, implicit-def $x5, implicit-def $wl5, implicit-def $wh5, implicit $p0, implicit killed $x6, implicit $x1, implicit $r8 { ; CHECK-NEXT: $wh6 = VLDA_dmw_lda_w_ag_idx_imm $p0, 32 ; CHECK-NEXT: $x5 = VSHUFFLE killed $x6, $x1, $r8 ; CHECK-NEXT: } - ; CHECK-NEXT: BUNDLE implicit-def $wl6, implicit-def $p0, implicit-def $x5, implicit-def $wl5, implicit-def $wh5, implicit killed $p0, implicit $m0, implicit killed $x5, implicit $x2, implicit $r8 { + ; CHECK-NEXT: BUNDLE implicit-def $wl6, implicit-def $p0, implicit-def $r4, implicit-def $x5, implicit-def $wl5, implicit-def $wh5, implicit killed $p0, implicit $m0, implicit killed $x5, implicit $x2, implicit $r8 { ; CHECK-NEXT: $wl6, $p0 = VLDA_dmw_lda_w_ag_pstm_nrm_imm killed $p0, $m0 + ; CHECK-NEXT: $r4 = MOVX_alu_cg 0 ; CHECK-NEXT: $x5 = VSHUFFLE killed $x5, $x2, $r8 ; CHECK-NEXT: } ; CHECK-NEXT: {{ $}} diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/large-II.mir b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/large-II.mir index 1372b1a1a7a7..793d9bb52aa2 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/large-II.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/large-II.mir @@ -28,17 +28,16 @@ ; CHECK-NEXT: nop // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 ; CHECK-NEXT: // %bb.1: // %for.body.preheader - ; CHECK-NEXT: add.nc lc, r0, #-1 - ; CHECK-NEXT: movxm ls, #.LBB0_2 - ; CHECK-NEXT: movxm le, #.L_LEnd0 - ; CHECK-NEXT: nopb ; lda r0, [p0], #4; nops ; nopxm ; 
nopv + ; CHECK-NEXT: nopb ; nopa ; nops ; movxm ls, #.LBB0_2; nopv + ; CHECK-NEXT: nopa ; add.nc lc, r0, #-1 + ; CHECK-NEXT: lda r0, [p0], #4; movxm le, #.L_LEnd0 ; CHECK-NEXT: nopb ; lda r1, [p0], #4; nops ; nopxm ; nopv ; CHECK-NEXT: nopb ; lda r2, [p0], #4; nops ; nopxm ; nopv ; CHECK-NEXT: nopb ; lda r3, [p0], #4; nops ; nopxm ; nopv ; CHECK-NEXT: nopb ; lda r4, [p0], #4; nops ; nopxm ; nopv ; CHECK-NEXT: nopb ; lda r5, [p0], #4; nops ; nopxm ; nopv - ; CHECK-NEXT: lda r6, [p0], #4; nopxm - ; CHECK-NEXT: lda r7, [p0], #4; st r0, [p1], #4 + ; CHECK-NEXT: nopb ; lda r6, [p0], #4; nops ; nopxm ; nopv + ; CHECK-NEXT: nopb ; lda r7, [p0], #4; st r0, [p1], #4; nopxm ; nopv ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_2: // %for.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/ld2-mv-mac2-st2.mir b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/ld2-mv-mac2-st2.mir index 7656c74e6224..a4ad165be6a9 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/ld2-mv-mac2-st2.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/ld2-mv-mac2-st2.mir @@ -23,21 +23,18 @@ ; CHECK-NEXT: nop // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 ; CHECK-NEXT: // %bb.1: // %for.body.preheader - ; CHECK-NEXT: add.nc lc, r0, #-6 - ; CHECK-NEXT: movxm ls, #.LBB0_2 - ; CHECK-NEXT: movxm le, #.L_LEnd0 - ; CHECK-NEXT: vldb wl4, [p0], m2; nopa ; nops ; nopxm ; nopv - ; CHECK-NEXT: vldb.2d wl4, [p0], d0; nopa ; nops ; nopxm ; nopv - ; CHECK-NEXT: vldb wl4, [p0], m2; nopa ; nops ; nopxm ; nopv - ; CHECK-NEXT: vldb.2d wl4, [p0], d0; nopa ; nops ; nopxm ; nopv ; CHECK-NEXT: vldb wl4, [p0], m2; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: vldb.2d wl4, [p0], d0; nopx + ; CHECK-NEXT: vldb wl4, [p0], m2; add.nc lc, r0, #-6 + ; CHECK-NEXT: vldb.2d wl4, [p0], d0; movxm ls, #.LBB0_2 + ; CHECK-NEXT: vldb wl4, [p0], m2; movxm le, #.L_LEnd0 ; CHECK-NEXT: vldb.2d wl4, [p0], d0; nopa ; nops ; nopx ; vmov wh4, wl0; nopv ; 
CHECK-NEXT: vldb wl4, [p0], m2; nopa ; nops ; nopxm ; nopv - ; CHECK-NEXT: vldb.2d wl4, [p0], d0; nopx ; vmov wh4, wl0; vmac cm1, cm0, x4, x2, r0 - ; CHECK-NEXT: vldb wl4, [p0], m2; vmac cm2, cm0, x4, x2, r0 - ; CHECK-NEXT: vldb.2d wl4, [p0], d0; vmov wh4, wl0; vmac cm1, cm0, x4, x2, r0 - ; CHECK-NEXT: vldb wl4, [p0], m2; vmac cm2, cm0, x4, x2, r0 - ; CHECK-NEXT: vldb.2d wl4, [p0], d0; vmov wh4, wl0; vmac cm1, cm0, x4, x2, r0 + ; CHECK-NEXT: vldb.2d wl4, [p0], d0; nopa ; nops ; nopx ; vmov wh4, wl0; vmac cm1, cm0, x4, x2, r0 + ; CHECK-NEXT: vldb wl4, [p0], m2; nopa ; nops ; nopxm ; vmac cm2, cm0, x4, x2, r0 + ; CHECK-NEXT: vldb.2d wl4, [p0], d0; nopa ; nops ; nopx ; vmov wh4, wl0; vmac cm1, cm0, x4, x2, r0 + ; CHECK-NEXT: vldb wl4, [p0], m2; nopa ; nops ; nopxm ; vmac cm2, cm0, x4, x2, r0 + ; CHECK-NEXT: vldb.2d wl4, [p0], d0; nopa ; nops ; nopx ; vmov wh4, wl0; vmac cm1, cm0, x4, x2, r0 ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_2: // %for.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/load-add-or-store.mir b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/load-add-or-store.mir index 36b90f018879..9fb4e58fa039 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/load-add-or-store.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/load-add-or-store.mir @@ -25,17 +25,16 @@ ; CHECK-NEXT: nop // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 ; CHECK-NEXT: // %bb.1: // %for.body.preheader - ; CHECK-NEXT: add.nc lc, r0, #-4 - ; CHECK-NEXT: movxm ls, #.LBB0_2 - ; CHECK-NEXT: movxm le, #.L_LEnd0 - ; CHECK-NEXT: nopb ; lda r0, [p1], #4; nops ; nopxm ; nopv + ; CHECK-NEXT: nopb ; nopa ; nops ; movxm ls, #.LBB0_2; nopv + ; CHECK-NEXT: nopa ; add.nc lc, r0, #-4 + ; CHECK-NEXT: lda r0, [p1], #4; movxm le, #.L_LEnd0 ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv ; CHECK-NEXT: nopb ; lda r0, [p1], #4; nops ; nopxm ; nopv ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv ; CHECK-NEXT: 
nopb ; lda r0, [p1], #4; nops ; nopxm ; nopv ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv - ; CHECK-NEXT: lda r0, [p1], #4; nopb ; nopxm - ; CHECK-NEXT: add r1, r0, #1 + ; CHECK-NEXT: nopb ; lda r0, [p1], #4; nops ; nopxm ; nopv + ; CHECK-NEXT: nopb ; nopa ; nops ; add r1, r0, #1; nopm ; nopv ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_2: // %for.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/load-add-store-renamed.mir b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/load-add-store-renamed.mir index 1ea1e604ca9c..a612670d1e1a 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/load-add-store-renamed.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/load-add-store-renamed.mir @@ -24,17 +24,16 @@ ; CHECK-NEXT: nop // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 ; CHECK-NEXT: // %bb.1: // %for.body.preheader - ; CHECK-NEXT: add.nc lc, r0, #-8 - ; CHECK-NEXT: movxm ls, #.LBB0_2 - ; CHECK-NEXT: movxm le, #.L_LEnd0 + ; CHECK-NEXT: nopb ; nopa ; nops ; movxm ls, #.LBB0_2; nopv + ; CHECK-NEXT: nopa ; add.nc lc, r0, #-8 + ; CHECK-NEXT: lda r0, [p1], #4; movxm le, #.L_LEnd0 ; CHECK-NEXT: nopb ; lda r0, [p1], #4; nops ; nopxm ; nopv ; CHECK-NEXT: nopb ; lda r0, [p1], #4; nops ; nopxm ; nopv ; CHECK-NEXT: nopb ; lda r0, [p1], #4; nops ; nopxm ; nopv ; CHECK-NEXT: nopb ; lda r0, [p1], #4; nops ; nopxm ; nopv ; CHECK-NEXT: nopb ; lda r0, [p1], #4; nops ; nopxm ; nopv ; CHECK-NEXT: nopb ; lda r0, [p1], #4; nops ; nopxm ; nopv - ; CHECK-NEXT: lda r0, [p1], #4; nopxm - ; CHECK-NEXT: lda r0, [p1], #4; add r1, r0, #1 + ; CHECK-NEXT: nopb ; lda r0, [p1], #4; nops ; add r1, r0, #1; nopm ; nopv ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_2: // %for.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/load-add-store.mir b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/load-add-store.mir index 684445c951ec..236f58737c6a 
100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/load-add-store.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/load-add-store.mir @@ -25,17 +25,16 @@ ; CHECK-NEXT: nop // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 ; CHECK-NEXT: // %bb.1: // %for.body.preheader - ; CHECK-NEXT: add.nc lc, r0, #-4 - ; CHECK-NEXT: movxm ls, #.LBB0_2 - ; CHECK-NEXT: movxm le, #.L_LEnd0 - ; CHECK-NEXT: nopb ; lda r0, [p1], #4; nops ; nopxm ; nopv + ; CHECK-NEXT: nopb ; nopa ; nops ; movxm ls, #.LBB0_2; nopv + ; CHECK-NEXT: nopa ; add.nc lc, r0, #-4 + ; CHECK-NEXT: lda r0, [p1], #4; movxm le, #.L_LEnd0 ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv ; CHECK-NEXT: nopb ; lda r0, [p1], #4; nops ; nopxm ; nopv ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv ; CHECK-NEXT: nopb ; lda r0, [p1], #4; nops ; nopxm ; nopv ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv - ; CHECK-NEXT: lda r0, [p1], #4; nopb ; nopxm - ; CHECK-NEXT: add r0, r0, #1 + ; CHECK-NEXT: nopb ; lda r0, [p1], #4; nops ; nopxm ; nopv + ; CHECK-NEXT: nopb ; nopa ; nops ; add r0, r0, #1; nopm ; nopv ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_2: // %for.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/load-mac-store.mir b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/load-mac-store.mir index 684445c951ec..236f58737c6a 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/load-mac-store.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/load-mac-store.mir @@ -25,17 +25,16 @@ ; CHECK-NEXT: nop // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 ; CHECK-NEXT: // %bb.1: // %for.body.preheader - ; CHECK-NEXT: add.nc lc, r0, #-4 - ; CHECK-NEXT: movxm ls, #.LBB0_2 - ; CHECK-NEXT: movxm le, #.L_LEnd0 - ; CHECK-NEXT: nopb ; lda r0, [p1], #4; nops ; nopxm ; nopv + ; CHECK-NEXT: nopb ; nopa ; nops ; movxm ls, #.LBB0_2; nopv + ; CHECK-NEXT: nopa ; add.nc lc, r0, #-4 + ; CHECK-NEXT: lda r0, [p1], #4; movxm le, #.L_LEnd0 ; 
CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv ; CHECK-NEXT: nopb ; lda r0, [p1], #4; nops ; nopxm ; nopv ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv ; CHECK-NEXT: nopb ; lda r0, [p1], #4; nops ; nopxm ; nopv ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv - ; CHECK-NEXT: lda r0, [p1], #4; nopb ; nopxm - ; CHECK-NEXT: add r0, r0, #1 + ; CHECK-NEXT: nopb ; lda r0, [p1], #4; nops ; nopxm ; nopv + ; CHECK-NEXT: nopb ; nopa ; nops ; add r0, r0, #1; nopm ; nopv ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_2: // %for.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/round.mir b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/round.mir index ee49118a134a..789ef09bdcaa 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/round.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/round.mir @@ -34,18 +34,16 @@ ; CHECK-NEXT: nop // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 ; CHECK-NEXT: // %bb.1: // %for.body.preheader - ; CHECK-NEXT: add.nc lc, r0, #-1 - ; CHECK-NEXT: movxm ls, #.LBB0_2 - ; CHECK-NEXT: movxm le, #.L_LEnd0 - ; CHECK-NEXT: nopb ; vlda.ups.s32.s8 cm0, s0, [p0], #32; nops ; nopxm ; nopv - ; CHECK-NEXT: nopb ; vlda.ups.s32.s8 cm1, s0, [p0], #32; nops ; nopxm ; nopv + ; CHECK-NEXT: nopa ; nopb ; nopx ; add.nc lc, r0, #-1 + ; CHECK-NEXT: vlda.ups.s32.s8 cm0, s0, [p0], #32; movxm ls, #.LBB0_2 + ; CHECK-NEXT: vlda.ups.s32.s8 cm1, s0, [p0], #32; movxm le, #.L_LEnd0 + ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv - ; CHECK-NEXT: nopa ; nopb ; nopxm - ; CHECK-NEXT: nop - ; CHECK-NEXT: nop ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_2: // %for.body ; CHECK-NEXT: // =>This Inner Loop 
Header: Depth=1 diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/small-II.mir b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/small-II.mir index 5f7788a680a4..dd4da83e4766 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/small-II.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/small-II.mir @@ -28,17 +28,16 @@ ; CHECK-NEXT: nop // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 ; CHECK-NEXT: // %bb.1: // %for.body.preheader - ; CHECK-NEXT: add.nc lc, r0, #-2 - ; CHECK-NEXT: movxm ls, #.LBB0_2 - ; CHECK-NEXT: movxm le, #.L_LEnd0 - ; CHECK-NEXT: nopb ; lda r0, [p0], #4; nops ; nopxm ; nopv + ; CHECK-NEXT: nopb ; nopa ; nops ; movxm ls, #.LBB0_2; nopv + ; CHECK-NEXT: nopa ; add.nc lc, r0, #-2 + ; CHECK-NEXT: lda r0, [p0], #4; movxm le, #.L_LEnd0 ; CHECK-NEXT: nopb ; lda r1, [p0], #4; nops ; nopxm ; nopv ; CHECK-NEXT: nopb ; lda r2, [p0], #4; nops ; nopxm ; nopv ; CHECK-NEXT: nopb ; lda r3, [p0], #4; nops ; nopxm ; nopv ; CHECK-NEXT: nopb ; lda r0, [p0], #4; nops ; nopxm ; nopv ; CHECK-NEXT: nopb ; lda r1, [p0], #4; nops ; nopxm ; nopv - ; CHECK-NEXT: lda r2, [p0], #4; nopxm - ; CHECK-NEXT: lda r3, [p0], #4; st r0, [p1], #4 + ; CHECK-NEXT: nopb ; lda r2, [p0], #4; nops ; nopxm ; nopv + ; CHECK-NEXT: nopb ; lda r3, [p0], #4; st r0, [p1], #4; nopxm ; nopv ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_2: // %for.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1