Skip to content

Commit

Permalink
[AIEX] Make "bottom-fixed" instructions part of the sched region
Browse files Browse the repository at this point in the history
This means they will be placed by the scheduler, and once they are
placed, other "free" instructions can be bundled with them.
  • Loading branch information
gbossu committed Nov 18, 2024
1 parent 117d23b commit d4c8472
Show file tree
Hide file tree
Showing 18 changed files with 310 additions and 183 deletions.
63 changes: 47 additions & 16 deletions llvm/lib/Target/AIE/AIEBaseSubtarget.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -275,27 +275,14 @@ class RegionEndEdges : public ScheduleDAGMutation {
assert(EdgeLatency < DelaySlots);
EdgeLatency = DelaySlots + 1;
}

// Between writing Registers (lc, le, ls) and the end of the loop,
// there must be a distance of 112 bytes in terms of PM addresses.
// 112 bytes correspond to 7 fully-expanded 128-bit instructions and
// hence adding a latency of 8 from LoopStart to the ExitSU.
// We can subtract the number of bundles that interblock pushed into
// BottomInsert
// FIXME: this holds as long as we insert them unconditionally. If we
// integrate them with the bottom region, we just need to keep 8 away
// from ExitSU
if (TII->isZeroOverheadLoopSetupInstr(MI)) {
unsigned PatchCycles = 8;
if (DAG->getBB()) {
auto *Scheduler =
static_cast<AIEScheduleDAGMI *>(DAG)->getSchedImpl();
auto &InterBlock = Scheduler->getInterBlock();
unsigned InsertedCycles =
InterBlock.getBlockState(DAG->getBB()).BottomInsert.size();
PatchCycles =
PatchCycles >= InsertedCycles ? PatchCycles - InsertedCycles : 0;
}
EdgeLatency = std::max(EdgeLatency, PatchCycles);
const unsigned ZOLDistance = 8;
EdgeLatency = std::max(EdgeLatency, ZOLDistance);
}

ExitDep.setLatency(EdgeLatency);
Expand All @@ -318,6 +305,49 @@ class RegionEndEdges : public ScheduleDAGMutation {
};
};

class EmitFixedSUnits : public ScheduleDAGMutation {
public:
void apply(ScheduleDAGInstrs *DAG) override {
AIEPostRASchedStrategy *Scheduler =
static_cast<AIEScheduleDAGMI *>(DAG)->getSchedImpl();
auto *TII = static_cast<const AIEBaseInstrInfo *>(DAG->TII);
auto *ItinData = DAG->MF.getSubtarget().getInstrItineraryData();
const BlockState &BS =
Scheduler->getInterBlock().getBlockState(DAG->getBB());
const Region &CurRegion = BS.getCurrentRegion();

// First, create SUnits for all "fixed" instructions
unsigned DistToExitSU = 0;
for (MachineInstr &MI : reverse(CurRegion.bot_fixed_instrs())) {
Scheduler->addFixedSUnit(MI, /*IsTop=*/false, DistToExitSU);
++DistToExitSU;
}
DAG->makeMaps();

// Then, create create dependencies between "free" and "fixed" instructions
auto IsFreeSU = [Scheduler](const SUnit &SU) {
return Scheduler->isFreeSU(SU);
};
ArrayRef<AIE::MachineBundle> BotFixedBundles =
CurRegion.getBotFixedBundles();
for (SUnit &FreeSU : make_filter_range(DAG->SUnits, IsFreeSU)) {
const MachineInstr &MI = *FreeSU.getInstr();
MachineInstr *FixedDepMI =
AIE::findEarliestRef(MI, BotFixedBundles, BotFixedBundles.size()).MI;
if (!FixedDepMI)
continue;

SUnit *FixedDepSU =
DAG->getSUnit(&*getBundleStart(FixedDepMI->getIterator()));
assert(FixedDepSU && "Fixed Bundle has no corresponding SU.");
SDep Dep(&FreeSU, SDep::Artificial);
Dep.setLatency(
AIE::maxLatency(&MI, *TII, *ItinData, /*IncludeStages=*/true));
FixedDepSU->addPred(Dep, /*Required=*/true);
}
}
};

/// Collect all "weak" edges in a separate vector. This allows modifying
/// \p SU.Preds without invalidating iterators.
SmallVector<SDep, 4> getWeakPreds(SUnit &SU) {
Expand Down Expand Up @@ -664,6 +694,7 @@ AIEBaseSubtarget::getPostRAMutationsImpl(const Triple &TT) {
Mutations.emplace_back(std::make_unique<MemoryEdges>());
Mutations.emplace_back(std::make_unique<MachineSchedWAWEdges>());
Mutations.emplace_back(std::make_unique<BiasDepth>());
Mutations.emplace_back(std::make_unique<EmitFixedSUnits>());
}
return Mutations;
}
Expand Down
4 changes: 3 additions & 1 deletion llvm/lib/Target/AIE/AIEInterBlockScheduling.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -717,7 +717,9 @@ void InterBlockScheduling::enterRegion(MachineBasicBlock *BB,
if (BS.Kind != BlockType::Loop ||
BS.FixPoint.Stage == SchedulingStage::GatheringRegions) {
ArrayRef<MachineBundle> TopFixedBundles;
ArrayRef<MachineBundle> BotFixedBundles;
ArrayRef<MachineBundle> BotFixedBundles =
RegionEnd == BB->end() ? ArrayRef<MachineBundle>(BS.BottomInsert)
: ArrayRef<MachineBundle>();
BS.addRegion(BB, RegionBegin, RegionEnd, TopFixedBundles, BotFixedBundles);
}
}
Expand Down
116 changes: 103 additions & 13 deletions llvm/lib/Target/AIE/AIEMachineScheduler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -270,6 +270,7 @@ void AIEPostRASchedStrategy::initializeBotScoreBoard(ScoreboardTrust Trust) {
assert(!doMBBSchedRegionsTopDown());
AIEHazardRecognizer *BotHazardRec = getAIEHazardRecognizer(Bot);
const int Depth = BotHazardRec->getMaxLookAhead();
assert(unsigned(Depth) >= BotHazardRec->getPipelineDepth());

/// These lambdas are an abstraction of the scoreboard manipulations,
/// hiding the details of the implementation. In particular, we need to
Expand Down Expand Up @@ -479,27 +480,78 @@ SUnit *AIEPostRASchedStrategy::pickNodeAndCycle(
}

/// Maximum number of cycles by which an instruction may be placed away from
/// the current cycle of \p Zone.
///
/// \returns 0 for a top zone, or when the zone's current cycle has reached
/// RegionBottomUpCycles - 1. Otherwise the smallest of: the cycles left
/// before RegionBottomUpCycles - 1, the hazard recognizer's maximum
/// look-ahead, and the BottomUpDelta option.
int AIEPostRASchedStrategy::getMaxDeltaCycles(const SchedBoundary &Zone) const {
  // Top-down scheduling does not support DeltaCycles. This function is
  // queried for both zones, so a top zone must yield 0 instead of asserting.
  if (Zone.isTop() || Zone.getCurrCycle() >= RegionBottomUpCycles - 1)
    return 0;
  return std::min({int(RegionBottomUpCycles - 1 - Zone.getCurrCycle()),
                   int(getAIEHazardRecognizer(Zone)->getMaxLookAhead()),
                   BottomUpDelta.getValue()});
}

/// Number of instructions already emitted at one end of the region of
/// \p DAG: between DAG->begin() and DAG->top() when \p IsTop is set,
/// between DAG->bottom() and DAG->end() otherwise.
/// \returns 0 when the corresponding boundary iterator is not valid.
unsigned getNumEmittedInstrs(ScheduleDAGMI *DAG, bool IsTop) {
  if (IsTop) {
    if (!DAG->top().isValid())
      return 0;
    return std::distance(DAG->begin(), DAG->top());
  }
  if (!DAG->bottom().isValid())
    return 0;
  return std::distance(DAG->bottom(), DAG->end());
}

/// Returns the SUnit of the next bottom-fixed instruction that still has to
/// be placed, or nullptr if \p Zone is a top zone or if at least as many
/// instructions were emitted at the bottom as the current region has
/// bottom-fixed instructions.
SUnit *AIEPostRASchedStrategy::getNextUnscheduledFixedInstr(
    const SchedBoundary &Zone) const {
  if (Zone.isTop())
    return nullptr;

  const Region &Reg = InterBlock.getBlockState(CurMBB).getCurrentRegion();
  const unsigned NumEmitted = getNumEmittedInstrs(DAG, /*IsTop=*/false);
  if (NumEmitted >= Reg.getNumBotFixedInstructions())
    return nullptr;

  // The next fixed instruction is the one right above the current bottom of
  // the DAG, or above the end of the region if the bottom is not valid.
  auto BottomIt = DAG->bottom().isValid() ? DAG->bottom() : DAG->end();
  MachineInstr &NextMI = *std::prev(BottomIt);
  SUnit *NextSU = DAG->getSUnit(&NextMI);
  assert(NextSU);
  assert(NextSU->BotReadyCycle == NextSU->getHeight() &&
         "Fixed instruction won't be placed at the correct cycle");
  assert(Zone.getCurrCycle() <= NextSU->BotReadyCycle);
  return NextSU;
}

/// Whether \p SU is one of the fixed SUnits of the requested kind.
/// Top-fixed node numbers lie in [FirstTopFixedSU, FirstBotFixedSU), where
/// the upper bound falls back to the number of SUnits when there is no
/// bottom-fixed SUnit. Bottom-fixed node numbers lie in
/// [FirstBotFixedSU, LastBotFixedSU].
bool AIEPostRASchedStrategy::isFixedSU(const SUnit &SU, bool IsTop) const {
  if (!IsTop) {
    if (!FirstBotFixedSU || SU.NodeNum < *FirstBotFixedSU)
      return false;
    return SU.NodeNum <= LastBotFixedSU.value();
  }

  if (!FirstTopFixedSU || SU.NodeNum < *FirstTopFixedSU)
    return false;
  return SU.NodeNum < FirstBotFixedSU.value_or(DAG->SUnits.size());
}

/// Whether \p SU is a "free" SUnit, i.e. its node number is below the first
/// top-fixed and the first bottom-fixed node numbers. A missing bound is
/// replaced by the current number of SUnits in the DAG.
bool AIEPostRASchedStrategy::isFreeSU(const SUnit &SU) const {
  const unsigned NumSUnits = DAG->SUnits.size();
  const unsigned TopFixedBegin = FirstTopFixedSU.value_or(NumSUnits);
  const unsigned BotFixedBegin = FirstBotFixedSU.value_or(NumSUnits);
  return SU.NodeNum < std::min(TopFixedBegin, BotFixedBegin);
}

bool AIEPostRASchedStrategy::isAvailableNode(SUnit &SU, SchedBoundary &Zone,
bool /*VerifyReadyCycle*/) {
// Note we use signed integers to avoid wrap-around behavior.
const int MinDelta = -getMaxDeltaCycles(Zone);
const int ReadyCycle = std::max(Zone.getCurrCycle(), SU.BotReadyCycle);
const int CurrCycle = Zone.getCurrCycle();

// If the Zone has remaining fixed instructions, only one SU is available.
if (SUnit *FixedSU = getNextUnscheduledFixedInstr(Zone)) {
assert(!Zone.isTop() && "Fixed instructions only expected in Bot zone");
const int DeltaCycles = CurrCycle - ReadyCycle;
return FixedSU == &SU && DeltaCycles >= MinDelta;
}

// If SU is a fixed instruction in the other zone, it isn't available
if (isFixedSU(SU, !Zone.isTop()))
return false;

// Whether or not the zone is Top or Bot, verify if SU is ready to be
// scheduled in terms of cycle.
if (Zone.isTop())
return MachineSchedStrategy::isAvailableNode(SU, Zone,
/*VerifyReadyCycle=*/true);

// Note we use signed integers to avoid wrap-around behavior.
const int MinDelta = -getMaxDeltaCycles(Zone);
const int ReadyCycle = std::max(Zone.getCurrCycle(), SU.BotReadyCycle);
const int CurrCycle = Zone.getCurrCycle();

for (int DeltaCycles = CurrCycle - ReadyCycle; DeltaCycles >= MinDelta;
--DeltaCycles) {
// ReadyCycle is always greater or equal to the current cycle,
Expand Down Expand Up @@ -542,6 +594,10 @@ void AIEPostRASchedStrategy::enterMBB(MachineBasicBlock *MBB) {
// from a block is the bottom one. We reset this when leaving any
// region
IsBottomRegion = true;

// The block may have a timed region, append its instructions.
auto &BS = InterBlock.getBlockState(MBB);
InterBlock.emitInterBlockBottom(BS);
}

void AIEPostRASchedStrategy::commitBlockSchedule(MachineBasicBlock *BB) {
Expand All @@ -551,10 +607,11 @@ void AIEPostRASchedStrategy::commitBlockSchedule(MachineBasicBlock *BB) {
// scheduling region.
assert(BS.getRegions().empty() ||
0 == BS.getTop().getNumTopFixedInstructions());
assert(BS.getRegions().empty() ||
0 == BS.getBottom().getNumBotFixedInstructions());
assert(BS.BottomInsert.empty() ||
BS.BottomInsert.size() == BS.getBottom().getNumBotFixedInstructions());

// Safety margin, swp epilogue
// Note that the prologue is handled in a different way. See enterMBB.
InterBlock.emitInterBlockTop(BS);

if (BS.isPipelined()) {
Expand Down Expand Up @@ -582,8 +639,6 @@ void AIEPostRASchedStrategy::commitBlockSchedule(MachineBasicBlock *BB) {
AIEHazardRecognizer::applyBundles(Region.Bundles, BS.TheBlock);
}
}
// swp prologue
InterBlock.emitInterBlockBottom(BS);
}

void AIEPostRASchedStrategy::leaveMBB() {
Expand Down Expand Up @@ -634,6 +689,9 @@ void AIEPostRASchedStrategy::leaveRegion(const SUnit &ExitSU) {
RegionBegin = nullptr;
RegionEnd = nullptr;
IsBottomRegion = false;
FirstTopFixedSU = {};
FirstBotFixedSU = {};
LastBotFixedSU = {};
BS.advanceRegion();
DEBUG_BLOCKS(dbgs() << " << leaveRegion\n");
}
Expand Down Expand Up @@ -758,6 +816,11 @@ bool AIEPostRASchedStrategy::tryCandidate(SchedCandidate &Cand,
return true;
}

SchedBoundary &Zone = getSchedZone();
assert(!getNextUnscheduledFixedInstr(Zone) &&
"More than one available SUnit while not all fixed instructions have "
"been emitted.");

// Instructions with delay slots are critical and should be scheduled
// as soon as they are ready.
if (TryCand.SU->getInstr()->hasDelaySlot()) {
Expand All @@ -770,8 +833,6 @@ bool AIEPostRASchedStrategy::tryCandidate(SchedCandidate &Cand,
return false;
}

SchedBoundary &Zone = getSchedZone();

// Avoid serializing long latency dependence chains.
if (Cand.Policy.ReduceLatency && Zone.isTop() &&
tryLatency(TryCand, Cand, Zone)) {
Expand Down Expand Up @@ -1240,6 +1301,11 @@ void llvm::AIEPostRASchedStrategy::buildGraph(ScheduleDAGMI &DAG, AAResults *AA,
PressureDiffs *PDiffs,
LiveIntervals *LIS,
bool TrackLaneMasks) {

// Let's save the DAG already instead of waiting for initialize().
// Some DAG mutators might require a DAG to be set.
this->DAG = &DAG;

/// We are called after enterRegion, which will have recorded the semantic
/// order. We can't use the basic block order, since this may have changed
/// in earlier iterations of scheduling
Expand Down Expand Up @@ -1270,6 +1336,30 @@ void llvm::AIEPostRASchedStrategy::buildGraph(ScheduleDAGMI &DAG, AAResults *AA,
static_cast<AIEScheduleDAGMI &>(DAG).recordDbgInstrs(Region);
}

/// Creates a SUnit for the fixed instruction \p MI and pins it to a region
/// boundary with an artificial edge of latency \p Dist: EntrySU -> SU when
/// \p IsTop is set, SU -> ExitSU otherwise. The node number is recorded in
/// FirstTopFixedSU, or in FirstBotFixedSU / LastBotFixedSU.
void AIEPostRASchedStrategy::addFixedSUnit(MachineInstr &MI, bool IsTop,
                                           unsigned Dist) {
  assert(!(IsTop && FirstBotFixedSU) && "Top-fixed SUnits must be added first");
  // The SUnits vector must have spare capacity before a new unit is added.
  assert(DAG->SUnits.size() < DAG->SUnits.capacity() &&
         "SUnits need to be re-allocated.");
  const unsigned NewNum = DAG->initSUnit(MI).value();
  SUnit &NewSU = DAG->SUnits[NewNum];

  if (!IsTop) {
    // Bottom-fixed: the new SUnit becomes a predecessor of ExitSU.
    SDep ToExit(&NewSU, SDep::Artificial);
    ToExit.setLatency(Dist);
    DAG->ExitSU.addPred(ToExit);
    if (!FirstBotFixedSU)
      FirstBotFixedSU = NewNum;
    LastBotFixedSU = NewNum;
    return;
  }

  // Top-fixed: EntrySU becomes a predecessor of the new SUnit.
  SDep FromEntry(&DAG->EntrySU, SDep::Artificial);
  FromEntry.setLatency(Dist);
  NewSU.addPred(FromEntry);
  if (!FirstTopFixedSU)
    FirstTopFixedSU = NewNum;
}

bool AIEScheduleDAGMI::mayAlias(SUnit *SUa, SUnit *SUb, bool UseTBAA) {
BlockState &BS = getSchedImpl()->getInterBlock().getBlockState(getBB());
if (BS.FixPoint.Stage == SchedulingStage::Pipelining) {
Expand Down
22 changes: 22 additions & 0 deletions llvm/lib/Target/AIE/AIEMachineScheduler.h
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,18 @@ class AIEPostRASchedStrategy : public PostGenericScheduler {
RegPressureTracker *RPTracker, PressureDiffs *PDiffs,
LiveIntervals *LIS, bool TrackLaneMasks) override;

/// Adds a SUnit for the given fixed instruction and ties it to the region
/// boundary with an artificial edge: EntrySU -> SU for a top-fixed
/// instruction, SU -> ExitSU for a bottom-fixed one. The SU number is also
/// recorded in the First/Last fixed SU trackers of this strategy.
/// Top-fixed SUnits must be added before any bottom-fixed SUnit, and the
/// DAG's SUnits vector must have spare capacity (both are asserted).
/// \param MI The fixed instruction to create a SUnit for
/// \param IsTop Whether MI is fixed at the top or bottom of the region
/// \param Dist Distance from the top or bottom, used as the edge latency
void addFixedSUnit(MachineInstr &MI, bool IsTop, unsigned Dist);

/// Whether \p SU is fixed in a specific cycle of the given zone.
/// The check is purely based on SU.NodeNum and the recorded ranges of fixed
/// SU numbers; it returns false if no fixed SUnit of that kind was added.
/// \param IsTop Query the top-fixed (true) or bottom-fixed (false) SUnits.
bool isFixedSU(const SUnit &SU, bool IsTop) const;

/// Whether \p SU is free to be scheduled anywhere in the region.
/// (modulo dependencies and resource conflicts)
/// This holds when SU.NodeNum is smaller than the number of the first
/// top-fixed SUnit and of the first bottom-fixed SUnit, if there are any.
bool isFreeSU(const SUnit &SU) const;

/// Explicitly process regions backwards. The first scheduled region in
/// a block connects with successors.
bool doMBBSchedRegionsTopDown() const override { return false; }
Expand Down Expand Up @@ -119,6 +131,16 @@ class AIEPostRASchedStrategy : public PostGenericScheduler {
/// cycle of instructions to be scheduled.
int getMaxDeltaCycles(const SchedBoundary &Zone) const;

/// Return the next "fixed" instruction to place down.
/// \returns nullptr if \p Zone is a top zone, or if the number of
/// instructions already emitted at the bottom of the DAG has reached the
/// number of bottom-fixed instructions of the current region. Otherwise,
/// the SUnit of the instruction right above the current bottom of the DAG.
SUnit *getNextUnscheduledFixedInstr(const SchedBoundary &Zone) const;

/// SU numbers for fixed instructions.
/// "top" fixed SUnits belong in [FirstTopFixedSU,FirstBotFixedSU)
/// "bot" fixed SUnits belong in [FirstBotFixedSU,LastBotFixedSU]
/// An empty optional means no fixed SUnit of that kind has been added.
/// The values are set by addFixedSUnit() and cleared in leaveRegion().
std::optional<unsigned> FirstTopFixedSU;
std::optional<unsigned> FirstBotFixedSU;
std::optional<unsigned> LastBotFixedSU;

/// Keeps track of the current zone used for scheduling. See getSchedZone().
bool IsTopDown = true;

Expand Down
13 changes: 5 additions & 8 deletions llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/bitwisenot.mir
Original file line number Diff line number Diff line change
Expand Up @@ -13,19 +13,16 @@
define dso_local void @bitNot(ptr addrspace(5) noalias nocapture writeonly %d, ptr addrspace(6) noalias nocapture readonly %s, i32 noundef %n) local_unnamed_addr #0 {
; CHECK-LABEL: bitNot:
; CHECK: // %bb.0:
; CHECK-NEXT: add.nc lc, r0, #-5
; CHECK-NEXT: movxm ls, #.LBB0_2
; CHECK-NEXT: movxm le, #.L_LEnd0
; CHECK-NEXT: vldb wh0, [p0, #32]; nopa ; nops ; nopxm ; nopv
; CHECK-NEXT: nopa ; vldb wh0, [p0, #32]; nopx ; add.nc lc, r0, #-5
; CHECK-NEXT: vldb wl0, [p0], #64; movxm ls, #.LBB0_2
; CHECK-NEXT: vldb wh0, [p0, #32]; movxm le, #.L_LEnd0
; CHECK-NEXT: vldb wl0, [p0], #64; nopa ; nops ; nopxm ; nopv
; CHECK-NEXT: vldb wh0, [p0, #32]; nopa ; nops ; nopxm ; nopv
; CHECK-NEXT: vldb wl0, [p0], #64; nopa ; nops ; nopxm ; nopv
; CHECK-NEXT: vldb wh0, [p0, #32]; nopa ; nops ; nopxm ; nopv
; CHECK-NEXT: vldb wl0, [p0], #64; nopa ; nops ; nopxm ; nopv
; CHECK-NEXT: vldb wh0, [p0, #32]; nopa ; nops ; nopxm ; nopv
; CHECK-NEXT: vldb wl0, [p0], #64; nopx
; CHECK-NEXT: vldb wh0, [p0, #32]; vbneg_ltz.s16 x1, r21, x0
; CHECK-NEXT: vldb wl0, [p0], #64
; CHECK-NEXT: vldb wh0, [p0, #32]; nopa ; nops ; nopx ; vbneg_ltz.s16 x1, r21, x0; nopv
; CHECK-NEXT: vldb wl0, [p0], #64; nopa ; nops ; nopxm ; nopv
; CHECK-NEXT: .LBB0_1: // %for.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldb wh0, [p0, #32]; nopa ; vst wh1, [p1, #32]; nopx ; vbneg_ltz.s16 x1, r21, x0; nopv
Expand Down
Loading

0 comments on commit d4c8472

Please sign in to comment.