Skip to content

Commit

Permalink
[AIE] This is mainly Work In Progress adding a solver-based pipeliner
Browse files Browse the repository at this point in the history
The solver tries to find an (II,NStages) SWP schedule.
The variables represent the stage and modulo cycle in which each instruction
should run. From those we get linear expressions for the execution cycle,
which are used to generate linear constraints representing the dependencies
and their latencies.
Further constraints make sure every instruction is scheduled, and that only a
single instance of each slot and resource is used in every modulo cycle.

In practice, adding more constraints increases the runtime. In particular,
adding the conflicts for loop-carried dependences and memory bank conflicts
for conv2d_bf16-sized kernels has been seen to increase solver time to over
an hour, which is clearly not acceptable.

The solution is used to guide a regular postpipeliner strategy, which will
reject the solution if it violates constraints. This allows us to solve an
incompletely constrained problem and opportunistically apply the result if it fits.
  • Loading branch information
Martien de Jong committed Jan 10, 2025
1 parent fcfddf3 commit 754d949
Show file tree
Hide file tree
Showing 8 changed files with 743 additions and 1 deletion.
2 changes: 2 additions & 0 deletions clang/cmake/caches/Peano-AIE.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,8 @@ if(LLVM_BUILD_LLVM_DYLIB)
list(APPEND _llvm_distribution_components LLVM clang-cpp)
endif()

# Default LLVM_ENABLE_Z3_SOLVER to ON for this cache configuration.
# NOTE(review): presumably needed so the Z3-based AIE pipeliner solver can be
# built -- confirm that Z3 is available on all build hosts using this cache.
option(LLVM_ENABLE_Z3_SOLVER "" ON)

# there's some bug here where if you list(APPEND ...) to a CACHE variable
# it doesn't work (neither libLLVM nor clang-cpp were being successfully installed)
set(LLVM_DISTRIBUTION_COMPONENTS ${_llvm_distribution_components} CACHE STRING "")
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Target/AIE/AIEInterBlockScheduling.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -598,6 +598,7 @@ SchedulingStage InterBlockScheduling::updateScheduling(BlockState &BS) {
// But first try SWP
if (BS.getRegions().size() == 1) {
auto &PostSWP = BS.getPostSWP();
PostSWP.setUseSolver(true);
if (PostSWP.canAccept(*BS.TheBlock)) {
BS.FixPoint.II = PostSWP.getResMII(*BS.TheBlock);
return BS.FixPoint.Stage = SchedulingStage::Pipelining;
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Target/AIE/AIEMachineScheduler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1394,6 +1394,7 @@ void AIEScheduleDAGMI::schedule() {
BS.setPipelined();
LLVM_DEBUG(PostSWP.dump());
}
PostSWP.setUseSolver(false);
return;
}
default:
Expand Down
121 changes: 120 additions & 1 deletion llvm/lib/Target/AIE/AIEPostPipeliner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
//===----------------------------------------------------------------------===//

#include "AIEPostPipeliner.h"
#include "AIESWPSolver.h"
#include "AIESlotCounts.h"
#include "Utils/AIELoopUtils.h"
#include "llvm/CodeGen/ScheduleDAG.h"
Expand All @@ -23,6 +24,7 @@
#define DEBUG_FULL(X) DEBUG_WITH_TYPE("postpipeliner-full", X)

namespace llvm::AIE {
using namespace Solver;

static cl::opt<int>
Heuristic("aie-postpipeliner-heuristic",
Expand Down Expand Up @@ -59,6 +61,8 @@ class PostPipelineDumper : public PipelineScheduleVisitor {
/// Construct a PostPipeliner.
/// \param HR     hazard recognizer, recorded in the HR member.
/// \param NInstr instruction count, recorded in the NInstr member.
PostPipeliner::PostPipeliner(const AIEHazardRecognizer &HR, int NInstr)
: HR(HR), NInstr(NInstr) {}

void PostPipeliner::setUseSolver(bool Value) { UseSolver = Value; }

bool PostPipeliner::canAccept(MachineBasicBlock &LoopBlock) {
// We leave the single-block loop criterion to our caller. It is fulfilled
// by being a loopaware scheduling candidate.
Expand Down Expand Up @@ -115,11 +119,15 @@ bool PostPipeliner::canAccept(MachineBasicBlock &LoopBlock) {
return true;
}

static SlotCounts getSlotCounts(MachineInstr &MI, const AIEBaseInstrInfo *TII) {
/// Look up the slot info belonging to the slot kind of MI's opcode and return
/// its slot set. Yields 0 when TII has no slot info for that kind.
static uint64_t getSlotSet(MachineInstr &MI, const AIEBaseInstrInfo *TII) {
  if (auto *Slot = TII->getSlotInfo(TII->getSlotKind(MI.getOpcode()))) {
    return Slot->getSlotSet();
  }
  return 0;
}

/// Build a SlotCounts object from the slot set of MI.
static SlotCounts getSlotCounts(MachineInstr &MI, const AIEBaseInstrInfo *TII) {
  const auto Slots = getSlotSet(MI, TII);
  return SlotCounts{Slots};
}

int PostPipeliner::getResMII(MachineBasicBlock &LoopBlock) {
// Add up all slot requirements and return the maximum slot count
SlotCounts Counts;
Expand Down Expand Up @@ -668,6 +676,10 @@ bool PostPipeliner::tryHeuristics() {

DEBUG_SUMMARY(dbgs() << "-- MinLength=" << MinLength << "\n");

if (II == 16 && solve()) {
return true;
}

int HeuristicIndex = 0;
for (auto &[ExtraStages, TopDown, Rerun, Components] : Strategies) {
if (Heuristic >= 0 && Heuristic != HeuristicIndex++) {
Expand Down Expand Up @@ -702,6 +714,113 @@ bool PostPipeliner::tryHeuristics() {
return false;
}

// This is a strategy that follows a pre-computed schedule. It picks
// instructions in the order of the final schedule and nudges earliest and
// latest so as to have no slack.
// It still checks latencies and resources.
class FixedStrategy : public PostPipelinerStrategy {
  // Pre-computed cycle for each node, indexed by SUnit::NodeNum.
  std::vector<int> Schedule;

  // True if the pre-computed schedule holds a cycle for N.
  bool hasFixedCycle(const SUnit &N) const {
    return N.NodeNum < Schedule.size();
  }

  // We schedule in strict top-down order, and we leave only one cycle
  // to schedule it in.
  // Nodes without a pre-computed cycle are ordered after all nodes that have
  // one, so that Schedule is never indexed out of bounds and the relation
  // stays a strict weak ordering.
  bool better(const SUnit &A, const SUnit &B) override {
    const bool FixedA = hasFixedCycle(A);
    const bool FixedB = hasFixedCycle(B);
    if (!FixedA || !FixedB) {
      return FixedA && !FixedB;
    }
    return Schedule[A.NodeNum] < Schedule[B.NodeNum];
  }

  // The earliest cycle of the base strategy, raised to the pre-computed cycle
  // when there is one for N.
  int earliest(const SUnit &N) override {
    const int Base = PostPipelinerStrategy::earliest(N);
    return hasFixedCycle(N) ? std::max(Base, Schedule[N.NodeNum]) : Base;
  }

  // The latest cycle of the base strategy, lowered to the pre-computed cycle
  // when there is one for N.
  int latest(const SUnit &N) override {
    const int Base = PostPipelinerStrategy::latest(N);
    return hasFixedCycle(N) ? std::min(Base, Schedule[N.NodeNum]) : Base;
  }

public:
  // DAG, Info and Length are forwarded to PostPipelinerStrategy. Schedule
  // holds one cycle per node, indexed by NodeNum; it is copied exactly once.
  FixedStrategy(ScheduleDAGInstrs &DAG, std::vector<NodeInfo> &Info, int Length,
                const std::vector<int> &Schedule)
      : PostPipelinerStrategy(DAG, Info, Length), Schedule(Schedule) {}

  std::string name() override { return "FixedStrategy"; }
};

bool PostPipeliner::solve() {
if (!UseSolver) {
return false;
}

Z3Solver Solver;
Solver.setScheduleSize(II, 3);
for (int N = 0; N < NInstr; N++) {
SUnit &SU = DAG->SUnits[N];
auto *MI = SU.getInstr();
auto SlotSet = getSlotSet(*MI, TII);

// We assume we only have one slot bit
auto GetBit = [](uint64_t SlotSet) {
assert(SlotSet);
int SlotNo = 1;
while (!(SlotSet & 1)) {
SlotNo++;
SlotSet >>= 1;
}
assert(SlotSet == 1);
return SlotNo;
};
uint64_t MemoryBanks = HR.getMemoryBanks(MI);
unsigned Id = Solver.addInsn(GetBit(SlotSet), MemoryBanks);
assert(Id == SU.NodeNum);
for (auto Dep : SU.Preds) {
int From = Dep.getSUnit()->NodeNum;
if (From < NInstr) {
Solver.addLatency(From, N, Dep.getSignedLatency());
}
}
}

// Add loop-carried true dependences to future iterations. The iteration
// distance is taken into account
for (int N = 0; N < NInstr; N++) {
SUnit &SU = DAG->SUnits[N];
for (auto Dep : SU.Succs) {
if (Dep.getKind() != SDep::Data) {
// continue;
}
int To = Dep.getSUnit()->NodeNum;
if (To >= NInstr && To % NInstr != N) {
Solver.addLatency(N, To % NInstr, Dep.getSignedLatency(), To / NInstr);
}
}
}

if (!Solver.genModel()) {
return false;
}
auto Schedule = Solver.getCycles();
DEBUG_SUMMARY(dbgs() << "Solver found "; for (auto C
: Schedule) dbgs()
<< C << ", ";
dbgs() << "\n";);
FixedStrategy S{*DAG, Info, II * 3, Schedule};
resetSchedule(/*FullReset=*/true);
DEBUG_SUMMARY(dbgs() << "--- Strategy " << S.name() << "\n");
if (scheduleFirstIteration(S) && scheduleOtherIterations()) {
DEBUG_SUMMARY(dbgs() << " Strategy " << S.name() << " found II=" << II
<< "\n");
return true;
}

return false;
}

bool PostPipeliner::schedule(ScheduleDAGMI &TheDAG, int InitiationInterval) {
NTotalInstrs = TheDAG.SUnits.size();
assert(NTotalInstrs % NInstr == 0);
Expand Down
9 changes: 9 additions & 0 deletions llvm/lib/Target/AIE/AIEPostPipeliner.h
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,8 @@ class PostPipeliner {
// The instruction defining the tripcount
MachineInstr *TripCountDef = nullptr;

bool UseSolver = false;

// Basic modulo scheduling parameters
int NInstr;
int NCopies;
Expand Down Expand Up @@ -187,6 +189,9 @@ class PostPipeliner {
// this length will be a multiple of the InitiationInterval
int computeMinScheduleLength() const;

// try to find a solution using a solver
bool solve();

/// Try all heuristics, stop at the first that fits the II
/// If it returns true, a valid schedule is laid down in Info.
bool tryHeuristics();
Expand All @@ -210,6 +215,10 @@ class PostPipeliner {
public:
PostPipeliner(const AIEHazardRecognizer &HR, int NInstr);

// Specify whether to use a solver. Maybe for -O3, or pragma driven.
// Default is off
void setUseSolver(bool Value);

/// Check whether this is a suitable loop for the PostPipeliner. It also
/// leaves some useful information.
bool canAccept(MachineBasicBlock &LoopBlock);
Expand Down
Loading

0 comments on commit 754d949

Please sign in to comment.