From 6ba67b9f8ec5d9f5c62e16fddba3647f4855b202 Mon Sep 17 00:00:00 2001 From: Martien de Jong Date: Fri, 10 Jan 2025 10:28:31 +0100 Subject: [PATCH 1/4] [AIE] This is mainly Work In Progress adding a solver-based pipeliner The solver tries to find an (II,NStages) SWP schedule. The variables represent the stage and modulo cycle in which each instruction should run. From those we get linear expressions for the execution cycle, which are used to generate linear constraints representing the dependencies and their latencies. Further constraints make sure every instruction is scheduled, and that only a single instance of each slot and resource is used in every modulo cycle. In practice, adding more constraints increases the runtime. In particular, adding the conflicts for loop-carried dependences and memory bank conflicts for conv2d_bf16-sized kernels has been seen to increase solver time to over an hour, which is clearly not acceptable. The solution is used to guide a regular postpipeliner strategy, which will reject the solution if it violates constraints. 
This allows us to solve an incompletely constrained problem and opportunistically apply it if it fits --- clang/cmake/caches/Peano-AIE.cmake | 2 + .../Target/AIE/AIEInterBlockScheduling.cpp | 1 + llvm/lib/Target/AIE/AIEMachineScheduler.cpp | 1 + llvm/lib/Target/AIE/AIEPostPipeliner.cpp | 121 ++++- llvm/lib/Target/AIE/AIEPostPipeliner.h | 9 + llvm/lib/Target/AIE/AIESWPSolver.cpp | 434 ++++++++++++++++++ llvm/lib/Target/AIE/AIESWPSolver.h | 175 +++++++ llvm/lib/Target/AIE/CMakeLists.txt | 1 + 8 files changed, 743 insertions(+), 1 deletion(-) create mode 100644 llvm/lib/Target/AIE/AIESWPSolver.cpp create mode 100644 llvm/lib/Target/AIE/AIESWPSolver.h diff --git a/clang/cmake/caches/Peano-AIE.cmake b/clang/cmake/caches/Peano-AIE.cmake index 9c0d08cb0e03..99e8b918980a 100644 --- a/clang/cmake/caches/Peano-AIE.cmake +++ b/clang/cmake/caches/Peano-AIE.cmake @@ -61,6 +61,8 @@ if(LLVM_BUILD_LLVM_DYLIB) list(APPEND _llvm_distribution_components LLVM clang-cpp) endif() +option(LLVM_ENABLE_Z3_SOLVER "" ON) + # there's some bug here where if you list(APPEND ...) 
to a CACHE variable # it doesn't work (neither libLLVM nor clang-cpp were being successfully installed) set(LLVM_DISTRIBUTION_COMPONENTS ${_llvm_distribution_components} CACHE STRING "") diff --git a/llvm/lib/Target/AIE/AIEInterBlockScheduling.cpp b/llvm/lib/Target/AIE/AIEInterBlockScheduling.cpp index 024ecbd09b48..bb75f3f356a9 100644 --- a/llvm/lib/Target/AIE/AIEInterBlockScheduling.cpp +++ b/llvm/lib/Target/AIE/AIEInterBlockScheduling.cpp @@ -598,6 +598,7 @@ SchedulingStage InterBlockScheduling::updateScheduling(BlockState &BS) { // But first try SWP if (BS.getRegions().size() == 1) { auto &PostSWP = BS.getPostSWP(); + PostSWP.setUseSolver(true); if (PostSWP.canAccept(*BS.TheBlock)) { BS.FixPoint.II = PostSWP.getResMII(*BS.TheBlock); return BS.FixPoint.Stage = SchedulingStage::Pipelining; diff --git a/llvm/lib/Target/AIE/AIEMachineScheduler.cpp b/llvm/lib/Target/AIE/AIEMachineScheduler.cpp index 57fa9ed2ab52..36a22b6544ea 100644 --- a/llvm/lib/Target/AIE/AIEMachineScheduler.cpp +++ b/llvm/lib/Target/AIE/AIEMachineScheduler.cpp @@ -1394,6 +1394,7 @@ void AIEScheduleDAGMI::schedule() { BS.setPipelined(); LLVM_DEBUG(PostSWP.dump()); } + PostSWP.setUseSolver(false); return; } default: diff --git a/llvm/lib/Target/AIE/AIEPostPipeliner.cpp b/llvm/lib/Target/AIE/AIEPostPipeliner.cpp index 48f194d0878b..5436a0c0b5e0 100644 --- a/llvm/lib/Target/AIE/AIEPostPipeliner.cpp +++ b/llvm/lib/Target/AIE/AIEPostPipeliner.cpp @@ -12,6 +12,7 @@ //===----------------------------------------------------------------------===// #include "AIEPostPipeliner.h" +#include "AIESWPSolver.h" #include "AIESlotCounts.h" #include "Utils/AIELoopUtils.h" #include "llvm/CodeGen/ScheduleDAG.h" @@ -23,6 +24,7 @@ #define DEBUG_FULL(X) DEBUG_WITH_TYPE("postpipeliner-full", X) namespace llvm::AIE { +using namespace Solver; static cl::opt Heuristic("aie-postpipeliner-heuristic", @@ -59,6 +61,8 @@ class PostPipelineDumper : public PipelineScheduleVisitor { PostPipeliner::PostPipeliner(const 
AIEHazardRecognizer &HR, int NInstr) : HR(HR), NInstr(NInstr) {} +void PostPipeliner::setUseSolver(bool Value) { UseSolver = Value; } + bool PostPipeliner::canAccept(MachineBasicBlock &LoopBlock) { // We leave the single-block loop criterion to our caller. It is fulfilled // by being a loopaware scheduling candidate. @@ -115,11 +119,15 @@ bool PostPipeliner::canAccept(MachineBasicBlock &LoopBlock) { return true; } -static SlotCounts getSlotCounts(MachineInstr &MI, const AIEBaseInstrInfo *TII) { +static uint64_t getSlotSet(MachineInstr &MI, const AIEBaseInstrInfo *TII) { auto *SlotInfo = TII->getSlotInfo(TII->getSlotKind(MI.getOpcode())); return SlotInfo ? SlotInfo->getSlotSet() : 0; } +static SlotCounts getSlotCounts(MachineInstr &MI, const AIEBaseInstrInfo *TII) { + return SlotCounts{getSlotSet(MI, TII)}; +} + int PostPipeliner::getResMII(MachineBasicBlock &LoopBlock) { // Add up all slot requirements and return the maximum slot count SlotCounts Counts; @@ -668,6 +676,10 @@ bool PostPipeliner::tryHeuristics() { DEBUG_SUMMARY(dbgs() << "-- MinLength=" << MinLength << "\n"); + if (II == 16 && solve()) { + return true; + } + int HeuristicIndex = 0; for (auto &[ExtraStages, TopDown, Rerun, Components] : Strategies) { if (Heuristic >= 0 && Heuristic != HeuristicIndex++) { @@ -702,6 +714,113 @@ bool PostPipeliner::tryHeuristics() { return false; } +// This is a strategy that follows a pre-computed schedule. it picks +// instructions in the order of the final schedule and nudges earliest and +// latest so as to have no slack. +// It still checks latencies and resources +class FixedStrategy : public PostPipelinerStrategy { + std::vector Schedule; + // We schedule in strict top-down order, and we leave only one cycle + // to schedule it in. 
+ bool better(const SUnit &A, const SUnit &B) override { + if (Schedule[A.NodeNum] < Schedule[B.NodeNum]) { + return true; + } + return false; + } + int earliest(const SUnit &N) override { + int Result = PostPipelinerStrategy::earliest(N); + unsigned NodeNum = N.NodeNum; + if (NodeNum < Schedule.size()) { + Result = std::max(Result, Schedule[NodeNum]); + } + return Result; + } + int latest(const SUnit &N) override { + int Result = PostPipelinerStrategy::latest(N); + unsigned NodeNum = N.NodeNum; + if (NodeNum < Schedule.size()) { + Result = std::min(Result, Schedule[NodeNum]); + } + return Result; + } + +public: + FixedStrategy(ScheduleDAGInstrs &DAG, std::vector &Info, int Length, + std::vector Schedule) + : PostPipelinerStrategy(DAG, Info, Length), Schedule(Schedule) {} + std::string name() override { return "FixedStrategy"; } +}; + +bool PostPipeliner::solve() { + if (!UseSolver) { + return false; + } + + Z3Solver Solver; + Solver.setScheduleSize(II, 3); + for (int N = 0; N < NInstr; N++) { + SUnit &SU = DAG->SUnits[N]; + auto *MI = SU.getInstr(); + auto SlotSet = getSlotSet(*MI, TII); + + // We assume we only have one slot bit + auto GetBit = [](uint64_t SlotSet) { + assert(SlotSet); + int SlotNo = 1; + while (!(SlotSet & 1)) { + SlotNo++; + SlotSet >>= 1; + } + assert(SlotSet == 1); + return SlotNo; + }; + uint64_t MemoryBanks = HR.getMemoryBanks(MI); + unsigned Id = Solver.addInsn(GetBit(SlotSet), MemoryBanks); + assert(Id == SU.NodeNum); + for (auto Dep : SU.Preds) { + int From = Dep.getSUnit()->NodeNum; + if (From < NInstr) { + Solver.addLatency(From, N, Dep.getSignedLatency()); + } + } + } + + // Add loop-carried true dependences to future iterations. 
The iteration + // distance is taken into account + for (int N = 0; N < NInstr; N++) { + SUnit &SU = DAG->SUnits[N]; + for (auto Dep : SU.Succs) { + if (Dep.getKind() != SDep::Data) { + // continue; + } + int To = Dep.getSUnit()->NodeNum; + if (To >= NInstr && To % NInstr != N) { + Solver.addLatency(N, To % NInstr, Dep.getSignedLatency(), To / NInstr); + } + } + } + + if (!Solver.genModel()) { + return false; + } + auto Schedule = Solver.getCycles(); + DEBUG_SUMMARY(dbgs() << "Solver found "; for (auto C + : Schedule) dbgs() + << C << ", "; + dbgs() << "\n";); + FixedStrategy S{*DAG, Info, II * 3, Schedule}; + resetSchedule(/*FullReset=*/true); + DEBUG_SUMMARY(dbgs() << "--- Strategy " << S.name() << "\n"); + if (scheduleFirstIteration(S) && scheduleOtherIterations()) { + DEBUG_SUMMARY(dbgs() << " Strategy " << S.name() << " found II=" << II + << "\n"); + return true; + } + + return false; +} + bool PostPipeliner::schedule(ScheduleDAGMI &TheDAG, int InitiationInterval) { NTotalInstrs = TheDAG.SUnits.size(); assert(NTotalInstrs % NInstr == 0); diff --git a/llvm/lib/Target/AIE/AIEPostPipeliner.h b/llvm/lib/Target/AIE/AIEPostPipeliner.h index 5fa8ca8d7f49..92eaef899934 100644 --- a/llvm/lib/Target/AIE/AIEPostPipeliner.h +++ b/llvm/lib/Target/AIE/AIEPostPipeliner.h @@ -156,6 +156,8 @@ class PostPipeliner { // The instruction defining the tripcount MachineInstr *TripCountDef = nullptr; + bool UseSolver = false; + // Basic modulo scheduling parameters int NInstr; int NCopies; @@ -187,6 +189,9 @@ class PostPipeliner { // this length will be a multiple of the InitiationInterval int computeMinScheduleLength() const; + // try to find a solution using a solver + bool solve(); + /// Try all heuristics, stop at the first that fits the II /// If it returns true, a valid schedule is laid down in Info. bool tryHeuristics(); @@ -210,6 +215,10 @@ class PostPipeliner { public: PostPipeliner(const AIEHazardRecognizer &HR, int NInstr); + // Specify whether to use a solver. 
Maybe for -O3, or pragma driven. + // Default is off + void setUseSolver(bool Value); + /// Check whether this is a suitable loop for the PostPipeliner. It also /// leaves some useful information. bool canAccept(MachineBasicBlock &LoopBlock); diff --git a/llvm/lib/Target/AIE/AIESWPSolver.cpp b/llvm/lib/Target/AIE/AIESWPSolver.cpp new file mode 100644 index 000000000000..e8a3c2dd8167 --- /dev/null +++ b/llvm/lib/Target/AIE/AIESWPSolver.cpp @@ -0,0 +1,434 @@ +//===- AIESWPSolver.cpp - Solver infrastructure ===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// (c) Copyright 2025 Advanced Micro Devices, Inc. or its affiliates +// +//===----------------------------------------------------------------------===// +// This file contains an interface to create constraints to model a software +// pipelining problem. +//===----------------------------------------------------------------------===// + +#include "AIESWPSolver.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" + +#define DEBUG_TYPE "swpilp" + +namespace llvm::AIE::Solver { + +SWPSolver::~SWPSolver() {} + +Slot &SWPSolver::addSlot(int N) { + auto It = Slots.emplace(N, Slot(N)).first; + return It->second; +} +int SWPSolver::addInsn(int SlotNumber, uint64_t MemoryBanks) { + Slot *Slot = &addSlot(SlotNumber); + int NextInsn = Instructions.size(); + Instructions.emplace(NextInsn, Instruction{Slot, MemoryBanks}); + Slot->Instructions.insert(NextInsn); + return NextInsn; +} + +int SWPSolver::getDepth(int I) { return Instructions.at(I).Depth; } + +int SWPSolver::getHeight(int I) { return Instructions.at(I).Height; } +void SWPSolver::setDepth(int I, int D) { + auto &Insn = Instructions.at(I); + + int NewDepth = std::max(Insn.Depth, D); + + // fprintf (stderr, "Depth(%d) -> %d\n", I, NewDepth); + 
Insn.Depth = NewDepth; +} + +void SWPSolver::setHeight(int I, int H) { + auto &Insn = Instructions.at(I); + + int NewHeight = std::max(Insn.Height, H); + + // fprintf (stderr, "Height(%d) -> %d\n", I, NewHeight); + Insn.Height = NewHeight; +} + +bool SWPSolver::feasible(int I, int S, int C) { + int Cycle = S * II + C; + int Length = NumStages * II; + return Cycle >= getDepth(I) && Cycle < Length - getHeight(I); +} + +void SWPSolver::addLatency(int Src, int Dst, int Latency, int Distance) { + Latencies.emplace_back(Src, Dst, Latency, Distance); + if (Distance) { + return; + } + + // ad hoc constraint: push Depth of Dst up + // and push Height of Src up + // fprintf(stderr, "%d pushes\n", Src); + setDepth(Dst, getDepth(Src) + Latency); + setHeight(Src, getHeight(Dst) + Latency); +} + +void SWPSolver::setScheduleSize(int I, int S) { + II = I; + NumStages = S; +} + +void SWPSolver::slots() { + // For each slot, for each cycle, constrain to at most one use + for (auto &[SlotNo, Slot] : Slots) { + if (Slot.Instructions.empty()) { + continue; + } + + if (!genSlotConstraint(SlotNo, Slot)) { + return; + } + } +} +void SWPSolver::conflicts() { + for (auto &[M, I] : Instructions) { + if (!I.MemoryBanks) { + continue; + } + for (auto &[N, J] : Instructions) { + // Mutual exclusion is symmetric, there's only a conflict + // if there's an overlap in the memory banks and the conflict + // constraint is redundant if the slots already collide + if (N <= M || (I.MemoryBanks & J.MemoryBanks) == 0 || + I.TheSlot == J.TheSlot) { + continue; + } + + LLVM_DEBUG(dbgs() << "Bank conflict(" << M << ", " << N << ")\n"); + genConflict(M, N); + } + } +} + +void SWPSolver::latencies() { + // Just sorting the latency constraints like this makes z3 run >2x faster. 
+ std::sort(Latencies.begin(), Latencies.end(), + [&](const Latency &A, const Latency &B) { + if (A.Src != B.Src) { + return A.Src < B.Src; + } + if (A.Dst != B.Dst) { + return A.Dst < B.Dst; + } + return (A.Lat - A.Dist * II) > (B.Lat - B.Dist * II); + }); + + // The latencies between a given pair of nodes are now adjacent + // and descending, and we have at least one. We only need to + // generate a constraint for the first, largest, latency for a given pair. + std::optional Prev; + for (const auto &L : Latencies) { + if (Prev && L.Src == Prev->Src && L.Dst == Prev->Dst) { + dbgs() << "Skip latency constraint\n"; + continue; + } + if (!genLatencyConstraint(L)) { + return; + } + Prev = L; + } +} + +std::string Z3Solver::varname(int I, int S, int C) { + std::string Name = "I" + std::to_string(I) + "_" + std::to_string(S) + "_" + + std::to_string(C); + return Name; +} +std::string simpleName(const char *Prefix, int N) { + std::string Name = Prefix + std::to_string(N); + return Name; +} + +std::optional Z3Solver::vardecl(int I, int S, int C) { + return VarDecls[(I * NumStages + S) * II + C]; +} + +Z3Solver::Z3Solver() : Solver(Context), Zero(Context.int_val(0)) {} + +void Z3Solver::vars() { + // Create the scheduled vars + for (auto &[N, I] : Instructions) { + for (int S = 0; S < NumStages; S++) { + for (int C = 0; C < II; C++) { + std::optional OptVar; + if (feasible(N, S, C)) { + OptVar = Context.bool_const(varname(N, S, C).c_str()); + } + VarDecls.push_back(OptVar); + } + } + } +} +void Z3Solver::scheduled() { + for (auto &[N, I] : Instructions) { + z3::expr_vector Elements(Context); + for (int S = 0; S < NumStages; S++) { + for (int C = 0; C < II; C++) { + addVar(N, S, C, Elements); + } + } + Solver.add(z3::atmost(Elements, 1)); + Solver.add(z3::atleast(Elements, 1)); + } +} + +void Z3Solver::addVar(int N, int S, int C, z3::expr_vector &Elements) { + std::optional Var = vardecl(N, S, C); + if (Var) { + Elements.push_back(*Var); + } +} + +z3::expr 
Z3Solver::genCycle(int I) { + z3::expr_vector Elements(Context); + for (int S = 0; S < NumStages; S++) { + for (int C = 0; C < II; C++) { + std::optional Var = vardecl(I, S, C); + if (!Var) { + continue; + } + int LinearCycle = S * II + C; + if (!LinearCycle) { + continue; + } + z3::expr Factor = Context.int_val(LinearCycle); + Elements.push_back(z3::ite(*Var, Factor, Zero)); + } + } + return z3::sum(Elements); +} + +void Z3Solver::cycles() { + for (auto &[N, I] : Instructions) { + CycleExprs.push_back(genCycle(N)); + } +} + +bool Z3Solver::genLatencyConstraint(const Latency &L) { + std::cout << "Add " << L.Src << " --> " << L.Dst << " L=" << L.Lat + << " D=" << L.Dist << "\n"; + z3::expr Distance = Context.int_val(L.Lat - L.Dist * II); + Solver.add((CycleExprs[L.Dst] - CycleExprs[L.Src]) >= Distance); +#if 0 + if (Solver.check() != z3::sat) { + std::cout << " FAILED\n"; + return false; + } +#endif + return true; +} + +bool Z3Solver::genSlotConstraint(int SlotNo, const Slot &Slot) { + for (int C = 0; C < II; C++) { + z3::expr_vector Elements(Context); + for (int I : Slot.Instructions) { + for (int S = 0; S < NumStages; S++) { + addVar(I, S, C, Elements); + } + } + Solver.add(z3::atmost(Elements, 1)); + } + return true; +} + +void Z3Solver::genConflict(int M, int N) { + z3::expr_vector Elements(Context); + // All stages have a contribution to a particular cycle) + for (int S = 0; S < NumStages; S++) { + for (int C = 0; C < II; C++) { + addVar(M, S, C, Elements); + addVar(N, S, C, Elements); + } + } + Solver.add(z3::atmost(Elements, 1)); +} + +bool Z3Solver::genModel() { + vars(); + slots(); + scheduled(); + cycles(); + latencies(); + conflicts(); + + LLVM_DEBUG(dbgs() << "II=" << II << " NS=" << NumStages << "\n"); + switch (Solver.check()) { + case z3::unsat: + std::cout << "Unsatisfiable\n"; + return false; + + case z3::unknown: + std::cout << "Unknown\n"; + return false; + default: + return true; + } +} + +std::vector Z3Solver::getCycles() { + z3::model M = 
Solver.get_model(); + + std::cout << "Satisfied by:\n"; + // All the information is in the variables that are true + for (auto &V : VarDecls) { + if (V && M.eval(*V).is_true()) { + std::cout << V->decl().name() << "\n"; + } + } + std::vector Cycles; + for (auto &C : CycleExprs) { + auto Val = M.eval(C); + int IntVal; + Z3_get_numeral_int(Context, Val, &IntVal); + Cycles.push_back(IntVal); + } + return Cycles; +} + +void Z3IntegerSolver::vars() { + // Create the stage and cycle vars + for (auto &[N, I] : Instructions) { + z3::expr SV = Context.int_const(simpleName("S", N).c_str()); + StageVarDecls.push_back(SV); + z3::expr CV = Context.int_const(simpleName("C", N).c_str()); + CycleVarDecls.push_back(CV); + } +} + +void Z3IntegerSolver::scheduled() { + // Constrain the variables to fit in the target schedule size + for (auto &[N, I] : Instructions) { + z3::expr &SV = StageVarDecls[N]; + Solver.add(SV >= 0); + Solver.add(SV < NumStages); + z3::expr &CV = CycleVarDecls[N]; + Solver.add(CV >= 0); + Solver.add(CV < II); + } +} + +z3::expr Z3IntegerSolver::genCycle(int N) { + return StageVarDecls[N] * II + CycleVarDecls[N]; +} + +void Z3IntegerSolver::genConflict(int M, int N) { + Solver.add(CycleVarDecls[M] != CycleVarDecls[N]); +} + +bool Z3IntegerSolver::genSlotConstraint(int SlotNo, const Slot &Slot) { + for (int I : Slot.Instructions) { + for (int J : Slot.Instructions) { + if (J < I) { + Solver.add(CycleVarDecls[I] != CycleVarDecls[J]); + } + } + } + + return true; +} + +bool LPFile::genModel() { + printf("max: ;\n"); + scheduled(); + conflicts(); + latencies(); + vars(); + + // Can not extract results without calling external solver. + return false; +} + +// Each instruction needs to be scheduled in some cycle in some stage. 
+void LPFile::scheduled() { + for (auto &[I, Insn] : Instructions) { + printf("// scheduled I%d constraint\n", I); + const char *Plus = ""; + for (int S = 0; S < NumStages; S++) { + for (int C = 0; C < II; C++) { + if (!feasible(I, S, C)) { + continue; + } + printf(VarFmt, Plus, Insn.TheSlot->SlotNumber, I, S, C); + Plus = " + "; + } + } + printf(" = 1;\n"); + } +} + +// All variables need to be declared +void LPFile::vars() { + for (auto &[I, Insn] : Instructions) { + printf("// scheduled I%d vars\n", I); + const char *Comma = ""; + printf("sos "); + for (int S = 0; S < NumStages; S++) { + for (int C = 0; C < II; C++) { + if (!feasible(I, S, C)) { + continue; + } + printf(VarFmt, Comma, Insn.TheSlot->SlotNumber, I, S, C); + Comma = ", "; + } + } + printf(" <= 1;\n"); + } +} + +// Generate the expression for the (linear) cycle of an instruction +void LPFile::genCycle(int I, int Sign) { + for (int S = 0; S < NumStages; S++) { + for (int C = 0; C < II; C++) { + if (!feasible(I, S, C)) { + continue; + } + int Factor = Sign * (S * II + C); + if (!Factor) { + continue; + } + printf(" %+d ", Factor); + printf(VarFmt, "", Instructions.at(I).TheSlot->SlotNumber, I, S, C); + } + } +} + +// Only one occurrence of each slot in each cycle +bool LPFile::genSlotConstraint(int SlotNo, const Slot &Slot) { + for (int C = 0; C < II; C++) { + printf("\n// Slot %d, Cycle %d\n", SlotNo, C); + const char *Plus = ""; + for (int I : Slot.Instructions) { + for (int S = 0; S < NumStages; S++) { + printf(VarFmt, Plus, SlotNo, I, S, C); + Plus = " + "; + } + } + printf(" <= 1;\n"); + } + return true; +} + +bool LPFile::genLatencyConstraint(const Latency &L) { + printf("// Lat(%d -> %d) = %d (%d)\n", L.Src, L.Dst, L.Lat, L.Dist); + genCycle(L.Dst, 1); + genCycle(L.Src, -1); + printf(" >= %d;\n", L.Lat - L.Dist * II); + return true; +} + +void LPFile::genConflict(int M, int N) { + llvm_unreachable("LPFile should implement genConflict"); +} + +} // namespace llvm::AIE::Solver diff --git 
a/llvm/lib/Target/AIE/AIESWPSolver.h b/llvm/lib/Target/AIE/AIESWPSolver.h new file mode 100644 index 000000000000..26bd12e8eb01 --- /dev/null +++ b/llvm/lib/Target/AIE/AIESWPSolver.h @@ -0,0 +1,175 @@ +//===- AIESWPSolver.h - Software Pipeliner Solver infrastructure-----------===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// (c) Copyright 2025 Advanced Micro Devices, Inc. or its affiliates +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AIE_AIESWPSOLVER_H +#define LLVM_LIB_TARGET_AIE_AIESWPSOLVER_H + +#include "z3++.h" +#include +#include +#include +#include +#include + +namespace llvm::AIE::Solver { +class Slot { +public: + int SlotNumber; + Slot(int N) : SlotNumber(N) {} + std::set Instructions; +}; + +class Latency { +public: + int Src; + int Dst; + int Lat; + int Dist; + Latency(int S, int D, int L, int Dist) : Src(S), Dst(D), Lat(L), Dist(Dist) {} +}; + +class Instruction { +public: + int Depth = 0; + int Height = 0; + Slot *TheSlot; + uint64_t MemoryBanks = 0; + Instruction(Slot *S, uint64_t MemoryBanks) + : TheSlot(S), MemoryBanks(MemoryBanks) {} +}; + +class SWPSolver { +protected: + int II = 1; + int NumStages = 1; + int NSlots; + std::map Slots; + std::map Instructions; + std::vector Latencies; + + // Add a slot to the problem + Slot &addSlot(int N); + // Generate the latency constraints + void latencies(); + // Generate the slot constraints + void slots(); + // Generate further instruction conflict constraints + void conflicts(); + + int getDepth(int I); + int getHeight(int I); + void setDepth(int I, int D); + void setHeight(int I, int H); + + // Check whether a variable can be non-zero due to depth or height of the + // associated instruction + bool feasible(int I, int S, int C); + + virtual bool genModel() = 0; + // 
Generate the slot constraint for the given slot + virtual bool genSlotConstraint(int SlotNo, const Slot &Slot) = 0; + // Generate a constraint that represents a dependence latency + virtual bool genLatencyConstraint(const Latency &L) = 0; + // Generate a mutual exclusion constraint for instructions M and N in any + // cycle + virtual void genConflict(int I, int J) = 0; + + // Return the vector of instruction cycles + // \pre genModel() has returned true + virtual std::vector getCycles() { return {}; }; + +public: + virtual ~SWPSolver(); + // Add an instruction to the problem. It returns a unique Id + int addInsn(int Slot, uint64_t MemoryBanks); + // Add a latency between two instructions to the problem. + // Distance represents the iteration distance, i.e. the number of + // cfg backedges it spans. + void addLatency(int Src, int Dst, int Latency, int Distance = 0); + // Set the desired schedule size in terms of II and number of stages + void setScheduleSize(int II, int NS); +}; + +// In the binary formulation, we have a lot of binary variables, +// but relatively few, though elaborate constraints. +class Z3Solver : public SWPSolver { + std::vector> VarDecls; + +protected: + z3::context Context; + z3::solver Solver; + z3::expr Zero; + + // expression for the cycle of each instruction. + std::vector CycleExprs; + + std::string varname(int N, int S, int C); + std::optional vardecl(int N, int S, int C); + + // generate an expression that represents I running in modulo cycle C + z3::expr genModuloCycle(int I, int C); + + // Return a z3 expression that represents the cycle of an instruction + // in the linear schedule. 
+ virtual z3::expr genCycle(int I); + + // If it exists, add a schedule variable declaration to Elements + void addVar(int N, int S, int C, z3::expr_vector &Elements); + + // generate the variables for each instruction + virtual void vars(); + // generate the constraint that all instructions must be scheduled + virtual void scheduled(); + // generate expressions for the cycle of each instruction + virtual void cycles(); + // generate the constraint that only one instance of Slot is present in + // cycle C + bool genSlotConstraint(int SlotNo, const Slot &Slot) override; + bool genLatencyConstraint(const Latency &L) override; + void genConflict(int M, int N) override; + +public: + Z3Solver(); + bool genModel() override; + std::vector getCycles() override; +}; + +// In the integer formulation, we have an integer stage and cycle variable +// for each instruction. +// Constraints will be pretty compact, but we will have many of them, +// roughly quadratic in the number of instructions +// FIXME: subclass current Z3Solver as Z3BinarySolver +class Z3IntegerSolver : public Z3Solver { + std::vector StageVarDecls; + std::vector CycleVarDecls; + void vars() override; + void scheduled() override; + z3::expr genCycle(int N) override; + void genConflict(int M, int N) override; + bool genSlotConstraint(int SlotNo, const Slot &Slot) override; +}; + +class LPFile : public SWPSolver { + // Prefix, SlotNumber, InstrNr, Stage, Cycle + const char *const VarFmt = "%sV%d_%d_%d_%d"; + +public: + bool genModel() override; + void scheduled(); + void vars(); + void genCycle(int I, int Sign); + bool genSlotConstraint(int SlotNo, const Slot &Slot) override; + bool genLatencyConstraint(const Latency &L) override; + void genConflict(int M, int N) override; +}; + +} // namespace llvm::AIE::Solver + +#endif // LLVM_LIB_TARGET_AIE_AIESWPSOLVER_H diff --git a/llvm/lib/Target/AIE/CMakeLists.txt b/llvm/lib/Target/AIE/CMakeLists.txt index c914474b4990..4e4aaca0ebf7 100644 --- 
a/llvm/lib/Target/AIE/CMakeLists.txt +++ b/llvm/lib/Target/AIE/CMakeLists.txt @@ -122,6 +122,7 @@ add_llvm_target(AIECodeGen AIESplitInstructionRewriter.cpp AIESubRegConstrainer.cpp AIESubtarget.cpp + AIESWPSolver.cpp AIESuperRegRewriter.cpp AIETargetMachine.cpp AIETargetObjectFile.cpp From 2dd0bd72660fe12e6e69441ddb41fff319f3984c Mon Sep 17 00:00:00 2001 From: Martien de Jong Date: Tue, 14 Jan 2025 16:03:41 +0100 Subject: [PATCH 2/4] refactor into binary solver and integer solver --- llvm/lib/Target/AIE/AIEPostPipeliner.cpp | 4 +- llvm/lib/Target/AIE/AIESWPSolver.cpp | 100 +++++++++++------------ llvm/lib/Target/AIE/AIESWPSolver.h | 58 +++++++------ 3 files changed, 84 insertions(+), 78 deletions(-) diff --git a/llvm/lib/Target/AIE/AIEPostPipeliner.cpp b/llvm/lib/Target/AIE/AIEPostPipeliner.cpp index 5436a0c0b5e0..24b48c48d6a1 100644 --- a/llvm/lib/Target/AIE/AIEPostPipeliner.cpp +++ b/llvm/lib/Target/AIE/AIEPostPipeliner.cpp @@ -676,7 +676,7 @@ bool PostPipeliner::tryHeuristics() { DEBUG_SUMMARY(dbgs() << "-- MinLength=" << MinLength << "\n"); - if (II == 16 && solve()) { + if (II <= 10 && solve()) { return true; } @@ -757,7 +757,7 @@ bool PostPipeliner::solve() { return false; } - Z3Solver Solver; + Z3BinarySolver Solver; Solver.setScheduleSize(II, 3); for (int N = 0; N < NInstr; N++) { SUnit &SU = DAG->SUnits[N]; diff --git a/llvm/lib/Target/AIE/AIESWPSolver.cpp b/llvm/lib/Target/AIE/AIESWPSolver.cpp index e8a3c2dd8167..fd79c8716c7e 100644 --- a/llvm/lib/Target/AIE/AIESWPSolver.cpp +++ b/llvm/lib/Target/AIE/AIESWPSolver.cpp @@ -140,7 +140,31 @@ void SWPSolver::latencies() { } } -std::string Z3Solver::varname(int I, int S, int C) { +Z3Solver::Z3Solver() : Solver(Context), Zero(Context.int_val(0)) {} + +bool Z3Solver::genModel() { + vars(); + slots(); + scheduled(); + cycles(); + latencies(); + conflicts(); + + LLVM_DEBUG(dbgs() << "II=" << II << " NS=" << NumStages << "\n"); + switch (Solver.check()) { + case z3::unsat: + LLVM_DEBUG(dbgs() << "Unsatisfiable\n"); 
+ return false; + case z3::unknown: + LLVM_DEBUG(dbgs() << "Unknown\n"); + return false; + default: + LLVM_DEBUG(dbgs() << "Solved\n"); + return true; + } +} + +std::string Z3BinarySolver::varname(int I, int S, int C) { std::string Name = "I" + std::to_string(I) + "_" + std::to_string(S) + "_" + std::to_string(C); return Name; @@ -150,13 +174,11 @@ std::string simpleName(const char *Prefix, int N) { return Name; } -std::optional Z3Solver::vardecl(int I, int S, int C) { +std::optional Z3BinarySolver::vardecl(int I, int S, int C) { return VarDecls[(I * NumStages + S) * II + C]; } -Z3Solver::Z3Solver() : Solver(Context), Zero(Context.int_val(0)) {} - -void Z3Solver::vars() { +void Z3BinarySolver::vars() { // Create the scheduled vars for (auto &[N, I] : Instructions) { for (int S = 0; S < NumStages; S++) { @@ -170,7 +192,7 @@ void Z3Solver::vars() { } } } -void Z3Solver::scheduled() { +void Z3BinarySolver::scheduled() { for (auto &[N, I] : Instructions) { z3::expr_vector Elements(Context); for (int S = 0; S < NumStages; S++) { @@ -183,14 +205,14 @@ void Z3Solver::scheduled() { } } -void Z3Solver::addVar(int N, int S, int C, z3::expr_vector &Elements) { +void Z3BinarySolver::addVar(int N, int S, int C, z3::expr_vector &Elements) { std::optional Var = vardecl(N, S, C); if (Var) { Elements.push_back(*Var); } } -z3::expr Z3Solver::genCycle(int I) { +z3::expr Z3BinarySolver::genCycle(int I) { z3::expr_vector Elements(Context); for (int S = 0; S < NumStages; S++) { for (int C = 0; C < II; C++) { @@ -215,9 +237,21 @@ void Z3Solver::cycles() { } } -bool Z3Solver::genLatencyConstraint(const Latency &L) { - std::cout << "Add " << L.Src << " --> " << L.Dst << " L=" << L.Lat - << " D=" << L.Dist << "\n"; +std::vector Z3Solver::getCycles() { + z3::model M = Solver.get_model(); + std::vector Cycles; + for (auto &C : CycleExprs) { + auto Val = M.eval(C); + int IntVal; + Z3_get_numeral_int(Context, Val, &IntVal); + Cycles.push_back(IntVal); + } + return Cycles; +} + +bool 
Z3BinarySolver::genLatencyConstraint(const Latency &L) { + LLVM_DEBUG(dbgs() << "Add " << L.Src << " --> " << L.Dst << " L=" << L.Lat + << " D=" << L.Dist << "\n"); z3::expr Distance = Context.int_val(L.Lat - L.Dist * II); Solver.add((CycleExprs[L.Dst] - CycleExprs[L.Src]) >= Distance); #if 0 @@ -229,7 +263,7 @@ bool Z3Solver::genLatencyConstraint(const Latency &L) { return true; } -bool Z3Solver::genSlotConstraint(int SlotNo, const Slot &Slot) { +bool Z3BinarySolver::genSlotConstraint(int SlotNo, const Slot &Slot) { for (int C = 0; C < II; C++) { z3::expr_vector Elements(Context); for (int I : Slot.Instructions) { @@ -242,7 +276,7 @@ bool Z3Solver::genSlotConstraint(int SlotNo, const Slot &Slot) { return true; } -void Z3Solver::genConflict(int M, int N) { +void Z3BinarySolver::genConflict(int M, int N) { z3::expr_vector Elements(Context); // All stages have a contribution to a particular cycle) for (int S = 0; S < NumStages; S++) { @@ -254,47 +288,7 @@ void Z3Solver::genConflict(int M, int N) { Solver.add(z3::atmost(Elements, 1)); } -bool Z3Solver::genModel() { - vars(); - slots(); - scheduled(); - cycles(); - latencies(); - conflicts(); - - LLVM_DEBUG(dbgs() << "II=" << II << " NS=" << NumStages << "\n"); - switch (Solver.check()) { - case z3::unsat: - std::cout << "Unsatisfiable\n"; - return false; - - case z3::unknown: - std::cout << "Unknown\n"; - return false; - default: - return true; - } -} - -std::vector Z3Solver::getCycles() { - z3::model M = Solver.get_model(); - std::cout << "Satisfied by:\n"; - // All the information is in the variables that are true - for (auto &V : VarDecls) { - if (V && M.eval(*V).is_true()) { - std::cout << V->decl().name() << "\n"; - } - } - std::vector Cycles; - for (auto &C : CycleExprs) { - auto Val = M.eval(C); - int IntVal; - Z3_get_numeral_int(Context, Val, &IntVal); - Cycles.push_back(IntVal); - } - return Cycles; -} void Z3IntegerSolver::vars() { // Create the stage and cycle vars diff --git 
a/llvm/lib/Target/AIE/AIESWPSolver.h b/llvm/lib/Target/AIE/AIESWPSolver.h index 26bd12e8eb01..0d2f5f516915 100644 --- a/llvm/lib/Target/AIE/AIESWPSolver.h +++ b/llvm/lib/Target/AIE/AIESWPSolver.h @@ -72,7 +72,6 @@ class SWPSolver { // associated instruction bool feasible(int I, int S, int C); - virtual bool genModel() = 0; // Generate the slot constraint for the given slot virtual bool genSlotConstraint(int SlotNo, const Slot &Slot) = 0; // Generate a constraint that represents a dependence latency @@ -95,20 +94,41 @@ class SWPSolver { void addLatency(int Src, int Dst, int Latency, int Distance = 0); // Set the desired schedule size in terms of II and number of stages void setScheduleSize(int II, int NS); + virtual bool genModel() = 0; }; -// In the binary formulation, we have a lot of binary variables, -// but relatively few, though elaborate constraints. -class Z3Solver : public SWPSolver { - std::vector> VarDecls; +class Z3Solver : public SWPSolver { protected: z3::context Context; z3::solver Solver; + // Frequently used subexpression z3::expr Zero; - - // expression for the cycle of each instruction. + // Expressions for the cycle of each instruction. std::vector CycleExprs; + virtual ~Z3Solver() = default; + + // generate the variables for each instruction + virtual void vars() = 0; + // generate the constraint that all instructions must be scheduled + virtual void scheduled() = 0; + + // Return a z3 expression that represents the cycle of an instruction + // in the linear schedule. + virtual z3::expr genCycle(int I) = 0; + + // generate expressions for the cycle of each instruction + void cycles(); +public: + Z3Solver(); + bool genModel() override; + std::vector getCycles() override; +}; + +// In the binary formulation, we have a lot of binary variables, +// but relatively few, though elaborate constraints. 
+class Z3BinarySolver : public Z3Solver { + std::vector> VarDecls; std::string varname(int N, int S, int C); std::optional vardecl(int N, int S, int C); @@ -116,36 +136,26 @@ class Z3Solver : public SWPSolver { // generate an expression that represents I running in modulo cycle C z3::expr genModuloCycle(int I, int C); - // Return a z3 expression that represents the cycle of an instruction - // in the linear schedule. - virtual z3::expr genCycle(int I); - // If it exists, add a schedule variable declaration to Elements void addVar(int N, int S, int C, z3::expr_vector &Elements); - // generate the variables for each instruction - virtual void vars(); - // generate the constraint that all instructions must be scheduled - virtual void scheduled(); - // generate expressions for the cycle of each instruction - virtual void cycles(); // generate the constraint that only one instance of Slot is present in // cycle C bool genSlotConstraint(int SlotNo, const Slot &Slot) override; bool genLatencyConstraint(const Latency &L) override; void genConflict(int M, int N) override; + z3::expr genCycle(int I) override; + void vars() override; + void scheduled() override; public: - Z3Solver(); - bool genModel() override; - std::vector getCycles() override; + Z3BinarySolver() = default; }; -// In the integer formulation, we have an integer stage and cycle variable -// for each instruction. +// In the integer formulation, we have an integer stage and an integer cycle +// variable for each instruction. 
// Constraints will be pretty compact, but we will have many of them, // roughly quadratic in the number of instructions -// FIXME: subclass current Z3Solver as Z3BinarySolver class Z3IntegerSolver : public Z3Solver { std::vector StageVarDecls; std::vector CycleVarDecls; @@ -154,6 +164,8 @@ class Z3IntegerSolver : public Z3Solver { z3::expr genCycle(int N) override; void genConflict(int M, int N) override; bool genSlotConstraint(int SlotNo, const Slot &Slot) override; +public: + Z3IntegerSolver() = default; }; class LPFile : public SWPSolver { From d0f207c8afc2e668eca3cdb61ead509a1cec72d2 Mon Sep 17 00:00:00 2001 From: Martien de Jong Date: Tue, 14 Jan 2025 16:07:33 +0100 Subject: [PATCH 3/4] don't reset UseSolver --- llvm/lib/Target/AIE/AIEMachineScheduler.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/llvm/lib/Target/AIE/AIEMachineScheduler.cpp b/llvm/lib/Target/AIE/AIEMachineScheduler.cpp index 36a22b6544ea..57fa9ed2ab52 100644 --- a/llvm/lib/Target/AIE/AIEMachineScheduler.cpp +++ b/llvm/lib/Target/AIE/AIEMachineScheduler.cpp @@ -1394,7 +1394,6 @@ void AIEScheduleDAGMI::schedule() { BS.setPipelined(); LLVM_DEBUG(PostSWP.dump()); } - PostSWP.setUseSolver(false); return; } default: From cdd02e0744bc26b7e38736c039ea9bf20f62753d Mon Sep 17 00:00:00 2001 From: Martien de Jong Date: Wed, 15 Jan 2025 10:30:35 +0100 Subject: [PATCH 4/4] [AIE] Protect against empty sets Put in some runtime control, a real one based on timeouts, and a deterministic one pre-deciding based on estimated milliseconds --- llvm/lib/Target/AIE/AIEPostPipeliner.cpp | 13 +- llvm/lib/Target/AIE/AIEPostPipeliner.h | 2 +- llvm/lib/Target/AIE/AIESWPSolver.cpp | 197 +++++++++++++----- llvm/lib/Target/AIE/AIESWPSolver.h | 72 ++++++- .../AIE/aie2/schedule/postpipeliner/crash.mir | 6 +- .../AIE/aie2/schedule/postpipeliner/round.mir | 44 ++-- .../aie2/schedule/postpipeliner/small-II.mir | 14 +- 7 files changed, 243 insertions(+), 105 deletions(-) diff --git a/llvm/lib/Target/AIE/AIEPostPipeliner.cpp 
b/llvm/lib/Target/AIE/AIEPostPipeliner.cpp index 24b48c48d6a1..6f4d2bf611f0 100644 --- a/llvm/lib/Target/AIE/AIEPostPipeliner.cpp +++ b/llvm/lib/Target/AIE/AIEPostPipeliner.cpp @@ -676,7 +676,7 @@ bool PostPipeliner::tryHeuristics() { DEBUG_SUMMARY(dbgs() << "-- MinLength=" << MinLength << "\n"); - if (II <= 10 && solve()) { + if (solve(MinLength / II)) { return true; } @@ -752,13 +752,12 @@ class FixedStrategy : public PostPipelinerStrategy { std::string name() override { return "FixedStrategy"; } }; -bool PostPipeliner::solve() { +bool PostPipeliner::solve(int NS) { if (!UseSolver) { return false; } Z3BinarySolver Solver; - Solver.setScheduleSize(II, 3); for (int N = 0; N < NInstr; N++) { SUnit &SU = DAG->SUnits[N]; auto *MI = SU.getInstr(); @@ -786,7 +785,7 @@ bool PostPipeliner::solve() { } } - // Add loop-carried true dependences to future iterations. The iteration + // Add loop-carried dependences to future iterations. The iteration // distance is taken into account for (int N = 0; N < NInstr; N++) { SUnit &SU = DAG->SUnits[N]; @@ -801,7 +800,11 @@ bool PostPipeliner::solve() { } } - if (!Solver.genModel()) { + Solver.setScheduleSize(II, NS); + Solver.genModel(); + if (!Solver.solveModel()) { + // Note: If we can't solve it, it doesn't mean the II isn't feasible, + // so we don't need to avoid running the heuristics. return false; } auto Schedule = Solver.getCycles(); diff --git a/llvm/lib/Target/AIE/AIEPostPipeliner.h b/llvm/lib/Target/AIE/AIEPostPipeliner.h index 92eaef899934..a3a8c921f4cf 100644 --- a/llvm/lib/Target/AIE/AIEPostPipeliner.h +++ b/llvm/lib/Target/AIE/AIEPostPipeliner.h @@ -190,7 +190,7 @@ class PostPipeliner { int computeMinScheduleLength() const; // try to find a solution using a solver - bool solve(); + bool solve(int NS); /// Try all heuristics, stop at the first that fits the II /// If it returns true, a valid schedule is laid down in Info. 
diff --git a/llvm/lib/Target/AIE/AIESWPSolver.cpp b/llvm/lib/Target/AIE/AIESWPSolver.cpp index fd79c8716c7e..81af56bd1e4d 100644 --- a/llvm/lib/Target/AIE/AIESWPSolver.cpp +++ b/llvm/lib/Target/AIE/AIESWPSolver.cpp @@ -12,6 +12,7 @@ //===----------------------------------------------------------------------===// #include "AIESWPSolver.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" @@ -20,6 +21,30 @@ namespace llvm::AIE::Solver { +static cl::opt AllowedSolverTime( + "aie-postpipeliner-solver-timeout", + cl::desc("Specify the timeout value for the solver in ms"), cl::init(2000), + cl::Hidden); + +// Time control is safe, but it will depend on computer performance and +// load whether we find a solution. With this option set, we will estimate +// the time deterministically and use the timeout to call the solver or not. +static cl::opt + DeterministicSolver("aie-postpipeliner-deterministic-solver", + cl::desc("Make the solver behave deterministically"), + cl::init(true), cl::Hidden); + +ProblemSize::ProblemSize() { + for (int C = 0; C < ProblemSize::NumSizeComponents; C++) { + Counts[C] = 0; + } +} +void ProblemSize::dump() { + for (int C = 0; C < ProblemSize::NumSizeComponents; C++) { + dbgs() << Counts[C] << ", "; + } +} + SWPSolver::~SWPSolver() {} Slot &SWPSolver::addSlot(int N) { @@ -56,8 +81,8 @@ void SWPSolver::setHeight(int I, int H) { } bool SWPSolver::feasible(int I, int S, int C) { - int Cycle = S * II + C; - int Length = NumStages * II; + int Cycle = S * getII() + C; + int Length = NumStages * getII(); return Cycle >= getDepth(I) && Cycle < Length - getHeight(I); } @@ -74,9 +99,9 @@ void SWPSolver::addLatency(int Src, int Dst, int Latency, int Distance) { setHeight(Src, getHeight(Dst) + Latency); } -void SWPSolver::setScheduleSize(int I, int S) { +void SWPSolver::setScheduleSize(int I, int NS) { II = I; - NumStages = S; + NumStages = NS; } void 
SWPSolver::slots() { @@ -121,7 +146,7 @@ void SWPSolver::latencies() { if (A.Dst != B.Dst) { return A.Dst < B.Dst; } - return (A.Lat - A.Dist * II) > (B.Lat - B.Dist * II); + return (A.Lat - A.Dist * getII()) > (B.Lat - B.Dist * getII()); }); // The latencies between a given pair of nodes are now adjacent @@ -130,7 +155,8 @@ void SWPSolver::latencies() { std::optional Prev; for (const auto &L : Latencies) { if (Prev && L.Src == Prev->Src && L.Dst == Prev->Dst) { - dbgs() << "Skip latency constraint\n"; + LLVM_DEBUG(dbgs() << "Skip latency constraint " << L.Src << " -> " + << L.Dst << " (" << L.Lat << ")\n"); continue; } if (!genLatencyConstraint(L)) { @@ -140,26 +166,76 @@ void SWPSolver::latencies() { } } -Z3Solver::Z3Solver() : Solver(Context), Zero(Context.int_val(0)) {} +Z3Solver::Z3Solver() : Solver(Context), Zero(Context.int_val(0)) { + // timeout behaves nondeterministically + if (!DeterministicSolver) { + Z3_params Params = Z3_mk_params(Context); + Z3_params_set_uint(Context, Params, Z3_mk_string_symbol(Context, "timeout"), + AllowedSolverTime); + Z3_solver_set_params(Context, Solver, Params); + } +} + +void Z3Solver::atMost(const z3::expr_vector &Elements, int Limit) { + Solver.add(atmost(nonempty(Elements), Limit)); +} + +void Z3Solver::cycles() { + for (auto &[N, I] : Instructions) { + CycleExprs.push_back(genCycle(N)); + } +} + +std::vector Z3Solver::getCycles() { + z3::model M = Solver.get_model(); + std::vector Cycles; + for (auto &C : CycleExprs) { + auto Val = M.eval(C); + int IntVal; + Z3_get_numeral_int(Context, Val, &IntVal); + Cycles.push_back(IntVal); + } + return Cycles; +} + +void Z3Solver::genModel() { + + // Compute the stage count to make every instruction's interval fit + int Length = 0; + for (auto &[N, I] : Instructions) { + LLVM_DEBUG(dbgs() << I.Depth << " + " << I.Height << "\n"); + Length = std::max(Length, I.Depth + I.Height + 1); + } + const int II = getII(); + NumStages = std::max(NumStages, (Length + II - 1) / II); + + 
LLVM_DEBUG(dbgs() << "Minimum stage count = " << NumStages << "\n"); -bool Z3Solver::genModel() { vars(); slots(); scheduled(); cycles(); latencies(); conflicts(); +} - LLVM_DEBUG(dbgs() << "II=" << II << " NS=" << NumStages << "\n"); +bool Z3Solver::solveModel() { + if (DeterministicSolver) { + const int MilliSeconds = estimateSolverTime(Sizes); + if (MilliSeconds > AllowedSolverTime) { + return false; + } + } + LLVM_DEBUG(dbgs() << "Solving for II=" << getII() << " NS=" << NumStages); switch (Solver.check()) { case z3::unsat: - LLVM_DEBUG(dbgs() << "Unsatisfiable\n"); + LLVM_DEBUG(dbgs() << ": Unsatisfiable\n"); return false; case z3::unknown: - LLVM_DEBUG(dbgs() << "Unknown\n"); + LLVM_DEBUG(dbgs() << ": Unknown\n"); return false; default: - LLVM_DEBUG(dbgs() << "Solved\n"); + LLVM_DEBUG(dbgs() << ": Solved\n"); return true; } } @@ -175,11 +251,12 @@ std::string simpleName(const char *Prefix, int N) { } std::optional Z3BinarySolver::vardecl(int I, int S, int C) { - return VarDecls[(I * NumStages + S) * II + C]; + return VarDecls[(I * NumStages + S) * getII() + C]; } void Z3BinarySolver::vars() { - // Create the scheduled vars + const int II = getII(); + // Create the scheduled vars. 
for (auto &[N, I] : Instructions) { for (int S = 0; S < NumStages; S++) { for (int C = 0; C < II; C++) { @@ -187,6 +264,7 @@ void Z3BinarySolver::vars() { if (feasible(N, S, C)) { OptVar = Context.bool_const(varname(N, S, C).c_str()); } + Sizes.oneMore(ProblemSize::NVariables); VarDecls.push_back(OptVar); } } @@ -196,12 +274,13 @@ void Z3BinarySolver::scheduled() { for (auto &[N, I] : Instructions) { z3::expr_vector Elements(Context); for (int S = 0; S < NumStages; S++) { - for (int C = 0; C < II; C++) { + for (int C = 0; C < getII(); C++) { addVar(N, S, C, Elements); } } - Solver.add(z3::atmost(Elements, 1)); - Solver.add(z3::atleast(Elements, 1)); + atMost(Elements, 1); + Solver.add(z3::atleast(nonempty(Elements), 1)); + Sizes.oneMore(ProblemSize::NInstrConstraints); } } @@ -215,12 +294,12 @@ void Z3BinarySolver::addVar(int N, int S, int C, z3::expr_vector &Elements) { z3::expr Z3BinarySolver::genCycle(int I) { z3::expr_vector Elements(Context); for (int S = 0; S < NumStages; S++) { - for (int C = 0; C < II; C++) { + for (int C = 0; C < getII(); C++) { std::optional Var = vardecl(I, S, C); if (!Var) { continue; } - int LinearCycle = S * II + C; + int LinearCycle = S * getII() + C; if (!LinearCycle) { continue; } @@ -228,32 +307,20 @@ z3::expr Z3BinarySolver::genCycle(int I) { Elements.push_back(z3::ite(*Var, Factor, Zero)); } } - return z3::sum(Elements); -} - -void Z3Solver::cycles() { - for (auto &[N, I] : Instructions) { - CycleExprs.push_back(genCycle(N)); + if (Elements.empty()) { + // Z3 can't deal with the sum of zero terms + return Zero; } -} - -std::vector Z3Solver::getCycles() { - z3::model M = Solver.get_model(); - std::vector Cycles; - for (auto &C : CycleExprs) { - auto Val = M.eval(C); - int IntVal; - Z3_get_numeral_int(Context, Val, &IntVal); - Cycles.push_back(IntVal); - } - return Cycles; + return z3::sum(Elements); } bool Z3BinarySolver::genLatencyConstraint(const Latency &L) { LLVM_DEBUG(dbgs() << "Add " << L.Src << " --> " << L.Dst << " L=" << 
L.Lat << " D=" << L.Dist << "\n"); - z3::expr Distance = Context.int_val(L.Lat - L.Dist * II); + z3::expr Distance = Context.int_val(L.Lat - L.Dist * getII()); Solver.add((CycleExprs[L.Dst] - CycleExprs[L.Src]) >= Distance); + Sizes.oneMore(L.Dist ? ProblemSize::NLCDLatencyConstraints + : ProblemSize::NLatencyConstraints); #if 0 if (Solver.check() != z3::sat) { std::cout << " FAILED\n"; @@ -264,14 +331,19 @@ bool Z3BinarySolver::genLatencyConstraint(const Latency &L) { } bool Z3BinarySolver::genSlotConstraint(int SlotNo, const Slot &Slot) { - for (int C = 0; C < II; C++) { + for (int C = 0; C < getII(); C++) { z3::expr_vector Elements(Context); for (int I : Slot.Instructions) { for (int S = 0; S < NumStages; S++) { addVar(I, S, C, Elements); } } - Solver.add(z3::atmost(Elements, 1)); + // Some cycles may not be feasible for all instructions in a slot + if (Elements.empty()) { + continue; + } + Sizes.oneMore(ProblemSize::NSlotConstraints); + atMost(Elements, 1); } return true; } @@ -279,16 +351,32 @@ bool Z3BinarySolver::genSlotConstraint(int SlotNo, const Slot &Slot) { void Z3BinarySolver::genConflict(int M, int N) { z3::expr_vector Elements(Context); // All stages have a contribution to a particular cycle) - for (int S = 0; S < NumStages; S++) { - for (int C = 0; C < II; C++) { + for (int C = 0; C < getII(); C++) { + z3::expr_vector Elements(Context); + for (int S = 0; S < NumStages; S++) { addVar(M, S, C, Elements); addVar(N, S, C, Elements); } + if (Elements.empty()) { + continue; + } + Sizes.oneMore(ProblemSize::NConflicts); + atMost(Elements, 1); } - Solver.add(z3::atmost(Elements, 1)); } - +int Z3BinarySolver::estimateSolverTime(ProblemSize Counts) const { + uint64_t Rows = Counts[ProblemSize::NSlotConstraints]; + Rows += Counts[ProblemSize::NLatencyConstraints]; + Rows += Counts[ProblemSize::NLCDLatencyConstraints]; + Rows += 2 * Counts[ProblemSize::NInstrConstraints]; + Rows += Counts[ProblemSize::NConflicts]; + uint64_t Columns = 
Counts[ProblemSize::NVariables]; + double Score = .02 * Rows * Columns; + LLVM_DEBUG(dbgs() << "Counts: "; Counts.dump(); dbgs() << "\n"); + LLVM_DEBUG(dbgs() << "EstimatedTime=" << Score << "ms\n"); + return Score; +} void Z3IntegerSolver::vars() { // Create the stage and cycle vars @@ -308,12 +396,12 @@ void Z3IntegerSolver::scheduled() { Solver.add(SV < NumStages); z3::expr &CV = CycleVarDecls[N]; Solver.add(CV >= 0); - Solver.add(CV < II); + Solver.add(CV < getII()); } } z3::expr Z3IntegerSolver::genCycle(int N) { - return StageVarDecls[N] * II + CycleVarDecls[N]; + return StageVarDecls[N] * getII() + CycleVarDecls[N]; } void Z3IntegerSolver::genConflict(int M, int N) { @@ -332,15 +420,12 @@ bool Z3IntegerSolver::genSlotConstraint(int SlotNo, const Slot &Slot) { return true; } -bool LPFile::genModel() { +void LPFile::genModel() { printf("max: ;\n"); scheduled(); conflicts(); latencies(); vars(); - - // Can not extract results without calling external solver. - return false; } // Each instruction needs to be scheduled in some cycle in some stage. 
@@ -349,7 +434,7 @@ void LPFile::scheduled() { printf("// scheduled I%d constraint\n", I); const char *Plus = ""; for (int S = 0; S < NumStages; S++) { - for (int C = 0; C < II; C++) { + for (int C = 0; C < getII(); C++) { if (!feasible(I, S, C)) { continue; } @@ -368,7 +453,7 @@ void LPFile::vars() { const char *Comma = ""; printf("sos "); for (int S = 0; S < NumStages; S++) { - for (int C = 0; C < II; C++) { + for (int C = 0; C < getII(); C++) { if (!feasible(I, S, C)) { continue; } @@ -383,11 +468,11 @@ void LPFile::vars() { // Generate the expression for the (linear) cycle of an instruction void LPFile::genCycle(int I, int Sign) { for (int S = 0; S < NumStages; S++) { - for (int C = 0; C < II; C++) { + for (int C = 0; C < getII(); C++) { if (!feasible(I, S, C)) { continue; } - int Factor = Sign * (S * II + C); + int Factor = Sign * (S * getII() + C); if (!Factor) { continue; } @@ -399,7 +484,7 @@ void LPFile::genCycle(int I, int Sign) { // Only one occurrence of each slot in each cycle bool LPFile::genSlotConstraint(int SlotNo, const Slot &Slot) { - for (int C = 0; C < II; C++) { + for (int C = 0; C < getII(); C++) { printf("\n// Slot %d, Cycle %d\n", SlotNo, C); const char *Plus = ""; for (int I : Slot.Instructions) { @@ -417,7 +502,7 @@ bool LPFile::genLatencyConstraint(const Latency &L) { printf("// Lat(%d -> %d) = %d (%d)\n", L.Src, L.Dst, L.Lat, L.Dist); genCycle(L.Dst, 1); genCycle(L.Src, -1); - printf(" >= %d;\n", L.Lat - L.Dist * II); + printf(" >= %d;\n", L.Lat - L.Dist * getII()); return true; } diff --git a/llvm/lib/Target/AIE/AIESWPSolver.h b/llvm/lib/Target/AIE/AIESWPSolver.h index 0d2f5f516915..03424de0fc90 100644 --- a/llvm/lib/Target/AIE/AIESWPSolver.h +++ b/llvm/lib/Target/AIE/AIESWPSolver.h @@ -45,15 +45,41 @@ class Instruction { : TheSlot(S), MemoryBanks(MemoryBanks) {} }; +class ProblemSize { +public: + ProblemSize(); + enum SizeComponent { + NVariables, + NInstrConstraints, + NSlotConstraints, + NConflicts, + NLatencyTerms, + 
NLatencyConstraints, + NLCDLatencyConstraints, + NumSizeComponents + }; + int Counts[NumSizeComponents]; + const int &operator[](SizeComponent I) const { return Counts[I]; } + int &operator[](SizeComponent I) { return Counts[I]; } + void oneMore(SizeComponent C) { Counts[C]++; } + void dump(); +}; + class SWPSolver { + int II = 0; + protected: - int II = 1; - int NumStages = 1; + int NumStages; int NSlots; std::map Slots; std::map Instructions; std::vector Latencies; + int getII() const { + assert(II > 0); + return II; + } + // Add a slot to the problem Slot &addSlot(int N); // Generate the latency constraints @@ -73,8 +99,8 @@ class SWPSolver { bool feasible(int I, int S, int C); // Generate the slot constraint for the given slot - virtual bool genSlotConstraint(int SlotNo, const Slot &Slot) = 0; // Generate a constraint that represents a dependence latency + virtual bool genSlotConstraint(int SlotNo, const Slot &Slot) = 0; virtual bool genLatencyConstraint(const Latency &L) = 0; // Generate a mutual exclusion constraint for instructions M and N in any // cycle @@ -92,9 +118,12 @@ class SWPSolver { // Distance represents the iteration distance, i.e. the number of // cfg backedges it spans. void addLatency(int Src, int Dst, int Latency, int Distance = 0); - // Set the desired schedule size in terms of II and number of stages - void setScheduleSize(int II, int NS); - virtual bool genModel() = 0; + // Set the desired schedule size in terms of II and stage count + void setScheduleSize(int I, int NS = 2); + // Generate the model. + virtual void genModel() = 0; + // Call the solver on the model. Return whether it was satisfiable. + virtual bool solveModel() = 0; }; @@ -106,22 +135,42 @@ class Z3Solver : public SWPSolver { z3::expr Zero; // Expressions for the cycle of each instruction. 
std::vector CycleExprs; + // Problem size parameters + ProblemSize Sizes; virtual ~Z3Solver() = default; - // generate the variables for each instruction + // Generate the variables for each instruction. As a side effect, it + // computes the stage count as the lower bound for a feasible solution virtual void vars() = 0; - // generate the constraint that all instructions must be scheduled + // Generate the constraint that all instructions must be scheduled virtual void scheduled() = 0; // Return a z3 expression that represents the cycle of an instruction // in the linear schedule. virtual z3::expr genCycle(int I) = 0; - // generate expressions for the cycle of each instruction + // Generate expressions for the cycle of each instruction. void cycles(); + + // Convenience functions to protect against empty sets. + static const z3::expr_vector &nonempty(const z3::expr_vector &Elements) { + assert(!Elements.empty()); + return Elements; + } + + void atMost(const z3::expr_vector &Elements, int Limit); + + // Return an estimate of the solver time in milliseconds + virtual int estimateSolverTime(ProblemSize Counts) const { + const int SecondsPerMinute = 60; + const int MillisecondsPerSecond = 1000; + return 20 * SecondsPerMinute * MillisecondsPerSecond; + } + public: Z3Solver(); - bool genModel() override; + void genModel() override; + bool solveModel() override; std::vector getCycles() override; }; @@ -147,6 +196,7 @@ class Z3BinarySolver : public Z3Solver { z3::expr genCycle(int I) override; void vars() override; void scheduled() override; + int estimateSolverTime(ProblemSize Counts) const override; public: Z3BinarySolver() = default; @@ -173,7 +223,7 @@ class LPFile : public SWPSolver { const char *const VarFmt = "%sV%d_%d_%d_%d"; public: - bool genModel() override; + void genModel() override; void scheduled(); void vars(); void genCycle(int I, int Sign); diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/crash.mir 
b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/crash.mir index a5dae2d34a2a..655cdee89a7a 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/crash.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/crash.mir @@ -29,13 +29,13 @@ ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_1: // %for.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 - ; CHECK-NEXT: nopb ; lda r0, [p2, #0]; nops ; nopx ; mov p2, p1; nopv - ; CHECK-NEXT: nopa ; nopx - ; CHECK-NEXT: nop + ; CHECK-NEXT: nopb ; lda r0, [p2, #0]; nops ; nopxm ; nopv + ; CHECK-NEXT: nopx ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop + ; CHECK-NEXT: mov p2, p1 ; CHECK-NEXT: .L_LEnd0: ; CHECK-NEXT: nopb ; nopa ; st r0, [p0, #0]; nopxm ; nopv ; CHECK-NEXT: // %bb.2: // %for.cond.cleanup diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/round.mir b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/round.mir index 2cee43297f55..58ddcfb836a9 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/round.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/round.mir @@ -34,47 +34,47 @@ ; CHECK-NEXT: nop // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 ; CHECK-NEXT: // %bb.1: // %for.body.preheader - ; CHECK-NEXT: vlda.ups.s32.s8 cm0, s0, [p0], #32; nopxm - ; CHECK-NEXT: nop + ; CHECK-NEXT: vlda.ups.s32.s8 cm0, s0, [p0], #32; nopb ; nopxm ; CHECK-NEXT: nop ; CHECK-NEXT: vlda.ups.s32.s8 cm1, s0, [p0], #32 + ; CHECK-NEXT: nop ; CHECK-NEXT: vlda.ups.s32.s8 cm0, s0, [p0], #32 ; CHECK-NEXT: nop - ; CHECK-NEXT: add.nc lc, r0, #-4 - ; CHECK-NEXT: vlda.ups.s32.s8 cm1, s0, [p0], #32; movxm ls, #.LBB0_2 + ; CHECK-NEXT: vlda.ups.s32.s8 cm1, s0, [p0], #32; add.nc lc, r0, #-4 + ; CHECK-NEXT: movxm ls, #.LBB0_2 ; CHECK-NEXT: vlda.ups.s32.s8 cm0, s0, [p0], #32; movxm le, #.L_LEnd0 ; CHECK-NEXT: nopb ; nopa ; vsrs.s8.s32 wh0, cm0, s1; nopxm ; nopv - ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv ; CHECK-NEXT: nopb ; vlda.ups.s32.s8 cm1, s0, [p0], #32; nops ; 
nopxm ; nopv - ; CHECK-NEXT: nopb ; vlda.ups.s32.s8 cm0, s0, [p0], #32; vsrs.s8.s32 wh2, cm1, s1; nopxm ; nopv - ; CHECK-NEXT: nopb ; nopa ; vsrs.s8.s32 wh0, cm0, s1; nopxm ; nopv - ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv - ; CHECK-NEXT: nopb ; vlda.ups.s32.s8 cm1, s0, [p0], #32; nops ; nopx ; vups.s32.s8 cm2, wh0, s1; nopv + ; CHECK-NEXT: nopb ; nopa ; vsrs.s8.s32 wh2, cm1, s1; nopxm ; nopv + ; CHECK-NEXT: nopb ; vlda.ups.s32.s8 cm0, s0, [p0], #32; nops ; nopxm ; nopv + ; CHECK-NEXT: nopb ; nopa ; vsrs.s8.s32 wh0, cm0, s1; nopx ; vups.s32.s8 cm2, wh0, s1; nopv + ; CHECK-NEXT: nopb ; vlda.ups.s32.s8 cm1, s0, [p0], #32; nops ; nopxm ; nopv + ; CHECK-NEXT: nopb ; nopa ; vsrs.s8.s32 wh2, cm1, s1; nopx ; vups.s32.s8 cm3, wh2, s1; nopv ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_2: // %for.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 - ; CHECK-NEXT: nopb ; vlda.ups.s32.s8 cm0, s0, [p0], #32; vsrs.s8.s32 wh2, cm1, s1; nopx ; vups.s32.s8 cm3, wh2, s1; nopv - ; CHECK-NEXT: nopa ; nopb ; nopx ; vsrs.s8.s32 wh0, cm0, s1 - ; CHECK-NEXT: vst.srs.s8.s32 cm2, s0, [p1], #32 + ; CHECK-NEXT: nopb ; vlda.ups.s32.s8 cm0, s0, [p0], #32; vst.srs.s8.s32 cm2, s0, [p1], #32; nopxm ; nopv + ; CHECK-NEXT: vsrs.s8.s32 wh0, cm0, s1; nopx ; vups.s32.s8 cm2, wh0, s1 + ; CHECK-NEXT: vlda.ups.s32.s8 cm1, s0, [p0], #32; vst.srs.s8.s32 cm3, s0, [p1], #32 ; CHECK-NEXT: .L_LEnd0: - ; CHECK-NEXT: nopb ; vlda.ups.s32.s8 cm1, s0, [p0], #32; vst.srs.s8.s32 cm3, s0, [p1], #32; nopx ; vups.s32.s8 cm2, wh0, s1; nopv - ; CHECK-NEXT: // %bb.3: // %for.cond.cleanup ; CHECK-NEXT: nopb ; nopa ; vsrs.s8.s32 wh2, cm1, s1; nopx ; vups.s32.s8 cm3, wh2, s1; nopv - ; CHECK-NEXT: vsrs.s8.s32 wh0, cm0, s1; nopb ; nopx - ; CHECK-NEXT: vst.srs.s8.s32 cm2, s0, [p1], #32 - ; CHECK-NEXT: vst.srs.s8.s32 cm3, s0, [p1], #32; vups.s32.s8 cm2, wh0, s1 + ; CHECK-NEXT: // %bb.3: // %for.cond.cleanup + ; CHECK-NEXT: vst.srs.s8.s32 cm2, s0, [p1], #32; nopb ; nopx + ; CHECK-NEXT: vsrs.s8.s32 wh0, cm0, s1; 
vups.s32.s8 cm2, wh0, s1 + ; CHECK-NEXT: vst.srs.s8.s32 cm3, s0, [p1], #32 ; CHECK-NEXT: vsrs.s8.s32 wh2, cm1, s1; vups.s32.s8 cm3, wh2, s1 - ; CHECK-NEXT: vsrs.s8.s32 wh0, cm0, s1 ; CHECK-NEXT: vst.srs.s8.s32 cm2, s0, [p1], #32 - ; CHECK-NEXT: vst.srs.s8.s32 cm3, s0, [p1], #32; vups.s32.s8 cm2, wh0, s1 + ; CHECK-NEXT: vsrs.s8.s32 wh0, cm0, s1; vups.s32.s8 cm2, wh0, s1 + ; CHECK-NEXT: vst.srs.s8.s32 cm3, s0, [p1], #32 ; CHECK-NEXT: vsrs.s8.s32 wh2, cm1, s1; vups.s32.s8 cm3, wh2, s1 - ; CHECK-NEXT: nop ; CHECK-NEXT: vst.srs.s8.s32 cm2, s0, [p1], #32 - ; CHECK-NEXT: vst.srs.s8.s32 cm3, s0, [p1], #32; vups.s32.s8 cm2, wh0, s1 + ; CHECK-NEXT: vups.s32.s8 cm2, wh0, s1 + ; CHECK-NEXT: vst.srs.s8.s32 cm3, s0, [p1], #32 ; CHECK-NEXT: vups.s32.s8 cm3, wh2, s1 - ; CHECK-NEXT: nop ; CHECK-NEXT: vst.srs.s8.s32 cm2, s0, [p1], #32 + ; CHECK-NEXT: nop ; CHECK-NEXT: vst.srs.s8.s32 cm3, s0, [p1], #32 + ; CHECK-NEXT: nop ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_4: // %for.cond.cleanup ; CHECK-NEXT: nopa ; ret lr diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/small-II.mir b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/small-II.mir index dd4da83e4766..2ec3a8811bc1 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/small-II.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/small-II.mir @@ -37,24 +37,24 @@ ; CHECK-NEXT: nopb ; lda r0, [p0], #4; nops ; nopxm ; nopv ; CHECK-NEXT: nopb ; lda r1, [p0], #4; nops ; nopxm ; nopv ; CHECK-NEXT: nopb ; lda r2, [p0], #4; nops ; nopxm ; nopv - ; CHECK-NEXT: nopb ; lda r3, [p0], #4; st r0, [p1], #4; nopxm ; nopv + ; CHECK-NEXT: nopb ; lda r3, [p0], #4; nops ; nopxm ; nopv ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_2: // %for.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 - ; CHECK-NEXT: nopb ; lda r0, [p0], #4; st r1, [p1], #4; nopxm ; nopv - ; CHECK-NEXT: lda r1, [p0], #4; st r2, [p1], #4; nopx - ; CHECK-NEXT: lda r2, [p0], #4; st r3, [p1], #4 + ; CHECK-NEXT: nopb ; lda r0, [p0], #4; st r0, 
[p1], #4; nopxm ; nopv + ; CHECK-NEXT: lda r1, [p0], #4; st r1, [p1], #4; nopx + ; CHECK-NEXT: lda r2, [p0], #4; st r2, [p1], #4 ; CHECK-NEXT: .L_LEnd0: - ; CHECK-NEXT: nopb ; lda r3, [p0], #4; st r0, [p1], #4; nopxm ; nopv + ; CHECK-NEXT: nopb ; lda r3, [p0], #4; st r3, [p1], #4; nopxm ; nopv ; CHECK-NEXT: // %bb.3: // %for.cond.cleanup - ; CHECK-NEXT: st r1, [p1], #4; nopx + ; CHECK-NEXT: st r0, [p1], #4 + ; CHECK-NEXT: st r1, [p1], #4 ; CHECK-NEXT: st r2, [p1], #4 ; CHECK-NEXT: st r3, [p1], #4 ; CHECK-NEXT: st r0, [p1], #4 ; CHECK-NEXT: st r1, [p1], #4 ; CHECK-NEXT: st r2, [p1], #4 ; CHECK-NEXT: st r3, [p1], #4 - ; CHECK-NEXT: nop ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_4: // %for.cond.cleanup ; CHECK-NEXT: nopa ; ret lr