//===-- SIMachineScheduler.cpp - SI Scheduler Interface -------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// SI Machine Scheduler interface
//
//===----------------------------------------------------------------------===//

#include "SIMachineScheduler.h"
#include "SIInstrInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"

using namespace llvm;

#define DEBUG_TYPE "machine-scheduler"
// This scheduler implements a different scheduling algorithm than
// GenericScheduler.
//
// There are several specific architecture behaviours that can't be modelled
// for GenericScheduler:
// . When accessing the result of an SGPR load instruction, you have to wait
//   for all the SGPR load instructions before your current instruction to
//   have finished.
// . When accessing the result of a VGPR load instruction, you have to wait
//   for all the VGPR load instructions previous to the VGPR load instruction
//   you are interested in to finish.
// . The lower the register pressure, the better load latencies are hidden.
//
// Moreover some specificities (like the fact that a lot of instructions in
// the shader have few dependencies) make the generic scheduler behave
// unpredictably. For example when register pressure becomes high, it can
// either manage to prevent register pressure from going too high, or it can
// increase register pressure even more than if it hadn't taken register
// pressure into account.
//
// Also some other bad behaviours are generated, like loading a constant into
// a VGPR at the beginning of the shader even though it won't be needed until
// the end of the shader.
//
// The scheduling problem for SI can be split into three main parts:
// . Hiding high latencies (texture sampling, etc)
// . Hiding low latencies (SGPR constant loading, etc)
// . Keeping register usage low for better latency hiding and general
//   performance
//
// Some other things can also affect performance, but are hard to predict
// (cache usage, the fact the HW can issue several instructions from different
// wavefronts if of different types, etc)
//
// This scheduler tries to solve the scheduling problem by dividing it into
// simpler sub-problems. It divides the instructions into blocks, schedules
// locally inside the blocks where it takes care of low latencies, and then
// chooses the order of the blocks by taking care of high latencies.
// Dividing the instructions into blocks helps to keep register usage low.
//
// First the instructions are put into blocks.
// We want the blocks to help control register usage and hide high latencies
// later. To help control register usage, we typically want all local
// computations, when for example you create a result that can be consumed
// right away, to be contained in a block. Block inputs and outputs would
// typically be important results that are needed in several locations of
// the shader. Since we do want blocks to help hide high latencies, we want
// the instructions inside the block to have a minimal set of dependencies
// on high latencies. It will make it easy to pick blocks to hide specific
// high latencies.
// The block creation algorithm is divided into several steps, and several
// variants can be tried during the scheduling process.
//
// Second the order of the instructions inside the blocks is chosen.
// At that step we take into account only register usage and hiding
// low latency instructions.
//
// Third the block order is chosen; there we try to hide high latencies
// and keep register usage low.
//
// After the third step, a pass is done to improve the hiding of low
// latencies.
//
// Actually when talking about 'low latency' or 'high latency' it includes
// both the latency to get the cache (or global mem) data to the register,
// and the bandwidth limitations.
// Increasing the number of active wavefronts helps hide the former, but it
// doesn't solve the latter, thus why even if wavefront count is high, we have
// to try to have as many instructions hiding high latencies as possible.
// The OpenCL doc says for example latency of 400 cycles for a global mem
// access, which is hidden by 10 instructions if the wavefront count is 10.
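// As a back-of-the-envelope check of that figure (an estimate from the doc's
// numbers, not a measured value): with 10 wavefronts sharing a SIMD and each
// instruction occupying 4 cycles, a given wavefront issues one instruction
// roughly every 10 * 4 = 40 cycles, so 10 independent instructions between
// the load and its first use cover about 400 cycles.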

// Some figures taken from AMD docs:
// Both texture and constant L1 caches are 4-way associative with 64 bytes
// lines.
// Constant cache is shared with 4 CUs.
// For texture sampling, the address generation unit receives 4 texture
// addresses per cycle, thus we could expect texture sampling latency to be
// equivalent to 4 instructions in the very best case (a VGPR is 64 work items,
// instructions in a wavefront group are executed every 4 cycles),
// or 16 instructions if the other wavefronts associated to the 3 other VALUs
// of the CU do texture sampling too. (Don't take these figures too seriously,
// as I'm not 100% sure of the computation)
// Data exports should get similar latency.
// For constant loading, the cache is shared with 4 CUs.
// The doc says "a throughput of 16B/cycle for each of the 4 Compute Unit"
// I guess if the other CUs don't read the cache, it can go up to 64B/cycle.
// It means a simple s_buffer_load should take one instruction to hide, as
// well as an s_buffer_loadx2 and potentially an s_buffer_loadx8 if on the
// same cache line.
//
// As of today the driver doesn't preload the constants in cache, thus the
// first loads get extra latency. The doc says global memory access can be
// 300-600 cycles. We do not specially take that into account when scheduling,
// as we expect the driver to be able to preload the constants soon.
// common code //

#ifndef NDEBUG

static const char *getReasonStr(SIScheduleCandReason Reason) {
  switch (Reason) {
  case NoCand: return "NOCAND";
  case RegUsage: return "REGUSAGE";
  case Latency: return "LATENCY";
  case Successor: return "SUCCESSOR";
  case Depth: return "DEPTH";
  case NodeOrder: return "ORDER";
  }
  llvm_unreachable("Unknown reason!");
}

#endif

namespace llvm {
namespace SISched {
static bool tryLess(int TryVal, int CandVal,
                    SISchedulerCandidate &TryCand,
                    SISchedulerCandidate &Cand,
                    SIScheduleCandReason Reason) {
  if (TryVal < CandVal) {
    TryCand.Reason = Reason;
    return true;
  }
  if (TryVal > CandVal) {
    if (Cand.Reason > Reason)
      Cand.Reason = Reason;
    return true;
  }
  Cand.setRepeat(Reason);
  return false;
}

static bool tryGreater(int TryVal, int CandVal,
                       SISchedulerCandidate &TryCand,
                       SISchedulerCandidate &Cand,
                       SIScheduleCandReason Reason) {
  if (TryVal > CandVal) {
    TryCand.Reason = Reason;
    return true;
  }
  if (TryVal < CandVal) {
    if (Cand.Reason > Reason)
      Cand.Reason = Reason;
    return true;
  }
  Cand.setRepeat(Reason);
  return false;
}
} // end namespace SISched
} // end namespace llvm
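
// How these helpers are meant to be composed (illustrative sketch only,
// mirroring the real uses further down): each criterion is tried in
// decreasing priority order, and the first one that discriminates wins.
//
//   if (SISched::tryLess(TryCand.SGPRUsage, Cand.SGPRUsage,
//                        TryCand, Cand, RegUsage))
//     return; // Decided on register usage; the Reason field records why.
//   // Tie on this criterion: fall through to the next, lower-priority one.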

// SIScheduleBlock //

void SIScheduleBlock::addUnit(SUnit *SU) {
  NodeNum2Index[SU->NodeNum] = SUnits.size();
  SUnits.push_back(SU);
}

#ifndef NDEBUG
void SIScheduleBlock::traceCandidate(const SISchedCandidate &Cand) {
  dbgs() << " SU(" << Cand.SU->NodeNum << ") " << getReasonStr(Cand.Reason);
  dbgs() << '\n';
}
#endif

void SIScheduleBlock::tryCandidateTopDown(SISchedCandidate &Cand,
                                          SISchedCandidate &TryCand) {
  // Initialize the candidate if needed.
  if (!Cand.isValid()) {
    TryCand.Reason = NodeOrder;
    return;
  }

  if (Cand.SGPRUsage > 60 &&
      SISched::tryLess(TryCand.SGPRUsage, Cand.SGPRUsage,
                       TryCand, Cand, RegUsage))
    return;

  // Schedule low latency instructions as early as possible.
  // Order of priority is:
  // . Low latency instructions which do not depend on other low latency
  //   instructions we haven't waited for
  // . Other instructions which do not depend on low latency instructions
  //   we haven't waited for
  // . Low latencies
  // . All other instructions
  // Goal is to get: low latency instructions - independent instructions
  // - (eventually some more low latency instructions)
  // - instructions that depend on the first low latency instructions.
  // If there are a lot of constant loads in the block, the SGPR usage
  // could go quite high; the arbitrary limit of 60 above encourages using
  // the already loaded constants (in order to release some SGPRs) before
  // loading more.
  if (SISched::tryLess(TryCand.HasLowLatencyNonWaitedParent,
                       Cand.HasLowLatencyNonWaitedParent,
                       TryCand, Cand, SIScheduleCandReason::Depth))
    return;

  if (SISched::tryGreater(TryCand.IsLowLatency, Cand.IsLowLatency,
                          TryCand, Cand, SIScheduleCandReason::Depth))
    return;

  if (TryCand.IsLowLatency &&
      SISched::tryLess(TryCand.LowLatencyOffset, Cand.LowLatencyOffset,
                       TryCand, Cand, SIScheduleCandReason::Depth))
    return;

  if (SISched::tryLess(TryCand.VGPRUsage, Cand.VGPRUsage,
                       TryCand, Cand, RegUsage))
    return;

  // Fall through to original instruction order.
  if (TryCand.SU->NodeNum < Cand.SU->NodeNum) {
    TryCand.Reason = NodeOrder;
  }
}

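// Pick, among the ready SUs, the one tryCandidateTopDown considers best,
// predicting register pressure as if each candidate were issued next.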
SUnit* SIScheduleBlock::pickNode() {
  SISchedCandidate TopCand;

  for (SUnit* SU : TopReadySUs) {
    SISchedCandidate TryCand;
    std::vector<unsigned> pressure;
    std::vector<unsigned> MaxPressure;
    // Predict register usage after this instruction.
    TryCand.SU = SU;
    TopRPTracker.getDownwardPressure(SU->getInstr(), pressure, MaxPressure);
    TryCand.SGPRUsage = pressure[AMDGPU::RegisterPressureSets::SReg_32];
    TryCand.VGPRUsage = pressure[AMDGPU::RegisterPressureSets::VGPR_32];
    TryCand.IsLowLatency = DAG->IsLowLatencySU[SU->NodeNum];
    TryCand.LowLatencyOffset = DAG->LowLatencyOffset[SU->NodeNum];
    TryCand.HasLowLatencyNonWaitedParent =
      HasLowLatencyNonWaitedParent[NodeNum2Index[SU->NodeNum]];
    tryCandidateTopDown(TopCand, TryCand);
    if (TryCand.Reason != NoCand)
      TopCand.setBest(TryCand);
  }

  return TopCand.SU;
}

// Schedule something valid.
void SIScheduleBlock::fastSchedule() {
  TopReadySUs.clear();
  if (Scheduled)
    undoSchedule();

  for (SUnit* SU : SUnits) {
    if (!SU->NumPredsLeft)
      TopReadySUs.push_back(SU);
  }

  while (!TopReadySUs.empty()) {
    SUnit *SU = TopReadySUs[0];
    ScheduledSUnits.push_back(SU);
    nodeScheduled(SU);
  }

  Scheduled = true;
}

// Returns true if the register was set between First and Last.
static bool isDefBetween(unsigned Reg,
                         SlotIndex First, SlotIndex Last,
                         const MachineRegisterInfo *MRI,
                         const LiveIntervals *LIS) {
  for (MachineRegisterInfo::def_instr_iterator
       UI = MRI->def_instr_begin(Reg),
       UE = MRI->def_instr_end(); UI != UE; ++UI) {
    const MachineInstr* MI = &*UI;
    if (MI->isDebugValue())
      continue;
    SlotIndex InstSlot = LIS->getInstructionIndex(*MI).getRegSlot();
    if (InstSlot >= First && InstSlot <= Last)
      return true;
  }
  return false;
}

void SIScheduleBlock::initRegPressure(MachineBasicBlock::iterator BeginBlock,
                                      MachineBasicBlock::iterator EndBlock) {
  IntervalPressure Pressure, BotPressure;
  RegPressureTracker RPTracker(Pressure), BotRPTracker(BotPressure);
  LiveIntervals *LIS = DAG->getLIS();
  MachineRegisterInfo *MRI = DAG->getMRI();
  DAG->initRPTracker(TopRPTracker);
  DAG->initRPTracker(BotRPTracker);
  DAG->initRPTracker(RPTracker);

  // Goes through all SUs. RPTracker captures what had to be alive for the SUs
  // to execute, and what is still alive at the end.
  for (SUnit* SU : ScheduledSUnits) {
    RPTracker.setPos(SU->getInstr());
    RPTracker.advance();
  }

  // Close the RPTracker to finalize live ins/outs.
  RPTracker.closeRegion();

  // Initialize the live ins and live outs.
  TopRPTracker.addLiveRegs(RPTracker.getPressure().LiveInRegs);
  BotRPTracker.addLiveRegs(RPTracker.getPressure().LiveOutRegs);

  // Do not track physical registers, because it messes up.
  for (const auto &RegMaskPair : RPTracker.getPressure().LiveInRegs) {
    if (RegMaskPair.RegUnit.isVirtual())
      LiveInRegs.insert(RegMaskPair.RegUnit);
  }
  LiveOutRegs.clear();
  // There are several possibilities to distinguish:
  // 1) Reg is not input to any instruction in the block, but is output of one
  // 2) 1) + read in the block and not needed after it
  // 3) 1) + read in the block but needed in another block
  // 4) Reg is input of an instruction but another block will read it too
  // 5) Reg is input of an instruction and then rewritten in the block.
  //    result is not read in the block (implies used in another block)
  // 6) Reg is input of an instruction and then rewritten in the block.
  //    result is read in the block and not needed in another block
  // 7) Reg is input of an instruction and then rewritten in the block.
  //    result is read in the block but also needed in another block
  // LiveInRegs will contain all the regs in situation 4, 5, 6, 7
  // We want LiveOutRegs to contain only Regs whose content will be read after
  // in another block, and whose content was written in the current block,
  // that is we want it to contain 1, 3, 5, 7
  // Since we made the MIs of a block to be packed all together before
  // scheduling, the LiveIntervals were correct, and the RPTracker was
  // able to correctly handle 5 vs 6, 2 vs 3.
  // (Note: This is not sufficient for RPTracker to avoid mistakes for case 4)
  // The RPTracker's LiveOutRegs has 1, 3, (some correct or incorrect) 4, 5, 7
  // Comparing to LiveInRegs is not sufficient to differentiate 4 vs 5, 7
  // The use of isDefBetween removes the case 4.
  for (const auto &RegMaskPair : RPTracker.getPressure().LiveOutRegs) {
    Register Reg = RegMaskPair.RegUnit;
    if (Reg.isVirtual() &&
        isDefBetween(Reg, LIS->getInstructionIndex(*BeginBlock).getRegSlot(),
                     LIS->getInstructionIndex(*EndBlock).getRegSlot(), MRI,
                     LIS)) {
      LiveOutRegs.insert(Reg);
    }
  }

  // Pressure = sum_alive_registers register size
  // Internally LLVM will represent some registers as one big 128-bit register,
  // for example, even though it actually corresponds to 4 actual 32-bit
  // registers. Thus Pressure is not equal to num_alive_registers * constant.
  LiveInPressure = TopPressure.MaxSetPressure;
  LiveOutPressure = BotPressure.MaxSetPressure;

  // Prepares TopRPTracker for top down scheduling.
  TopRPTracker.closeTop();
}

void SIScheduleBlock::schedule(MachineBasicBlock::iterator BeginBlock,
                               MachineBasicBlock::iterator EndBlock) {
  if (!Scheduled)
    fastSchedule();

  // PreScheduling phase to set LiveIn and LiveOut.
  initRegPressure(BeginBlock, EndBlock);
  undoSchedule();

  // Schedule for real now.

  TopReadySUs.clear();

  for (SUnit* SU : SUnits) {
    if (!SU->NumPredsLeft)
      TopReadySUs.push_back(SU);
  }

  while (!TopReadySUs.empty()) {
    SUnit *SU = pickNode();
    ScheduledSUnits.push_back(SU);
    TopRPTracker.setPos(SU->getInstr());
    TopRPTracker.advance();
    nodeScheduled(SU);
  }

  // TODO: compute InternalAdditionalPressure.
  InternalAdditionalPressure.resize(TopPressure.MaxSetPressure.size());

  // Check everything is right.
#ifndef NDEBUG
  assert(SUnits.size() == ScheduledSUnits.size() &&
         TopReadySUs.empty());
  for (SUnit* SU : SUnits) {
    assert(SU->isScheduled &&
           SU->NumPredsLeft == 0);
  }
#endif

  Scheduled = true;
}

void SIScheduleBlock::undoSchedule() {
  for (SUnit* SU : SUnits) {
    SU->isScheduled = false;
    for (SDep& Succ : SU->Succs) {
      if (BC->isSUInBlock(Succ.getSUnit(), ID))
        undoReleaseSucc(SU, &Succ);
    }
  }
  HasLowLatencyNonWaitedParent.assign(SUnits.size(), 0);
  ScheduledSUnits.clear();
  Scheduled = false;
}

void SIScheduleBlock::undoReleaseSucc(SUnit *SU, SDep *SuccEdge) {
  SUnit *SuccSU = SuccEdge->getSUnit();

  if (SuccEdge->isWeak()) {
    ++SuccSU->WeakPredsLeft;
    return;
  }
  ++SuccSU->NumPredsLeft;
}

void SIScheduleBlock::releaseSucc(SUnit *SU, SDep *SuccEdge) {
  SUnit *SuccSU = SuccEdge->getSUnit();

  if (SuccEdge->isWeak()) {
    --SuccSU->WeakPredsLeft;
    return;
  }
#ifndef NDEBUG
  if (SuccSU->NumPredsLeft == 0) {
    dbgs() << "*** Scheduling failed! ***\n";
    DAG->dumpNode(*SuccSU);
    dbgs() << " has been released too many times!\n";
    llvm_unreachable(nullptr);
  }
#endif

  --SuccSU->NumPredsLeft;
}

/// Release the successors of SU that are inside the block (InOrOutBlock true)
/// or outside of it (InOrOutBlock false).
void SIScheduleBlock::releaseSuccessors(SUnit *SU, bool InOrOutBlock) {
  for (SDep& Succ : SU->Succs) {
    SUnit *SuccSU = Succ.getSUnit();

    if (SuccSU->NodeNum >= DAG->SUnits.size())
      continue;

    if (BC->isSUInBlock(SuccSU, ID) != InOrOutBlock)
      continue;

    releaseSucc(SU, &Succ);
    if (SuccSU->NumPredsLeft == 0 && InOrOutBlock)
      TopReadySUs.push_back(SuccSU);
  }
}

void SIScheduleBlock::nodeScheduled(SUnit *SU) {
  // Is in TopReadySUs
  assert(!SU->NumPredsLeft);
  std::vector<SUnit *>::iterator I = llvm::find(TopReadySUs, SU);
  if (I == TopReadySUs.end()) {
    dbgs() << "Data Structure Bug in SI Scheduler\n";
    llvm_unreachable(nullptr);
  }
  TopReadySUs.erase(I);

  releaseSuccessors(SU, true);
  // Scheduling this node will trigger a wait,
  // thus propagate to other instructions that they do not need to wait either.
  if (HasLowLatencyNonWaitedParent[NodeNum2Index[SU->NodeNum]])
    HasLowLatencyNonWaitedParent.assign(SUnits.size(), 0);

  if (DAG->IsLowLatencySU[SU->NodeNum]) {
    for (SDep& Succ : SU->Succs) {
      std::map<unsigned, unsigned>::iterator I =
        NodeNum2Index.find(Succ.getSUnit()->NodeNum);
      if (I != NodeNum2Index.end())
        HasLowLatencyNonWaitedParent[I->second] = 1;
    }
  }
  SU->isScheduled = true;
}

void SIScheduleBlock::finalizeUnits() {
  // We remove links from outside blocks to enable scheduling inside the block.
  for (SUnit* SU : SUnits) {
    releaseSuccessors(SU, false);
    if (DAG->IsHighLatencySU[SU->NodeNum])
      HighLatencyBlock = true;
  }
  HasLowLatencyNonWaitedParent.resize(SUnits.size(), 0);
}

// We maintain ascending order of IDs.
void SIScheduleBlock::addPred(SIScheduleBlock *Pred) {
  unsigned PredID = Pred->getID();

  // Check if not already predecessor.
  for (SIScheduleBlock* P : Preds) {
    if (PredID == P->getID())
      return;
  }
  Preds.push_back(Pred);

  assert(none_of(Succs,
                 [=](std::pair<SIScheduleBlock*,
                     SIScheduleBlockLinkKind> S) {
                   return PredID == S.first->getID();
                 }) &&
         "Loop in the Block Graph!");
}

void SIScheduleBlock::addSucc(SIScheduleBlock *Succ,
                              SIScheduleBlockLinkKind Kind) {
  unsigned SuccID = Succ->getID();

  // Check if not already successor.
  for (std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind> &S : Succs) {
    if (SuccID == S.first->getID()) {
      if (S.second == SIScheduleBlockLinkKind::NoData &&
          Kind == SIScheduleBlockLinkKind::Data)
        S.second = Kind;
      return;
    }
  }
  if (Succ->isHighLatencyBlock())
    ++NumHighLatencySuccessors;
  Succs.push_back(std::pair(Succ, Kind));

  assert(none_of(Preds,
                 [=](SIScheduleBlock *P) { return SuccID == P->getID(); }) &&
         "Loop in the Block Graph!");
}

#ifndef NDEBUG
void SIScheduleBlock::printDebug(bool full) {
  dbgs() << "Block (" << ID << ")\n";
  if (!full)
    return;

  dbgs() << "\nContains High Latency Instruction: "
         << HighLatencyBlock << '\n';
  dbgs() << "\nDepends On:\n";
  for (SIScheduleBlock* P : Preds) {
    P->printDebug(false);
  }

  dbgs() << "\nSuccessors:\n";
  for (std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind> S : Succs) {
    if (S.second == SIScheduleBlockLinkKind::Data)
      dbgs() << "(Data Dep) ";
    S.first->printDebug(false);
  }

  if (Scheduled) {
    dbgs() << "LiveInPressure "
           << LiveInPressure[AMDGPU::RegisterPressureSets::SReg_32] << ' '
           << LiveInPressure[AMDGPU::RegisterPressureSets::VGPR_32] << '\n';
    dbgs() << "LiveOutPressure "
           << LiveOutPressure[AMDGPU::RegisterPressureSets::SReg_32] << ' '
           << LiveOutPressure[AMDGPU::RegisterPressureSets::VGPR_32] << "\n\n";
    dbgs() << "LiveIns:\n";
    for (unsigned Reg : LiveInRegs)
      dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' ';

    dbgs() << "\nLiveOuts:\n";
    for (unsigned Reg : LiveOutRegs)
      dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' ';
  }

  dbgs() << "\nInstructions:\n";
  for (const SUnit* SU : SUnits)
    DAG->dumpNode(*SU);

  dbgs() << "///////////////////////\n";
}
#endif

// SIScheduleBlockCreator //

SIScheduleBlockCreator::SIScheduleBlockCreator(SIScheduleDAGMI *DAG)
    : DAG(DAG) {}

SIScheduleBlocks
SIScheduleBlockCreator::getBlocks(SISchedulerBlockCreatorVariant BlockVariant) {
  std::map<SISchedulerBlockCreatorVariant, SIScheduleBlocks>::iterator B =
    Blocks.find(BlockVariant);
  if (B == Blocks.end()) {
    SIScheduleBlocks Res;
    createBlocksForVariant(BlockVariant);
    topologicalSort();
    scheduleInsideBlocks();
    fillStats();
    Res.Blocks = CurrentBlocks;
    Res.TopDownIndex2Block = TopDownIndex2Block;
    Res.TopDownBlock2Index = TopDownBlock2Index;
    Blocks[BlockVariant] = Res;
    return Res;
  }
  return B->second;
}

bool SIScheduleBlockCreator::isSUInBlock(SUnit *SU, unsigned ID) {
  if (SU->NodeNum >= DAG->SUnits.size())
    return false;
  return CurrentBlocks[Node2CurrentBlock[SU->NodeNum]]->getID() == ID;
}

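// Give each high latency instruction its own reserved color, so that it
// seeds its own block; everything else keeps color 0 at this point.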
void SIScheduleBlockCreator::colorHighLatenciesAlone() {
  unsigned DAGSize = DAG->SUnits.size();

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    if (DAG->IsHighLatencySU[SU->NodeNum]) {
      CurrentColoring[SU->NodeNum] = NextReservedID++;
    }
  }
}

static bool
hasDataDependencyPred(const SUnit &SU, const SUnit &FromSU) {
  for (const auto &PredDep : SU.Preds) {
    if (PredDep.getSUnit() == &FromSU &&
        PredDep.getKind() == llvm::SDep::Data)
      return true;
  }
  return false;
}

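// Alternatively, group a few compatible high latency instructions (2 to 4,
// depending on how many there are) under one reserved color, so a single
// block hides several latencies at once.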
void SIScheduleBlockCreator::colorHighLatenciesGroups() {
  unsigned DAGSize = DAG->SUnits.size();
  unsigned NumHighLatencies = 0;
  unsigned GroupSize;
  int Color = NextReservedID;
  unsigned Count = 0;
  std::set<unsigned> FormingGroup;

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    if (DAG->IsHighLatencySU[SU->NodeNum])
      ++NumHighLatencies;
  }

  if (NumHighLatencies == 0)
    return;

  if (NumHighLatencies <= 6)
    GroupSize = 2;
  else if (NumHighLatencies <= 12)
    GroupSize = 3;
  else
    GroupSize = 4;

  for (unsigned SUNum : DAG->TopDownIndex2SU) {
    const SUnit &SU = DAG->SUnits[SUNum];
    if (DAG->IsHighLatencySU[SU.NodeNum]) {
      bool CompatibleGroup = true;
      int ProposedColor = Color;
      std::vector<int> AdditionalElements;

      // We don't want to put in the same block
      // two high latency instructions that depend
      // on each other.
      // One way would be to check canAddEdge
      // in both directions, but that currently is not
      // enough because there the high latency order is
      // enforced (via links).
      // Instead, look at the dependencies between the
      // high latency instructions and deduce if it is
      // a data dependency or not.
      for (unsigned j : FormingGroup) {
        bool HasSubGraph;
        std::vector<int> SubGraph;
        // By construction (topological order), if SU and
        // DAG->SUnits[j] are linked, DAG->SUnits[j] is necessarily
        // in the parent graph of SU.
#ifndef NDEBUG
        SubGraph = DAG->GetTopo()->GetSubGraph(SU, DAG->SUnits[j],
                                               HasSubGraph);
        assert(!HasSubGraph);
#endif
        SubGraph = DAG->GetTopo()->GetSubGraph(DAG->SUnits[j], SU,
                                               HasSubGraph);
        if (!HasSubGraph)
          continue; // No dependencies between each other
        if (SubGraph.size() > 5) {
          // Too many elements would be required to be added to the block.
          CompatibleGroup = false;
          break;
        }
        // Check the type of dependency
        for (unsigned k : SubGraph) {
          // If in the path to join the two instructions,
          // there is another high latency instruction,
          // or instructions colored for another block,
          // abort the merge.
          if (DAG->IsHighLatencySU[k] ||
              (CurrentColoring[k] != ProposedColor &&
               CurrentColoring[k] != 0)) {
            CompatibleGroup = false;
            break;
          }
          // If one of the SUs in the subgraph depends on the result of SU j,
          // there'll be a data dependency.
          if (hasDataDependencyPred(DAG->SUnits[k], DAG->SUnits[j])) {
            CompatibleGroup = false;
            break;
          }
        }
        if (!CompatibleGroup)
          break;
        // Same check for the SU
        if (hasDataDependencyPred(SU, DAG->SUnits[j])) {
          CompatibleGroup = false;
          break;
        }
        // Add all the required instructions to the block.
        // These cannot live in another block (because they
        // depend (order dependency) on one of the
        // instructions in the block, and are required for the
        // high latency instruction we add).
        llvm::append_range(AdditionalElements, SubGraph);
      }
      if (CompatibleGroup) {
        FormingGroup.insert(SU.NodeNum);
        for (unsigned j : AdditionalElements)
          CurrentColoring[j] = ProposedColor;
        CurrentColoring[SU.NodeNum] = ProposedColor;
        ++Count;
      }
      // Found one incompatible instruction,
      // or has filled a big enough group.
      // -> start a new one.
      if (!CompatibleGroup) {
        FormingGroup.clear();
        Color = ++NextReservedID;
        ProposedColor = Color;
        FormingGroup.insert(SU.NodeNum);
        CurrentColoring[SU.NodeNum] = ProposedColor;
        Count = 0;
      } else if (Count == GroupSize) {
        FormingGroup.clear();
        Color = ++NextReservedID;
        ProposedColor = Color;
        Count = 0;
      }
    }
  }
}

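// Give SUs distinct colors according to which combination of reserved
// (high latency) colors they transitively depend on, first walking the DAG
// top-down (ancestors), then bottom-up (descendants).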
void SIScheduleBlockCreator::colorComputeReservedDependencies() {
  unsigned DAGSize = DAG->SUnits.size();
  std::map<std::set<unsigned>, unsigned> ColorCombinations;

  CurrentTopDownReservedDependencyColoring.clear();
  CurrentBottomUpReservedDependencyColoring.clear();

  CurrentTopDownReservedDependencyColoring.resize(DAGSize, 0);
  CurrentBottomUpReservedDependencyColoring.resize(DAGSize, 0);

  // Traverse TopDown, and give different colors to SUs depending
  // on which combination of High Latencies they depend on.

  for (unsigned SUNum : DAG->TopDownIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;

    // Already given.
    if (CurrentColoring[SU->NodeNum]) {
      CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
        CurrentColoring[SU->NodeNum];
      continue;
    }

    for (SDep& PredDep : SU->Preds) {
      SUnit *Pred = PredDep.getSUnit();
      if (PredDep.isWeak() || Pred->NodeNum >= DAGSize)
        continue;
      if (CurrentTopDownReservedDependencyColoring[Pred->NodeNum] > 0)
        SUColors.insert(CurrentTopDownReservedDependencyColoring[Pred->NodeNum]);
    }
    // Color 0 by default.
    if (SUColors.empty())
      continue;
    // Same color as parents.
    if (SUColors.size() == 1 && *SUColors.begin() > DAGSize)
      CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
        *SUColors.begin();
    else {
      std::map<std::set<unsigned>, unsigned>::iterator Pos =
        ColorCombinations.find(SUColors);
      if (Pos != ColorCombinations.end()) {
        CurrentTopDownReservedDependencyColoring[SU->NodeNum] = Pos->second;
      } else {
        CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
          NextNonReservedID;
        ColorCombinations[SUColors] = NextNonReservedID++;
      }
    }
  }

  ColorCombinations.clear();

  // Same as before, but BottomUp.

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;

    // Already given.
    if (CurrentColoring[SU->NodeNum]) {
      CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
        CurrentColoring[SU->NodeNum];
      continue;
    }

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      if (CurrentBottomUpReservedDependencyColoring[Succ->NodeNum] > 0)
        SUColors.insert(CurrentBottomUpReservedDependencyColoring[Succ->NodeNum]);
    }
    // Keep color 0.
    if (SUColors.empty())
      continue;
    // Same color as children.
    if (SUColors.size() == 1 && *SUColors.begin() > DAGSize)
      CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
        *SUColors.begin();
    else {
      std::map<std::set<unsigned>, unsigned>::iterator Pos =
        ColorCombinations.find(SUColors);
      if (Pos != ColorCombinations.end()) {
        CurrentBottomUpReservedDependencyColoring[SU->NodeNum] = Pos->second;
      } else {
        CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
          NextNonReservedID;
        ColorCombinations[SUColors] = NextNonReservedID++;
      }
    }
  }
}

void SIScheduleBlockCreator::colorAccordingToReservedDependencies() {
  std::map<std::pair<unsigned, unsigned>, unsigned> ColorCombinations;

  // Every combination of colors given by the top down
  // and bottom up Reserved node dependency colorings
  // gets its own color.

  for (const SUnit &SU : DAG->SUnits) {
    std::pair<unsigned, unsigned> SUColors;

    // High latency instructions: already given.
    if (CurrentColoring[SU.NodeNum])
      continue;

    SUColors.first = CurrentTopDownReservedDependencyColoring[SU.NodeNum];
    SUColors.second = CurrentBottomUpReservedDependencyColoring[SU.NodeNum];

    std::map<std::pair<unsigned, unsigned>, unsigned>::iterator Pos =
      ColorCombinations.find(SUColors);
    if (Pos != ColorCombinations.end()) {
      CurrentColoring[SU.NodeNum] = Pos->second;
    } else {
      CurrentColoring[SU.NodeNum] = NextNonReservedID;
      ColorCombinations[SUColors] = NextNonReservedID++;
    }
  }
}

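// For instructions that have no reserved dependency in either direction,
// merge them into the block of their successors when all successors agree
// on a single block; otherwise give them a fresh color.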
void SIScheduleBlockCreator::colorEndsAccordingToDependencies() {
  unsigned DAGSize = DAG->SUnits.size();
  std::vector<int> PendingColoring = CurrentColoring;

  assert(DAGSize >= 1 &&
         CurrentBottomUpReservedDependencyColoring.size() == DAGSize &&
         CurrentTopDownReservedDependencyColoring.size() == DAGSize);
  // If there is no reserved block at all, do nothing. We don't want
  // everything in one block.
  if (*llvm::max_element(CurrentBottomUpReservedDependencyColoring) == 0 &&
      *llvm::max_element(CurrentTopDownReservedDependencyColoring) == 0)
    return;

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;
    std::set<unsigned> SUColorsPending;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    if (CurrentBottomUpReservedDependencyColoring[SU->NodeNum] > 0 ||
        CurrentTopDownReservedDependencyColoring[SU->NodeNum] > 0)
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      if (CurrentBottomUpReservedDependencyColoring[Succ->NodeNum] > 0 ||
          CurrentTopDownReservedDependencyColoring[Succ->NodeNum] > 0)
        SUColors.insert(CurrentColoring[Succ->NodeNum]);
      SUColorsPending.insert(PendingColoring[Succ->NodeNum]);
    }
    // If there is only one child/parent block, and that block
    // is not among the ones we are removing in this path, then
    // merge the instruction into that block.
    if (SUColors.size() == 1 && SUColorsPending.size() == 1)
      PendingColoring[SU->NodeNum] = *SUColors.begin();
    else // TODO: Attribute new colors depending on color
         // combination of children.
      PendingColoring[SU->NodeNum] = NextNonReservedID++;
  }
  CurrentColoring = PendingColoring;
}

void SIScheduleBlockCreator::colorForceConsecutiveOrderInGroup() {
  unsigned DAGSize = DAG->SUnits.size();
  unsigned PreviousColor;
  std::set<unsigned> SeenColors;

  if (DAGSize <= 1)
    return;

  PreviousColor = CurrentColoring[0];

  for (unsigned i = 1, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    unsigned CurrentColor = CurrentColoring[i];
    unsigned PreviousColorSave = PreviousColor;
    assert(i == SU->NodeNum);

    if (CurrentColor != PreviousColor)
      SeenColors.insert(PreviousColor);
    PreviousColor = CurrentColor;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    if (SeenColors.find(CurrentColor) == SeenColors.end())
      continue;

    if (PreviousColorSave != CurrentColor)
      CurrentColoring[i] = NextNonReservedID++;
    else
      CurrentColoring[i] = CurrentColoring[i-1];
  }
}

void SIScheduleBlockCreator::colorMergeConstantLoadsNextGroup() {
  unsigned DAGSize = DAG->SUnits.size();

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    // No predecessor: VGPR constant loading.
    // Low latency instructions usually have a predecessor (the address).
    if (SU->Preds.size() > 0 && !DAG->IsLowLatencySU[SU->NodeNum])
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      SUColors.insert(CurrentColoring[Succ->NodeNum]);
    }
    if (SUColors.size() == 1)
      CurrentColoring[SU->NodeNum] = *SUColors.begin();
  }
}

void SIScheduleBlockCreator::colorMergeIfPossibleNextGroup() {
  unsigned DAGSize = DAG->SUnits.size();

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      SUColors.insert(CurrentColoring[Succ->NodeNum]);
    }
    if (SUColors.size() == 1)
      CurrentColoring[SU->NodeNum] = *SUColors.begin();
  }
}

void SIScheduleBlockCreator::colorMergeIfPossibleNextGroupOnlyForReserved() {
  unsigned DAGSize = DAG->SUnits.size();

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      SUColors.insert(CurrentColoring[Succ->NodeNum]);
    }
    if (SUColors.size() == 1 && *SUColors.begin() <= DAGSize)
      CurrentColoring[SU->NodeNum] = *SUColors.begin();
  }
}

void SIScheduleBlockCreator::colorMergeIfPossibleSmallGroupsToNextGroup() {
  unsigned DAGSize = DAG->SUnits.size();
  std::map<unsigned, unsigned> ColorCount;

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    unsigned color = CurrentColoring[SU->NodeNum];
    ++ColorCount[color];
  }

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    unsigned color = CurrentColoring[SU->NodeNum];
    std::set<unsigned> SUColors;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    if (ColorCount[color] > 1)
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      SUColors.insert(CurrentColoring[Succ->NodeNum]);
    }
    if (SUColors.size() == 1 && *SUColors.begin() != color) {
      --ColorCount[color];
      CurrentColoring[SU->NodeNum] = *SUColors.begin();
      ++ColorCount[*SUColors.begin()];
    }
  }
}

void SIScheduleBlockCreator::cutHugeBlocks() {
  // TODO
}

void SIScheduleBlockCreator::regroupNoUserInstructions() {
  unsigned DAGSize = DAG->SUnits.size();
  int GroupID = NextNonReservedID++;

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    bool hasSuccessor = false;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      hasSuccessor = true;
    }
    if (!hasSuccessor)
      CurrentColoring[SU->NodeNum] = GroupID;
  }
}

void SIScheduleBlockCreator::colorExports() {
  unsigned ExportColor = NextNonReservedID++;
  SmallVector<unsigned, 8> ExpGroup;

  // Put all exports together in a block.
  // The block will naturally end up being scheduled last,
  // thus putting exports at the end of the schedule, which
  // is better for performance.
  // However we must ensure, for safety, the exports can be put
  // together in the same block without any other instruction.
  // This could happen, for example, when scheduling after regalloc
  // if reloading a spilled register from memory using the same
  // register as used in a previous export.
  // If that happens, do not regroup the exports.
  for (unsigned SUNum : DAG->TopDownIndex2SU) {
    const SUnit &SU = DAG->SUnits[SUNum];
    if (SIInstrInfo::isEXP(*SU.getInstr())) {
      // SU is an export instruction. Check whether one of its successor
      // dependencies is a non-export, in which case we skip export grouping.
      for (const SDep &SuccDep : SU.Succs) {
        const SUnit *SuccSU = SuccDep.getSUnit();
        if (SuccDep.isWeak() || SuccSU->NodeNum >= DAG->SUnits.size()) {
          // Ignore these dependencies.
          continue;
        }
        assert(SuccSU->isInstr() &&
               "SUnit unexpectedly not representing an instruction!");

        if (!SIInstrInfo::isEXP(*SuccSU->getInstr())) {
          // A non-export depends on us. Skip export grouping.
          // Note that this is a bit pessimistic: We could still group all
          // other exports that are not depended on by non-exports, directly
          // or indirectly. Simply skipping this particular export but
          // grouping all others would not account for indirect dependencies.
          return;
        }
      }
      ExpGroup.push_back(SUNum);
    }
  }

  // The group can be formed. Give the color.
  for (unsigned j : ExpGroup)
    CurrentColoring[j] = ExportColor;
}

void SIScheduleBlockCreator::createBlocksForVariant(SISchedulerBlockCreatorVariant BlockVariant) {
  unsigned DAGSize = DAG->SUnits.size();
  std::map<unsigned,unsigned> RealID;

  CurrentBlocks.clear();
  CurrentColoring.clear();
  CurrentColoring.resize(DAGSize, 0);
  Node2CurrentBlock.clear();

  // Restore links that a previous scheduling variant has overridden.
  DAG->restoreSULinksLeft();

  NextReservedID = 1;
  NextNonReservedID = DAGSize + 1;

  LLVM_DEBUG(dbgs() << "Coloring the graph\n");

  if (BlockVariant == SISchedulerBlockCreatorVariant::LatenciesGrouped)
    colorHighLatenciesGroups();
  else
    colorHighLatenciesAlone();
  colorComputeReservedDependencies();
  colorAccordingToReservedDependencies();
  colorEndsAccordingToDependencies();
  if (BlockVariant == SISchedulerBlockCreatorVariant::LatenciesAlonePlusConsecutive)
    colorForceConsecutiveOrderInGroup();
  regroupNoUserInstructions();
  colorMergeConstantLoadsNextGroup();
  colorMergeIfPossibleNextGroupOnlyForReserved();
  colorExports();

  // Put SUs of the same color into the same block.
  Node2CurrentBlock.resize(DAGSize, -1);
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    unsigned Color = CurrentColoring[SU->NodeNum];
    if (RealID.find(Color) == RealID.end()) {
      int ID = CurrentBlocks.size();
      BlockPtrs.push_back(std::make_unique<SIScheduleBlock>(DAG, this, ID));
      CurrentBlocks.push_back(BlockPtrs.rbegin()->get());
      RealID[Color] = ID;
    }
    CurrentBlocks[RealID[Color]]->addUnit(SU);
    Node2CurrentBlock[SU->NodeNum] = RealID[Color];
  }

  // Build dependencies between blocks.
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    int SUID = Node2CurrentBlock[i];
    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      if (Node2CurrentBlock[Succ->NodeNum] != SUID)
        CurrentBlocks[SUID]->addSucc(CurrentBlocks[Node2CurrentBlock[Succ->NodeNum]],
                                     SuccDep.isCtrl() ? NoData : Data);
    }
    for (SDep& PredDep : SU->Preds) {
      SUnit *Pred = PredDep.getSUnit();
      if (PredDep.isWeak() || Pred->NodeNum >= DAGSize)
        continue;
      if (Node2CurrentBlock[Pred->NodeNum] != SUID)
        CurrentBlocks[SUID]->addPred(CurrentBlocks[Node2CurrentBlock[Pred->NodeNum]]);
    }
  }

  // Free roots and leaves of all blocks to enable scheduling inside them.
  for (SIScheduleBlock *Block : CurrentBlocks)
    Block->finalizeUnits();
  LLVM_DEBUG({
    dbgs() << "Blocks created:\n\n";
    for (SIScheduleBlock *Block : CurrentBlocks)
      Block->printDebug(true);
  });
}

// Function taken from Codegen/MachineScheduler.cpp

/// Non-const version.
static MachineBasicBlock::iterator
nextIfDebug(MachineBasicBlock::iterator I,
            MachineBasicBlock::const_iterator End) {
  for (; I != End; ++I) {
    if (!I->isDebugInstr())
      break;
  }
  return I;
}

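// Order the blocks topologically: repeatedly take a block with no unvisited
// successors and assign it the highest remaining index, so that
// TopDownIndex2Block ends up listing predecessors before successors.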
void SIScheduleBlockCreator::topologicalSort() {
  unsigned DAGSize = CurrentBlocks.size();
  std::vector<int> WorkList;

  LLVM_DEBUG(dbgs() << "Topological Sort\n");

  WorkList.reserve(DAGSize);
  TopDownIndex2Block.resize(DAGSize);
  TopDownBlock2Index.resize(DAGSize);
  BottomUpIndex2Block.resize(DAGSize);

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    unsigned Degree = Block->getSuccs().size();
    TopDownBlock2Index[i] = Degree;
    if (Degree == 0) {
      WorkList.push_back(i);
    }
  }

  int Id = DAGSize;
  while (!WorkList.empty()) {
    int i = WorkList.back();
    SIScheduleBlock *Block = CurrentBlocks[i];
    WorkList.pop_back();
    TopDownBlock2Index[i] = --Id;
    TopDownIndex2Block[Id] = i;
    for (SIScheduleBlock* Pred : Block->getPreds()) {
      if (!--TopDownBlock2Index[Pred->getID()])
        WorkList.push_back(Pred->getID());
    }
  }

#ifndef NDEBUG
  // Check correctness of the ordering.
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    for (SIScheduleBlock* Pred : Block->getPreds()) {
      assert(TopDownBlock2Index[i] > TopDownBlock2Index[Pred->getID()] &&
             "Wrong Top Down topological sorting");
    }
  }
#endif

  BottomUpIndex2Block = std::vector<int>(TopDownIndex2Block.rbegin(),
                                         TopDownIndex2Block.rend());
}

void SIScheduleBlockCreator::scheduleInsideBlocks() {
  unsigned DAGSize = CurrentBlocks.size();

  LLVM_DEBUG(dbgs() << "\nScheduling Blocks\n\n");

  // We first produce a valid (if naive) schedule, such that each Block
  // corresponds to a contiguous range of instructions.
  LLVM_DEBUG(dbgs() << "First phase: Fast scheduling for Reg Liveness\n");
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    Block->fastSchedule();
  }

  // Note: the following code, and the part restoring previous position
  // is by far the most expensive operation of the Scheduler.

  // Do not update CurrentTop.
  MachineBasicBlock::iterator CurrentTopFastSched = DAG->getCurrentTop();
  std::vector<MachineBasicBlock::iterator> PosOld;
  std::vector<MachineBasicBlock::iterator> PosNew;
  PosOld.reserve(DAG->SUnits.size());
  PosNew.reserve(DAG->SUnits.size());

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    int BlockIndice = TopDownIndex2Block[i];
    SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
    std::vector<SUnit*> SUs = Block->getScheduledUnits();

    for (SUnit* SU : SUs) {
      MachineInstr *MI = SU->getInstr();
      MachineBasicBlock::iterator Pos = MI;
      PosOld.push_back(Pos);
      if (&*CurrentTopFastSched == MI) {
        PosNew.push_back(Pos);
        CurrentTopFastSched = nextIfDebug(++CurrentTopFastSched,
                                          DAG->getCurrentBottom());
      } else {
        // Update the instruction stream.
        DAG->getBB()->splice(CurrentTopFastSched, DAG->getBB(), MI);

        // Update LiveIntervals.
        // Note: Moving all instructions and calling handleMove every time
        // is the most cpu intensive operation of the scheduler.
        // It would gain a lot if there was a way to recompute the
        // LiveIntervals for the entire scheduling region.
        DAG->getLIS()->handleMove(*MI, /*UpdateFlags=*/true);
        PosNew.push_back(CurrentTopFastSched);
      }
    }
  }

  // Now we have Block of SUs == Block of MI.
  // We do the final schedule for the instructions inside the block.
  // The property that all the SUs of the Block are grouped together as MI
  // is used for correct reg usage tracking.
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    std::vector<SUnit*> SUs = Block->getScheduledUnits();
    Block->schedule((*SUs.begin())->getInstr(), (*SUs.rbegin())->getInstr());
  }

  LLVM_DEBUG(dbgs() << "Restoring MI Pos\n");
  // Restore old ordering (which prevents a LIS->handleMove bug).
  for (unsigned i = PosOld.size(), e = 0; i != e; --i) {
    MachineBasicBlock::iterator POld = PosOld[i-1];
    MachineBasicBlock::iterator PNew = PosNew[i-1];
    if (PNew != POld) {
      // Update the instruction stream.
      DAG->getBB()->splice(POld, DAG->getBB(), PNew);

      // Update LiveIntervals.
      DAG->getLIS()->handleMove(*POld, /*UpdateFlags=*/true);
    }
  }

  LLVM_DEBUG({
    for (SIScheduleBlock *Block : CurrentBlocks)
      Block->printDebug(true);
  });
}

void SIScheduleBlockCreator::fillStats() {
  unsigned DAGSize = CurrentBlocks.size();

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    int BlockIndice = TopDownIndex2Block[i];
    SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
    if (Block->getPreds().empty())
      Block->Depth = 0;
    else {
      unsigned Depth = 0;
      for (SIScheduleBlock *Pred : Block->getPreds()) {
        if (Depth < Pred->Depth + Pred->getCost())
          Depth = Pred->Depth + Pred->getCost();
      }
      Block->Depth = Depth;
    }
  }

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    int BlockIndice = BottomUpIndex2Block[i];
    SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
    if (Block->getSuccs().empty())
      Block->Height = 0;
    else {
      unsigned Height = 0;
      for (const auto &Succ : Block->getSuccs())
        Height = std::max(Height, Succ.first->Height + Succ.first->getCost());
      Block->Height = Height;
    }
  }
}

// SIScheduleBlockScheduler //

SIScheduleBlockScheduler::SIScheduleBlockScheduler(SIScheduleDAGMI *DAG,
                                                   SISchedulerBlockSchedulerVariant Variant,
                                                   SIScheduleBlocks BlocksStruct) :
  DAG(DAG), Variant(Variant), Blocks(BlocksStruct.Blocks),
  LastPosWaitedHighLatency(0), NumBlockScheduled(0), VregCurrentUsage(0),
  SregCurrentUsage(0), maxVregUsage(0), maxSregUsage(0) {

  // Fill the usage of every output.
  // Warning: while by construction we always have a link between two blocks
  // when one needs a result from the other, the number of users of an output
  // is not the sum of child blocks having as input the same virtual register.
  // Here is an example. A produces x and y. B eats x and produces x'.
  // C eats x' and y. The register coalescer may have attributed the same
  // virtual register to x and x'.
  // To count accurately, we do a topological sort. In case the register is
  // found for several parents, we increment the usage of the one with the
  // highest topological index.
  LiveOutRegsNumUsages.resize(Blocks.size());
  for (SIScheduleBlock *Block : Blocks) {
    for (unsigned Reg : Block->getInRegs()) {
      bool Found = false;
      int topoInd = -1;
      for (SIScheduleBlock* Pred: Block->getPreds()) {
        std::set<unsigned> PredOutRegs = Pred->getOutRegs();
        std::set<unsigned>::iterator RegPos = PredOutRegs.find(Reg);

        if (RegPos != PredOutRegs.end()) {
          Found = true;
          if (topoInd < BlocksStruct.TopDownBlock2Index[Pred->getID()]) {
            topoInd = BlocksStruct.TopDownBlock2Index[Pred->getID()];
          }
        }
      }

      if (!Found)
        continue;

      int PredID = BlocksStruct.TopDownIndex2Block[topoInd];
      ++LiveOutRegsNumUsages[PredID][Reg];
    }
  }

  LastPosHighLatencyParentScheduled.resize(Blocks.size(), 0);
  BlockNumPredsLeft.resize(Blocks.size());
  BlockNumSuccsLeft.resize(Blocks.size());

  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = Blocks[i];
    BlockNumPredsLeft[i] = Block->getPreds().size();
    BlockNumSuccsLeft[i] = Block->getSuccs().size();
  }

#ifndef NDEBUG
  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = Blocks[i];
    assert(Block->getID() == i);
  }
#endif

  std::set<unsigned> InRegs = DAG->getInRegs();
  addLiveRegs(InRegs);

  // Increase LiveOutRegsNumUsages for blocks
  // producing registers consumed in another
  // scheduling region.
  for (unsigned Reg : DAG->getOutRegs()) {
    for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
      // Do reverse traversal.
      int ID = BlocksStruct.TopDownIndex2Block[Blocks.size()-1-i];
      SIScheduleBlock *Block = Blocks[ID];
      const std::set<unsigned> &OutRegs = Block->getOutRegs();

      if (OutRegs.find(Reg) == OutRegs.end())
        continue;

      ++LiveOutRegsNumUsages[ID][Reg];
      break;
    }
  }

  // Fill LiveRegsConsumers for regs that were already
  // defined before scheduling.
  for (SIScheduleBlock *Block : Blocks) {
    for (unsigned Reg : Block->getInRegs()) {
      bool Found = false;
      for (SIScheduleBlock* Pred: Block->getPreds()) {
        std::set<unsigned> PredOutRegs = Pred->getOutRegs();
        std::set<unsigned>::iterator RegPos = PredOutRegs.find(Reg);

        if (RegPos != PredOutRegs.end()) {
          Found = true;
          break;
        }
      }

      if (!Found)
        ++LiveRegsConsumers[Reg];
    }
  }

  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = Blocks[i];
    if (BlockNumPredsLeft[i] == 0) {
      ReadyBlocks.push_back(Block);
    }
  }

  while (SIScheduleBlock *Block = pickBlock()) {
    BlocksScheduled.push_back(Block);
    blockScheduled(Block);
  }

  LLVM_DEBUG(dbgs() << "Block Order:"; for (SIScheduleBlock *Block
                                            : BlocksScheduled) {
    dbgs() << ' ' << Block->getID();
  } dbgs() << '\n';);
}

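// Latency-oriented tie-break cascade: prefer the block whose high latency
// parents were scheduled the longest ago, then high latency blocks
// themselves (issued early so they are easier to hide), then their height,
// then the number of high latency successors.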
bool SIScheduleBlockScheduler::tryCandidateLatency(SIBlockSchedCandidate &Cand,
                                                   SIBlockSchedCandidate &TryCand) {
  if (!Cand.isValid()) {
    TryCand.Reason = NodeOrder;
    return true;
  }

  // Try to hide high latencies.
  if (SISched::tryLess(TryCand.LastPosHighLatParentScheduled,
                       Cand.LastPosHighLatParentScheduled, TryCand, Cand, Latency))
    return true;
  // Schedule high latencies early so you can hide them better.
  if (SISched::tryGreater(TryCand.IsHighLatency, Cand.IsHighLatency,
                          TryCand, Cand, Latency))
    return true;
  if (TryCand.IsHighLatency && SISched::tryGreater(TryCand.Height, Cand.Height,
                                                   TryCand, Cand, Depth))
    return true;
  if (SISched::tryGreater(TryCand.NumHighLatencySuccessors,
                          Cand.NumHighLatencySuccessors,
                          TryCand, Cand, Successor))
    return true;
  return false;
}

bool SIScheduleBlockScheduler::tryCandidateRegUsage(SIBlockSchedCandidate &Cand,
                                                    SIBlockSchedCandidate &TryCand) {
  if (!Cand.isValid()) {
    TryCand.Reason = NodeOrder;
    return true;
  }

  if (SISched::tryLess(TryCand.VGPRUsageDiff > 0, Cand.VGPRUsageDiff > 0,
                       TryCand, Cand, RegUsage))
    return true;
  if (SISched::tryGreater(TryCand.NumSuccessors > 0,
                          Cand.NumSuccessors > 0,
                          TryCand, Cand, Successor))
    return true;
  if (SISched::tryGreater(TryCand.Height, Cand.Height, TryCand, Cand, Depth))
    return true;
  if (SISched::tryLess(TryCand.VGPRUsageDiff, Cand.VGPRUsageDiff,
                       TryCand, Cand, RegUsage))
    return true;
  return false;
}

SIScheduleBlock *SIScheduleBlockScheduler::pickBlock() {
  SIBlockSchedCandidate Cand;
  std::vector<SIScheduleBlock*>::iterator Best;
  SIScheduleBlock *Block;
  if (ReadyBlocks.empty())
    return nullptr;

  DAG->fillVgprSgprCost(LiveRegs.begin(), LiveRegs.end(),
                        VregCurrentUsage, SregCurrentUsage);
  if (VregCurrentUsage > maxVregUsage)
    maxVregUsage = VregCurrentUsage;
  if (SregCurrentUsage > maxSregUsage)
    maxSregUsage = SregCurrentUsage;
  LLVM_DEBUG(dbgs() << "Picking New Blocks\n"; dbgs() << "Available: ";
             for (SIScheduleBlock *Block
                  : ReadyBlocks) dbgs()
             << Block->getID() << ' ';
             dbgs() << "\nCurrent Live:\n";
             for (unsigned Reg
                  : LiveRegs) dbgs()
             << printVRegOrUnit(Reg, DAG->getTRI()) << ' ';
             dbgs() << '\n';
             dbgs() << "Current VGPRs: " << VregCurrentUsage << '\n';
             dbgs() << "Current SGPRs: " << SregCurrentUsage << '\n';);

  Cand.Block = nullptr;
  for (std::vector<SIScheduleBlock*>::iterator I = ReadyBlocks.begin(),
       E = ReadyBlocks.end(); I != E; ++I) {
    SIBlockSchedCandidate TryCand;
    TryCand.Block = *I;
    TryCand.IsHighLatency = TryCand.Block->isHighLatencyBlock();
    TryCand.VGPRUsageDiff =
      checkRegUsageImpact(TryCand.Block->getInRegs(),
                          TryCand.Block->getOutRegs())[AMDGPU::RegisterPressureSets::VGPR_32];
    TryCand.NumSuccessors = TryCand.Block->getSuccs().size();
    TryCand.NumHighLatencySuccessors =
      TryCand.Block->getNumHighLatencySuccessors();
    TryCand.LastPosHighLatParentScheduled =
      (unsigned int) std::max<int>(0,
        LastPosHighLatencyParentScheduled[TryCand.Block->getID()] -
          LastPosWaitedHighLatency);
    TryCand.Height = TryCand.Block->Height;
    // Try not to increase VGPR usage too much, else we may spill.
    if (VregCurrentUsage > 120 ||
        Variant != SISchedulerBlockSchedulerVariant::BlockLatencyRegUsage) {
      if (!tryCandidateRegUsage(Cand, TryCand) &&
          Variant != SISchedulerBlockSchedulerVariant::BlockRegUsage)
        tryCandidateLatency(Cand, TryCand);
    } else {
      if (!tryCandidateLatency(Cand, TryCand))
        tryCandidateRegUsage(Cand, TryCand);
    }
    if (TryCand.Reason != NoCand) {
      Cand.setBest(TryCand);
      Best = I;
      LLVM_DEBUG(dbgs() << "Best Current Choice: " << Cand.Block->getID() << ' '
                        << getReasonStr(Cand.Reason) << '\n');
    }
  }

  LLVM_DEBUG(dbgs() << "Picking: " << Cand.Block->getID() << '\n';
             dbgs() << "Is a block with high latency instruction: "
                    << (Cand.IsHighLatency ? "yes\n" : "no\n");
             dbgs() << "Position of last high latency dependency: "
                    << Cand.LastPosHighLatParentScheduled << '\n';
             dbgs() << "VGPRUsageDiff: " << Cand.VGPRUsageDiff << '\n';
             dbgs() << '\n';);

  Block = Cand.Block;
  ReadyBlocks.erase(Best);
  return Block;
}
1643
// Tracking of currently alive registers to determine VGPR usage.

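// Add a set of registers (typically the output registers of a scheduled
// block) to the set of currently live registers.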
void SIScheduleBlockScheduler::addLiveRegs(std::set<unsigned> &Regs) {
  for (Register Reg : Regs) {
    // For now only track virtual registers.
    if (!Reg.isVirtual())
      continue;
    // If not already in the live set, then add it.
    (void) LiveRegs.insert(Reg);
  }
}

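// Decrement the consumer count of each input register; a register leaves
// the live set once its last consumer block has been scheduled.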
void SIScheduleBlockScheduler::decreaseLiveRegs(SIScheduleBlock *Block,
                                                std::set<unsigned> &Regs) {
  for (unsigned Reg : Regs) {
    // For now only track virtual registers.
    std::set<unsigned>::iterator Pos = LiveRegs.find(Reg);
    assert (Pos != LiveRegs.end() && // Reg must be live.
            LiveRegsConsumers.find(Reg) != LiveRegsConsumers.end() &&
            LiveRegsConsumers[Reg] >= 1);
    --LiveRegsConsumers[Reg];
    if (LiveRegsConsumers[Reg] == 0)
      LiveRegs.erase(Pos);
  }
}

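// Release the successors of a scheduled block: a successor becomes ready
// once all its predecessors have been scheduled. Also record at which
// position the last high-latency (data) parent of each successor was
// scheduled, which feeds the latency heuristic.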
void SIScheduleBlockScheduler::releaseBlockSuccs(SIScheduleBlock *Parent) {
  for (const auto &Block : Parent->getSuccs()) {
    if (--BlockNumPredsLeft[Block.first->getID()] == 0)
      ReadyBlocks.push_back(Block.first);

    if (Parent->isHighLatencyBlock() &&
        Block.second == SIScheduleBlockLinkKind::Data)
      LastPosHighLatencyParentScheduled[Block.first->getID()] =
        NumBlockScheduled;
  }
}

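// Update the scheduler state once a block has been scheduled: live
// registers, consumer counts, ready successors and the high-latency
// bookkeeping.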
void SIScheduleBlockScheduler::blockScheduled(SIScheduleBlock *Block) {
  decreaseLiveRegs(Block, Block->getInRegs());
  addLiveRegs(Block->getOutRegs());
  releaseBlockSuccs(Block);
  for (const auto &RegP : LiveOutRegsNumUsages[Block->getID()]) {
    // We produce this register, thus it must not be previously alive.
    assert(LiveRegsConsumers.find(RegP.first) == LiveRegsConsumers.end() ||
           LiveRegsConsumers[RegP.first] == 0);
    LiveRegsConsumers[RegP.first] += RegP.second;
  }
  if (LastPosHighLatencyParentScheduled[Block->getID()] >
        (unsigned)LastPosWaitedHighLatency)
    LastPosWaitedHighLatency =
      LastPosHighLatencyParentScheduled[Block->getID()];
  ++NumBlockScheduled;
}

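// Estimate the per-pressure-set register pressure delta of scheduling a
// block: input registers consumed for the last time decrease pressure,
// output registers increase it.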
std::vector<int>
SIScheduleBlockScheduler::checkRegUsageImpact(std::set<unsigned> &InRegs,
                                              std::set<unsigned> &OutRegs) {
  std::vector<int> DiffSetPressure;
  DiffSetPressure.assign(DAG->getTRI()->getNumRegPressureSets(), 0);

  for (Register Reg : InRegs) {
    // For now only track virtual registers.
    if (!Reg.isVirtual())
      continue;
    if (LiveRegsConsumers[Reg] > 1)
      continue;
    PSetIterator PSetI = DAG->getMRI()->getPressureSets(Reg);
    for (; PSetI.isValid(); ++PSetI) {
      DiffSetPressure[*PSetI] -= PSetI.getWeight();
    }
  }

  for (Register Reg : OutRegs) {
    // For now only track virtual registers.
    if (!Reg.isVirtual())
      continue;
    PSetIterator PSetI = DAG->getMRI()->getPressureSets(Reg);
    for (; PSetI.isValid(); ++PSetI) {
      DiffSetPressure[*PSetI] += PSetI.getWeight();
    }
  }

  return DiffSetPressure;
}

// SIScheduler //

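// Run one (block creation, block scheduling) variant pair and return the
// resulting instruction order together with the maximum SGPR and VGPR
// usage observed.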
struct SIScheduleBlockResult
SIScheduler::scheduleVariant(SISchedulerBlockCreatorVariant BlockVariant,
                             SISchedulerBlockSchedulerVariant ScheduleVariant) {
  SIScheduleBlocks Blocks = BlockCreator.getBlocks(BlockVariant);
  SIScheduleBlockScheduler Scheduler(DAG, ScheduleVariant, Blocks);
  std::vector<SIScheduleBlock*> ScheduledBlocks;
  struct SIScheduleBlockResult Res;

  ScheduledBlocks = Scheduler.getBlocks();

  for (SIScheduleBlock *Block : ScheduledBlocks) {
    std::vector<SUnit*> SUs = Block->getScheduledUnits();

    for (SUnit *SU : SUs)
      Res.SUs.push_back(SU->NodeNum);
  }

  Res.MaxSGPRUsage = Scheduler.getSGPRUsage();
  Res.MaxVGPRUsage = Scheduler.getVGPRUsage();
  return Res;
}

// SIScheduleDAGMI //

SIScheduleDAGMI::SIScheduleDAGMI(MachineSchedContext *C) :
  ScheduleDAGMILive(C, std::make_unique<GenericScheduler>(C)) {
  SITII = static_cast<const SIInstrInfo*>(TII);
  SITRI = static_cast<const SIRegisterInfo*>(TRI);
}

SIScheduleDAGMI::~SIScheduleDAGMI() = default;

// Code adapted from scheduleDAG.cpp.
// Performs a topological sort over the SUs and records both the
// top-down and bottom-up orders.
void SIScheduleDAGMI::topologicalSort() {
  Topo.InitDAGTopologicalSorting();

  TopDownIndex2SU = std::vector<int>(Topo.begin(), Topo.end());
  BottomUpIndex2SU = std::vector<int>(Topo.rbegin(), Topo.rend());
}

// Move low-latency instructions further from their users without
// increasing SGPR usage (in general).
// This is to be replaced by a better pass that would take SGPR usage
// into account (based on VGPR usage and the corresponding wavefront
// count), and that would try to merge groups of loads when it makes
// sense, etc.
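// Illustrative example (hypothetical schedule): with a low-latency load
// L, its user U and independent instructions A and B ordered as
//   A L B U
// this pass tries to hoist L, giving
//   L A B U
// so that more independent work separates the load from its user.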
void SIScheduleDAGMI::moveLowLatencies() {
  unsigned DAGSize = SUnits.size();
  int LastLowLatencyUser = -1;
  int LastLowLatencyPos = -1;

  for (unsigned i = 0, e = ScheduledSUnits.size(); i != e; ++i) {
    SUnit *SU = &SUnits[ScheduledSUnits[i]];
    bool IsLowLatencyUser = false;
    unsigned MinPos = 0;

    for (SDep& PredDep : SU->Preds) {
      SUnit *Pred = PredDep.getSUnit();
      if (SITII->isLowLatencyInstruction(*Pred->getInstr())) {
        IsLowLatencyUser = true;
      }
      if (Pred->NodeNum >= DAGSize)
        continue;
      unsigned PredPos = ScheduledSUnitsInv[Pred->NodeNum];
      if (PredPos >= MinPos)
        MinPos = PredPos + 1;
    }

    if (SITII->isLowLatencyInstruction(*SU->getInstr())) {
      unsigned BestPos = LastLowLatencyUser + 1;
      if ((int)BestPos <= LastLowLatencyPos)
        BestPos = LastLowLatencyPos + 1;
      if (BestPos < MinPos)
        BestPos = MinPos;
      if (BestPos < i) {
        for (unsigned u = i; u > BestPos; --u) {
          ++ScheduledSUnitsInv[ScheduledSUnits[u-1]];
          ScheduledSUnits[u] = ScheduledSUnits[u-1];
        }
        ScheduledSUnits[BestPos] = SU->NodeNum;
        ScheduledSUnitsInv[SU->NodeNum] = BestPos;
      }
      LastLowLatencyPos = BestPos;
      if (IsLowLatencyUser)
        LastLowLatencyUser = BestPos;
    } else if (IsLowLatencyUser) {
      LastLowLatencyUser = i;
    // Also move the COPY instructions on which the low-latency
    // instructions depend.
    } else if (SU->getInstr()->getOpcode() == AMDGPU::COPY) {
      bool CopyForLowLat = false;
      for (SDep& SuccDep : SU->Succs) {
        SUnit *Succ = SuccDep.getSUnit();
        if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
          continue;
        if (SITII->isLowLatencyInstruction(*Succ->getInstr())) {
          CopyForLowLat = true;
        }
      }
      if (!CopyForLowLat)
        continue;
      if (MinPos < i) {
        for (unsigned u = i; u > MinPos; --u) {
          ++ScheduledSUnitsInv[ScheduledSUnits[u-1]];
          ScheduledSUnits[u] = ScheduledSUnits[u-1];
        }
        ScheduledSUnits[MinPos] = SU->NodeNum;
        ScheduledSUnitsInv[SU->NodeNum] = MinPos;
      }
    }
  }
}

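// Restore the predecessor/successor counters saved in SUnitsLinksBackup,
// so that the SUnits can be scheduled again (e.g. by another variant).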
void SIScheduleDAGMI::restoreSULinksLeft() {
  for (unsigned i = 0, e = SUnits.size(); i != e; ++i) {
    SUnits[i].isScheduled = false;
    SUnits[i].WeakPredsLeft = SUnitsLinksBackup[i].WeakPredsLeft;
    SUnits[i].NumPredsLeft = SUnitsLinksBackup[i].NumPredsLeft;
    SUnits[i].WeakSuccsLeft = SUnitsLinksBackup[i].WeakSuccsLeft;
    SUnits[i].NumSuccsLeft = SUnitsLinksBackup[i].NumSuccsLeft;
  }
}

// Return the VGPR and SGPR usage corresponding to some virtual registers.
template<typename _Iterator> void
SIScheduleDAGMI::fillVgprSgprCost(_Iterator First, _Iterator End,
                                  unsigned &VgprUsage, unsigned &SgprUsage) {
  VgprUsage = 0;
  SgprUsage = 0;
  for (_Iterator RegI = First; RegI != End; ++RegI) {
    Register Reg = *RegI;
    // For now only track virtual registers.
    if (!Reg.isVirtual())
      continue;
    PSetIterator PSetI = MRI.getPressureSets(Reg);
    for (; PSetI.isValid(); ++PSetI) {
      if (*PSetI == AMDGPU::RegisterPressureSets::VGPR_32)
        VgprUsage += PSetI.getWeight();
      else if (*PSetI == AMDGPU::RegisterPressureSets::SReg_32)
        SgprUsage += PSetI.getWeight();
    }
  }
}

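// Main entry point: build the DAG, compute per-SU latency information,
// run the block scheduler (retrying with more register-friendly variants
// when VGPR usage is too high), then emit the chosen instruction order.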
void SIScheduleDAGMI::schedule() {
  SmallVector<SUnit*, 8> TopRoots, BotRoots;
  SIScheduleBlockResult Best, Temp;
  LLVM_DEBUG(dbgs() << "Preparing Scheduling\n");

  buildDAGWithRegPressure();
  postProcessDAG();

  LLVM_DEBUG(dump());
  if (PrintDAGs)
    dump();
  if (ViewMISchedDAGs)
    viewGraph();

  topologicalSort();
  findRootsAndBiasEdges(TopRoots, BotRoots);
  // We reuse several ScheduleDAGMI and ScheduleDAGMILive functions, but
  // to make them happy we must initialize the default scheduler
  // implementation (even if we do not run it).
  SchedImpl->initialize(this);
  initQueues(TopRoots, BotRoots);

  // Fill some stats to help scheduling.

  SUnitsLinksBackup = SUnits;
  IsLowLatencySU.clear();
  LowLatencyOffset.clear();
  IsHighLatencySU.clear();

  IsLowLatencySU.resize(SUnits.size(), 0);
  LowLatencyOffset.resize(SUnits.size(), 0);
  IsHighLatencySU.resize(SUnits.size(), 0);

  for (unsigned i = 0, e = (unsigned)SUnits.size(); i != e; ++i) {
    SUnit *SU = &SUnits[i];
    const MachineOperand *BaseLatOp;
    int64_t OffLatReg;
    if (SITII->isLowLatencyInstruction(*SU->getInstr())) {
      IsLowLatencySU[i] = 1;
      bool OffsetIsScalable;
      if (SITII->getMemOperandWithOffset(*SU->getInstr(), BaseLatOp, OffLatReg,
                                         OffsetIsScalable, TRI))
        LowLatencyOffset[i] = OffLatReg;
    } else if (SITII->isHighLatencyDef(SU->getInstr()->getOpcode()))
      IsHighLatencySU[i] = 1;
  }

  SIScheduler Scheduler(this);
  Best = Scheduler.scheduleVariant(
      SISchedulerBlockCreatorVariant::LatenciesAlone,
      SISchedulerBlockSchedulerVariant::BlockLatencyRegUsage);

  // If VGPR usage is extremely high, try other well-performing variants
  // that could lead to lower VGPR usage.
  if (Best.MaxVGPRUsage > 180) {
    static const std::pair<SISchedulerBlockCreatorVariant,
                           SISchedulerBlockSchedulerVariant>
        Variants[] = {
      { LatenciesAlone, BlockRegUsageLatency },
//    { LatenciesAlone, BlockRegUsage },
      { LatenciesGrouped, BlockLatencyRegUsage },
//    { LatenciesGrouped, BlockRegUsageLatency },
//    { LatenciesGrouped, BlockRegUsage },
      { LatenciesAlonePlusConsecutive, BlockLatencyRegUsage },
//    { LatenciesAlonePlusConsecutive, BlockRegUsageLatency },
//    { LatenciesAlonePlusConsecutive, BlockRegUsage }
    };
    for (std::pair<SISchedulerBlockCreatorVariant,
                   SISchedulerBlockSchedulerVariant> v : Variants) {
      Temp = Scheduler.scheduleVariant(v.first, v.second);
      if (Temp.MaxVGPRUsage < Best.MaxVGPRUsage)
        Best = Temp;
    }
  }
  // If VGPR usage is still extremely high, we may spill. Try other
  // variants that perform worse, but that could lead to lower VGPR usage.
  if (Best.MaxVGPRUsage > 200) {
    static const std::pair<SISchedulerBlockCreatorVariant,
                           SISchedulerBlockSchedulerVariant>
        Variants[] = {
//    { LatenciesAlone, BlockRegUsageLatency },
      { LatenciesAlone, BlockRegUsage },
//    { LatenciesGrouped, BlockLatencyRegUsage },
      { LatenciesGrouped, BlockRegUsageLatency },
      { LatenciesGrouped, BlockRegUsage },
//    { LatenciesAlonePlusConsecutive, BlockLatencyRegUsage },
      { LatenciesAlonePlusConsecutive, BlockRegUsageLatency },
      { LatenciesAlonePlusConsecutive, BlockRegUsage }
    };
    for (std::pair<SISchedulerBlockCreatorVariant,
                   SISchedulerBlockSchedulerVariant> v : Variants) {
      Temp = Scheduler.scheduleVariant(v.first, v.second);
      if (Temp.MaxVGPRUsage < Best.MaxVGPRUsage)
        Best = Temp;
    }
  }

  ScheduledSUnits = Best.SUs;
  ScheduledSUnitsInv.resize(SUnits.size());

  for (unsigned i = 0, e = (unsigned)SUnits.size(); i != e; ++i) {
    ScheduledSUnitsInv[ScheduledSUnits[i]] = i;
  }

  moveLowLatencies();

  // Tell the outside world about the result of the scheduling.

  assert(TopRPTracker.getPos() == RegionBegin && "bad initial Top tracker");
  TopRPTracker.setPos(CurrentTop);

  for (unsigned I : ScheduledSUnits) {
    SUnit *SU = &SUnits[I];

    scheduleMI(SU, true);

    LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") "
                      << *SU->getInstr());
  }

  assert(CurrentTop == CurrentBottom && "Nonempty unscheduled zone.");

  placeDebugValues();

  LLVM_DEBUG({
    dbgs() << "*** Final schedule for "
           << printMBBReference(*begin()->getParent()) << " ***\n";
    dumpSchedule();
    dbgs() << '\n';
  });
}

