//===-- GCNSchedStrategy.cpp - GCN Scheduler Strategy ---------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This contains a MachineSchedStrategy implementation for maximizing wave
/// occupancy on GCN hardware.
///
/// This pass will apply multiple scheduling stages to the same function.
/// Regions are first recorded in GCNScheduleDAGMILive::schedule. The actual
/// entry point for the scheduling of those regions is
/// GCNScheduleDAGMILive::runSchedStages.
///
/// Generally, the reason for having multiple scheduling stages is to account
/// for the kernel-wide effect of register usage on occupancy. Usually, only a
/// few scheduling regions will have register pressure high enough to limit
/// occupancy for the kernel, so constraints can be relaxed to improve ILP in
/// other regions.
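///
/// The set of stages depends on the chosen strategy:
/// GCNMaxOccupancySchedStrategy runs OccInitialSchedule,
/// UnclusteredHighRPReschedule, ClusteredLowOccupancyReschedule, and
/// PreRARematerialize, while GCNMaxILPSchedStrategy runs only
/// ILPInitialSchedule.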
///
//===----------------------------------------------------------------------===//

#include "GCNSchedStrategy.h"
#include "AMDGPUIGroupLP.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/RegisterClassInfo.h"

#define DEBUG_TYPE "machine-scheduler"

using namespace llvm;

static cl::opt<bool> DisableUnclusterHighRP(
    "amdgpu-disable-unclustered-high-rp-reschedule", cl::Hidden,
    cl::desc("Disable unclustered high register pressure "
             "reduction scheduling stage."),
    cl::init(false));

static cl::opt<bool> DisableClusteredLowOccupancy(
    "amdgpu-disable-clustered-low-occupancy-reschedule", cl::Hidden,
    cl::desc("Disable clustered low occupancy "
             "rescheduling for ILP scheduling stage."),
    cl::init(false));

static cl::opt<unsigned> ScheduleMetricBias(
    "amdgpu-schedule-metric-bias", cl::Hidden,
    cl::desc(
        "Sets the bias which adds weight to occupancy vs latency. Set it to "
        "100 to chase the occupancy only."),
    cl::init(10));

static cl::opt<bool>
    RelaxedOcc("amdgpu-schedule-relaxed-occupancy", cl::Hidden,
               cl::desc("Relax occupancy targets for kernels which are memory "
                        "bound (amdgpu-membound-threshold), or "
                        "Wave Limited (amdgpu-limit-wave-threshold)."),
               cl::init(false));

const unsigned ScheduleMetrics::ScaleFactor = 100;

GCNSchedStrategy::GCNSchedStrategy(const MachineSchedContext *C)
    : GenericScheduler(C), TargetOccupancy(0), MF(nullptr),
      HasHighPressure(false) {}

void GCNSchedStrategy::initialize(ScheduleDAGMI *DAG) {
  GenericScheduler::initialize(DAG);

  MF = &DAG->MF;

  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();

  SGPRExcessLimit =
      Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::SGPR_32RegClass);
  VGPRExcessLimit =
      Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::VGPR_32RegClass);

  SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
  // Set the initial TargetOccupancy to the maximum occupancy that we can
  // achieve for this function. This effectively sets a lower bound on the
  // 'Critical' register limits in the scheduler.
  // Allow for lower occupancy targets if the kernel is wave limited or memory
  // bound, and the relaxed occupancy feature is in use.
  TargetOccupancy =
      RelaxedOcc ? MFI.getMinAllowedOccupancy() : MFI.getOccupancy();
  SGPRCriticalLimit =
      std::min(ST.getMaxNumSGPRs(TargetOccupancy, true), SGPRExcessLimit);

  if (!KnownExcessRP) {
    VGPRCriticalLimit =
        std::min(ST.getMaxNumVGPRs(TargetOccupancy), VGPRExcessLimit);
  } else {
    // This is similar to the ST.getMaxNumVGPRs(TargetOccupancy) result,
    // except it returns a reasonably small number for targets with lots of
    // VGPRs, such as GFX10 and GFX11.
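    // For example, with 256 addressable VGPRs, an allocation granule of 4, and
    // a target occupancy of 8 waves, the budget below would be
    // alignDown(256 / 8, 4) = 32 VGPRs. (Illustrative values; the actual
    // numbers are subtarget dependent.)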
    LLVM_DEBUG(dbgs() << "Region is known to spill, use alternative "
                         "VGPRCriticalLimit calculation method.\n");

    unsigned Granule = AMDGPU::IsaInfo::getVGPRAllocGranule(&ST);
    unsigned Addressable = AMDGPU::IsaInfo::getAddressableNumVGPRs(&ST);
    unsigned VGPRBudget = alignDown(Addressable / TargetOccupancy, Granule);
    VGPRBudget = std::max(VGPRBudget, Granule);
    VGPRCriticalLimit = std::min(VGPRBudget, VGPRExcessLimit);
  }

  // Subtract error margin and bias from register limits and avoid overflow.
  SGPRCriticalLimit -= std::min(SGPRLimitBias + ErrorMargin, SGPRCriticalLimit);
  VGPRCriticalLimit -= std::min(VGPRLimitBias + ErrorMargin, VGPRCriticalLimit);
  SGPRExcessLimit -= std::min(SGPRLimitBias + ErrorMargin, SGPRExcessLimit);
  VGPRExcessLimit -= std::min(VGPRLimitBias + ErrorMargin, VGPRExcessLimit);

  LLVM_DEBUG(dbgs() << "VGPRCriticalLimit = " << VGPRCriticalLimit
                    << ", VGPRExcessLimit = " << VGPRExcessLimit
                    << ", SGPRCriticalLimit = " << SGPRCriticalLimit
                    << ", SGPRExcessLimit = " << SGPRExcessLimit << "\n\n");
}

void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
                                     bool AtTop,
                                     const RegPressureTracker &RPTracker,
                                     const SIRegisterInfo *SRI,
                                     unsigned SGPRPressure,
                                     unsigned VGPRPressure) {
  Cand.SU = SU;
  Cand.AtTop = AtTop;

  if (!DAG->isTrackingPressure())
    return;

  // getDownwardPressure() and getUpwardPressure() make temporary changes to
  // the tracker, so we need to pass those functions a non-const copy.
  RegPressureTracker &TempTracker = const_cast<RegPressureTracker &>(RPTracker);

  Pressure.clear();
  MaxPressure.clear();

  if (AtTop)
    TempTracker.getDownwardPressure(SU->getInstr(), Pressure, MaxPressure);
  else {
    // FIXME: I think for bottom-up scheduling, the register pressure is cached
    // and can be retrieved by DAG->getPressureDiff(SU).
    TempTracker.getUpwardPressure(SU->getInstr(), Pressure, MaxPressure);
  }

  unsigned NewSGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32];
  unsigned NewVGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32];

  // If two instructions increase the pressure of different register sets
  // by the same amount, the generic scheduler will prefer to schedule the
  // instruction that increases the set with the least amount of registers,
  // which in our case would be SGPRs. This is rarely what we want, so
  // when we report excess/critical register pressure, we do it either
  // only for VGPRs or only for SGPRs.

  // FIXME: Better heuristics to determine whether to prefer SGPRs or VGPRs.
  const unsigned MaxVGPRPressureInc = 16;
  bool ShouldTrackVGPRs = VGPRPressure + MaxVGPRPressureInc >= VGPRExcessLimit;
  bool ShouldTrackSGPRs = !ShouldTrackVGPRs && SGPRPressure >= SGPRExcessLimit;

  // FIXME: We have to enter REG-EXCESS before we reach the actual threshold
  // to increase the likelihood we don't go over the limits. We should improve
  // the analysis to look through dependencies to find the path with the least
  // register pressure.

  // We only need to update the RPDelta for instructions that increase register
  // pressure. Instructions that decrease or keep reg pressure the same will be
  // marked as RegExcess in tryCandidate() when they are compared with
  // instructions that increase the register pressure.
  if (ShouldTrackVGPRs && NewVGPRPressure >= VGPRExcessLimit) {
    HasHighPressure = true;
    Cand.RPDelta.Excess = PressureChange(AMDGPU::RegisterPressureSets::VGPR_32);
    Cand.RPDelta.Excess.setUnitInc(NewVGPRPressure - VGPRExcessLimit);
  }

  if (ShouldTrackSGPRs && NewSGPRPressure >= SGPRExcessLimit) {
    HasHighPressure = true;
    Cand.RPDelta.Excess = PressureChange(AMDGPU::RegisterPressureSets::SReg_32);
    Cand.RPDelta.Excess.setUnitInc(NewSGPRPressure - SGPRExcessLimit);
  }

  // Register pressure is considered 'CRITICAL' if it is approaching a value
  // that would reduce the wave occupancy for the execution unit. When
  // register pressure is 'CRITICAL', increasing SGPR and VGPR pressure has
  // the same cost, so we don't need to prefer one over the other.

  int SGPRDelta = NewSGPRPressure - SGPRCriticalLimit;
  int VGPRDelta = NewVGPRPressure - VGPRCriticalLimit;

  if (SGPRDelta >= 0 || VGPRDelta >= 0) {
    HasHighPressure = true;
    if (SGPRDelta > VGPRDelta) {
      Cand.RPDelta.CriticalMax =
          PressureChange(AMDGPU::RegisterPressureSets::SReg_32);
      Cand.RPDelta.CriticalMax.setUnitInc(SGPRDelta);
    } else {
      Cand.RPDelta.CriticalMax =
          PressureChange(AMDGPU::RegisterPressureSets::VGPR_32);
      Cand.RPDelta.CriticalMax.setUnitInc(VGPRDelta);
    }
  }
}

// This function is mostly cut and pasted from
// GenericScheduler::pickNodeFromQueue()
void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
                                         const CandPolicy &ZonePolicy,
                                         const RegPressureTracker &RPTracker,
                                         SchedCandidate &Cand) {
  const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(TRI);
  ArrayRef<unsigned> Pressure = RPTracker.getRegSetPressureAtPos();
  unsigned SGPRPressure = 0;
  unsigned VGPRPressure = 0;
  if (DAG->isTrackingPressure()) {
    SGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32];
    VGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32];
  }
  ReadyQueue &Q = Zone.Available;
  for (SUnit *SU : Q) {

    SchedCandidate TryCand(ZonePolicy);
    initCandidate(TryCand, SU, Zone.isTop(), RPTracker, SRI,
                  SGPRPressure, VGPRPressure);
    // Pass SchedBoundary only when comparing nodes from the same boundary.
    SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr;
    tryCandidate(Cand, TryCand, ZoneArg);
    if (TryCand.Reason != NoCand) {
      // Initialize resource delta if needed in case future heuristics query it.
      if (TryCand.ResDelta == SchedResourceDelta())
        TryCand.initResourceDelta(Zone.DAG, SchedModel);
      Cand.setBest(TryCand);
      LLVM_DEBUG(traceCandidate(Cand));
    }
  }
}

// This function is mostly cut and pasted from
// GenericScheduler::pickNodeBidirectional()
SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode) {
  // Schedule as far as possible in the direction of no choice. This is most
  // efficient, but also provides the best heuristics for CriticalPSets.
  if (SUnit *SU = Bot.pickOnlyChoice()) {
    IsTopNode = false;
    return SU;
  }
  if (SUnit *SU = Top.pickOnlyChoice()) {
    IsTopNode = true;
    return SU;
  }
  // Set the bottom-up policy based on the state of the current bottom zone and
  // the instructions outside the zone, including the top zone.
  CandPolicy BotPolicy;
  setPolicy(BotPolicy, /*IsPostRA=*/false, Bot, &Top);
  // Set the top-down policy based on the state of the current top zone and
  // the instructions outside the zone, including the bottom zone.
  CandPolicy TopPolicy;
  setPolicy(TopPolicy, /*IsPostRA=*/false, Top, &Bot);

  // See if BotCand is still valid (because we previously scheduled from Top).
  LLVM_DEBUG(dbgs() << "Picking from Bot:\n");
  if (!BotCand.isValid() || BotCand.SU->isScheduled ||
      BotCand.Policy != BotPolicy) {
    BotCand.reset(CandPolicy());
    pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), BotCand);
    assert(BotCand.Reason != NoCand && "failed to find the first candidate");
  } else {
    LLVM_DEBUG(traceCandidate(BotCand));
#ifndef NDEBUG
    if (VerifyScheduling) {
      SchedCandidate TCand;
      TCand.reset(CandPolicy());
      pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), TCand);
      assert(TCand.SU == BotCand.SU &&
             "Last pick result should correspond to re-picking right now");
    }
#endif
  }

  // Check if the top Q has a better candidate.
  LLVM_DEBUG(dbgs() << "Picking from Top:\n");
  if (!TopCand.isValid() || TopCand.SU->isScheduled ||
      TopCand.Policy != TopPolicy) {
    TopCand.reset(CandPolicy());
    pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TopCand);
    assert(TopCand.Reason != NoCand && "failed to find the first candidate");
  } else {
    LLVM_DEBUG(traceCandidate(TopCand));
#ifndef NDEBUG
    if (VerifyScheduling) {
      SchedCandidate TCand;
      TCand.reset(CandPolicy());
      pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TCand);
      assert(TCand.SU == TopCand.SU &&
             "Last pick result should correspond to re-picking right now");
    }
#endif
  }

  // Pick best from BotCand and TopCand.
  LLVM_DEBUG(dbgs() << "Top Cand: "; traceCandidate(TopCand);
             dbgs() << "Bot Cand: "; traceCandidate(BotCand););
  SchedCandidate Cand = BotCand;
  TopCand.Reason = NoCand;
  tryCandidate(Cand, TopCand, nullptr);
  if (TopCand.Reason != NoCand) {
    Cand.setBest(TopCand);
  }
  LLVM_DEBUG(dbgs() << "Picking: "; traceCandidate(Cand););

  IsTopNode = Cand.AtTop;
  return Cand.SU;
}

// This function is mostly cut and pasted from
// GenericScheduler::pickNode()
SUnit *GCNSchedStrategy::pickNode(bool &IsTopNode) {
  if (DAG->top() == DAG->bottom()) {
    assert(Top.Available.empty() && Top.Pending.empty() &&
           Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage");
    return nullptr;
  }
  SUnit *SU;
  do {
    if (RegionPolicy.OnlyTopDown) {
      SU = Top.pickOnlyChoice();
      if (!SU) {
        CandPolicy NoPolicy;
        TopCand.reset(NoPolicy);
        pickNodeFromQueue(Top, NoPolicy, DAG->getTopRPTracker(), TopCand);
        assert(TopCand.Reason != NoCand && "failed to find a candidate");
        SU = TopCand.SU;
      }
      IsTopNode = true;
    } else if (RegionPolicy.OnlyBottomUp) {
      SU = Bot.pickOnlyChoice();
      if (!SU) {
        CandPolicy NoPolicy;
        BotCand.reset(NoPolicy);
        pickNodeFromQueue(Bot, NoPolicy, DAG->getBotRPTracker(), BotCand);
        assert(BotCand.Reason != NoCand && "failed to find a candidate");
        SU = BotCand.SU;
      }
      IsTopNode = false;
    } else {
      SU = pickNodeBidirectional(IsTopNode);
    }
  } while (SU->isScheduled);

  if (SU->isTopReady())
    Top.removeReady(SU);
  if (SU->isBottomReady())
    Bot.removeReady(SU);

  LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") "
                    << *SU->getInstr());
  return SU;
}

GCNSchedStageID GCNSchedStrategy::getCurrentStage() {
  assert(CurrentStage && CurrentStage != SchedStages.end());
  return *CurrentStage;
}

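// Advance to the next scheduling stage: the first stage on the initial call,
// the following stage on each subsequent call. Returns false once all stages
// have been exhausted.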
bool GCNSchedStrategy::advanceStage() {
  assert(CurrentStage != SchedStages.end());
  if (!CurrentStage)
    CurrentStage = SchedStages.begin();
  else
    CurrentStage++;

  return CurrentStage != SchedStages.end();
}

bool GCNSchedStrategy::hasNextStage() const {
  assert(CurrentStage);
  return std::next(CurrentStage) != SchedStages.end();
}

GCNSchedStageID GCNSchedStrategy::getNextStage() const {
  assert(CurrentStage && std::next(CurrentStage) != SchedStages.end());
  return *std::next(CurrentStage);
}

GCNMaxOccupancySchedStrategy::GCNMaxOccupancySchedStrategy(
    const MachineSchedContext *C)
    : GCNSchedStrategy(C) {
  SchedStages.push_back(GCNSchedStageID::OccInitialSchedule);
  SchedStages.push_back(GCNSchedStageID::UnclusteredHighRPReschedule);
  SchedStages.push_back(GCNSchedStageID::ClusteredLowOccupancyReschedule);
  SchedStages.push_back(GCNSchedStageID::PreRARematerialize);
}

GCNMaxILPSchedStrategy::GCNMaxILPSchedStrategy(const MachineSchedContext *C)
    : GCNSchedStrategy(C) {
  SchedStages.push_back(GCNSchedStageID::ILPInitialSchedule);
}

bool GCNMaxILPSchedStrategy::tryCandidate(SchedCandidate &Cand,
                                          SchedCandidate &TryCand,
                                          SchedBoundary *Zone) const {
  // Initialize the candidate if needed.
  if (!Cand.isValid()) {
    TryCand.Reason = NodeOrder;
    return true;
  }

  // Avoid spilling by exceeding the register limit.
  if (DAG->isTrackingPressure() &&
      tryPressure(TryCand.RPDelta.Excess, Cand.RPDelta.Excess, TryCand, Cand,
                  RegExcess, TRI, DAG->MF))
    return TryCand.Reason != NoCand;

  // Bias PhysReg defs and copies towards their uses and definitions,
  // respectively.
  if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop),
                 biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg))
    return TryCand.Reason != NoCand;

  bool SameBoundary = Zone != nullptr;
  if (SameBoundary) {
    // Prioritize instructions that read unbuffered resources by stall cycles.
    if (tryLess(Zone->getLatencyStallCycles(TryCand.SU),
                Zone->getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall))
      return TryCand.Reason != NoCand;

    // Avoid critical resource consumption and balance the schedule.
    TryCand.initResourceDelta(DAG, SchedModel);
    if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources,
                TryCand, Cand, ResourceReduce))
      return TryCand.Reason != NoCand;
    if (tryGreater(TryCand.ResDelta.DemandedResources,
                   Cand.ResDelta.DemandedResources, TryCand, Cand,
                   ResourceDemand))
      return TryCand.Reason != NoCand;

    // Unconditionally try to reduce latency.
    if (tryLatency(TryCand, Cand, *Zone))
      return TryCand.Reason != NoCand;

    // Weak edges are for clustering and other constraints.
    if (tryLess(getWeakLeft(TryCand.SU, TryCand.AtTop),
                getWeakLeft(Cand.SU, Cand.AtTop), TryCand, Cand, Weak))
      return TryCand.Reason != NoCand;
  }

  // Keep clustered nodes together to encourage downstream peephole
  // optimizations which may reduce resource requirements.
  //
  // This is a best effort to set things up for a post-RA pass. Optimizations
  // like generating loads of multiple registers should ideally be done within
  // the scheduler pass by combining the loads during DAG postprocessing.
  const SUnit *CandNextClusterSU =
      Cand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred();
  const SUnit *TryCandNextClusterSU =
      TryCand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred();
  if (tryGreater(TryCand.SU == TryCandNextClusterSU,
                 Cand.SU == CandNextClusterSU, TryCand, Cand, Cluster))
    return TryCand.Reason != NoCand;

  // Avoid increasing the max critical pressure in the scheduled region.
  if (DAG->isTrackingPressure() &&
      tryPressure(TryCand.RPDelta.CriticalMax, Cand.RPDelta.CriticalMax,
                  TryCand, Cand, RegCritical, TRI, DAG->MF))
    return TryCand.Reason != NoCand;

  // Avoid increasing the max pressure of the entire region.
  if (DAG->isTrackingPressure() &&
      tryPressure(TryCand.RPDelta.CurrentMax, Cand.RPDelta.CurrentMax, TryCand,
                  Cand, RegMax, TRI, DAG->MF))
    return TryCand.Reason != NoCand;

  if (SameBoundary) {
    // Fall through to original instruction order.
    if ((Zone->isTop() && TryCand.SU->NodeNum < Cand.SU->NodeNum) ||
        (!Zone->isTop() && TryCand.SU->NodeNum > Cand.SU->NodeNum)) {
      TryCand.Reason = NodeOrder;
      return true;
    }
  }
  return false;
}

GCNScheduleDAGMILive::GCNScheduleDAGMILive(
    MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S)
    : ScheduleDAGMILive(C, std::move(S)), ST(MF.getSubtarget<GCNSubtarget>()),
      MFI(*MF.getInfo<SIMachineFunctionInfo>()),
      StartingOccupancy(MFI.getOccupancy()), MinOccupancy(StartingOccupancy) {

  LLVM_DEBUG(dbgs() << "Starting occupancy is " << StartingOccupancy << ".\n");
  if (RelaxedOcc) {
    MinOccupancy = std::min(MFI.getMinAllowedOccupancy(), StartingOccupancy);
    if (MinOccupancy != StartingOccupancy)
      LLVM_DEBUG(dbgs() << "Allowing Occupancy drops to " << MinOccupancy
                        << ".\n");
  }
}

std::unique_ptr<GCNSchedStage>
GCNScheduleDAGMILive::createSchedStage(GCNSchedStageID SchedStageID) {
  switch (SchedStageID) {
  case GCNSchedStageID::OccInitialSchedule:
    return std::make_unique<OccInitialScheduleStage>(SchedStageID, *this);
  case GCNSchedStageID::UnclusteredHighRPReschedule:
    return std::make_unique<UnclusteredHighRPStage>(SchedStageID, *this);
  case GCNSchedStageID::ClusteredLowOccupancyReschedule:
    return std::make_unique<ClusteredLowOccStage>(SchedStageID, *this);
  case GCNSchedStageID::PreRARematerialize:
    return std::make_unique<PreRARematStage>(SchedStageID, *this);
  case GCNSchedStageID::ILPInitialSchedule:
    return std::make_unique<ILPInitialScheduleStage>(SchedStageID, *this);
  }

  llvm_unreachable("Unknown SchedStageID.");
}

void GCNScheduleDAGMILive::schedule() {
  // Collect all scheduling regions. The actual scheduling is performed in
  // GCNScheduleDAGMILive::finalizeSchedule.
  Regions.push_back(std::pair(RegionBegin, RegionEnd));
}

GCNRegPressure
GCNScheduleDAGMILive::getRealRegPressure(unsigned RegionIdx) const {
  GCNDownwardRPTracker RPTracker(*LIS);
  RPTracker.advance(begin(), end(), &LiveIns[RegionIdx]);
  return RPTracker.moveMaxPressure();
}

void GCNScheduleDAGMILive::computeBlockPressure(unsigned RegionIdx,
                                                const MachineBasicBlock *MBB) {
  GCNDownwardRPTracker RPTracker(*LIS);

  // If the block has only one successor, the live-ins of that successor are
  // the live-outs of the current block. We can reuse the calculated live set
  // if the successor will be sent to scheduling after the current block.

  // However, due to a bug in LiveInterval analysis it may happen that two
  // predecessors of the same successor block have different lane bitmasks for
  // a live-out register. Work around that by sticking to a one-to-one
  // relationship, i.e. one predecessor with one successor block.
  const MachineBasicBlock *OnlySucc = nullptr;
  if (MBB->succ_size() == 1) {
    auto *Candidate = *MBB->succ_begin();
    if (!Candidate->empty() && Candidate->pred_size() == 1) {
      SlotIndexes *Ind = LIS->getSlotIndexes();
      if (Ind->getMBBStartIdx(MBB) < Ind->getMBBStartIdx(Candidate))
        OnlySucc = Candidate;
    }
  }

  // Scheduler sends regions from the end of the block upwards.
  size_t CurRegion = RegionIdx;
  for (size_t E = Regions.size(); CurRegion != E; ++CurRegion)
    if (Regions[CurRegion].first->getParent() != MBB)
      break;
  --CurRegion;

  auto I = MBB->begin();
  auto LiveInIt = MBBLiveIns.find(MBB);
  auto &Rgn = Regions[CurRegion];
  auto *NonDbgMI = &*skipDebugInstructionsForward(Rgn.first, Rgn.second);
  if (LiveInIt != MBBLiveIns.end()) {
    auto LiveIn = std::move(LiveInIt->second);
    RPTracker.reset(*MBB->begin(), &LiveIn);
    MBBLiveIns.erase(LiveInIt);
  } else {
    I = Rgn.first;
    auto LRS = BBLiveInMap.lookup(NonDbgMI);
#ifdef EXPENSIVE_CHECKS
    assert(isEqual(getLiveRegsBefore(*NonDbgMI, *LIS), LRS));
#endif
    RPTracker.reset(*I, &LRS);
  }

  for (;;) {
    I = RPTracker.getNext();

    if (Regions[CurRegion].first == I || NonDbgMI == I) {
      LiveIns[CurRegion] = RPTracker.getLiveRegs();
      RPTracker.clearMaxPressure();
    }

    if (Regions[CurRegion].second == I) {
      Pressure[CurRegion] = RPTracker.moveMaxPressure();
      if (CurRegion-- == RegionIdx)
        break;
    }
    RPTracker.advanceToNext();
    RPTracker.advanceBeforeNext();
  }

  if (OnlySucc) {
    if (I != MBB->end()) {
      RPTracker.advanceToNext();
      RPTracker.advance(MBB->end());
    }
    RPTracker.advanceBeforeNext();
    MBBLiveIns[OnlySucc] = RPTracker.moveLiveRegs();
  }
}

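// Map the first non-debug instruction of each block containing scheduling
// regions to the set of registers live there. The result is used to seed the
// downward RP tracker in computeBlockPressure().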
DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet>
GCNScheduleDAGMILive::getBBLiveInMap() const {
  assert(!Regions.empty());
  std::vector<MachineInstr *> BBStarters;
  BBStarters.reserve(Regions.size());
  auto I = Regions.rbegin(), E = Regions.rend();
  auto *BB = I->first->getParent();
  do {
    auto *MI = &*skipDebugInstructionsForward(I->first, I->second);
    BBStarters.push_back(MI);
    do {
      ++I;
    } while (I != E && I->first->getParent() == BB);
  } while (I != E);
  return getLiveRegMap(BBStarters, false /*After*/, *LIS);
}

void GCNScheduleDAGMILive::finalizeSchedule() {
  // Start actual scheduling here. This function is called by the base
  // MachineScheduler after all regions have been recorded by
  // GCNScheduleDAGMILive::schedule().
  LiveIns.resize(Regions.size());
  Pressure.resize(Regions.size());
  RescheduleRegions.resize(Regions.size());
  RegionsWithHighRP.resize(Regions.size());
  RegionsWithExcessRP.resize(Regions.size());
  RegionsWithMinOcc.resize(Regions.size());
  RegionsWithIGLPInstrs.resize(Regions.size());
  RescheduleRegions.set();
  RegionsWithHighRP.reset();
  RegionsWithExcessRP.reset();
  RegionsWithMinOcc.reset();
  RegionsWithIGLPInstrs.reset();

  runSchedStages();
}

void GCNScheduleDAGMILive::runSchedStages() {
  LLVM_DEBUG(dbgs() << "All regions recorded, starting actual scheduling.\n");

  if (!Regions.empty())
    BBLiveInMap = getBBLiveInMap();

  GCNSchedStrategy &S = static_cast<GCNSchedStrategy &>(*SchedImpl);
  while (S.advanceStage()) {
    auto Stage = createSchedStage(S.getCurrentStage());
    if (!Stage->initGCNSchedStage())
      continue;

    for (auto Region : Regions) {
      RegionBegin = Region.first;
      RegionEnd = Region.second;
      // Setup for scheduling the region and check whether it should be
      // skipped.
      if (!Stage->initGCNRegion()) {
        Stage->advanceRegion();
        exitRegion();
        continue;
      }

      ScheduleDAGMILive::schedule();
      Stage->finalizeGCNRegion();
    }

    Stage->finalizeGCNSchedStage();
  }
}

#ifndef NDEBUG
raw_ostream &llvm::operator<<(raw_ostream &OS, const GCNSchedStageID &StageID) {
  switch (StageID) {
  case GCNSchedStageID::OccInitialSchedule:
    OS << "Max Occupancy Initial Schedule";
    break;
  case GCNSchedStageID::UnclusteredHighRPReschedule:
    OS << "Unclustered High Register Pressure Reschedule";
    break;
  case GCNSchedStageID::ClusteredLowOccupancyReschedule:
    OS << "Clustered Low Occupancy Reschedule";
    break;
  case GCNSchedStageID::PreRARematerialize:
    OS << "Pre-RA Rematerialize";
    break;
  case GCNSchedStageID::ILPInitialSchedule:
    OS << "Max ILP Initial Schedule";
    break;
  }

  return OS;
}
#endif

GCNSchedStage::GCNSchedStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG)
    : DAG(DAG), S(static_cast<GCNSchedStrategy &>(*DAG.SchedImpl)), MF(DAG.MF),
      MFI(DAG.MFI), ST(DAG.ST), StageID(StageID) {}

bool GCNSchedStage::initGCNSchedStage() {
  if (!DAG.LIS)
    return false;

  LLVM_DEBUG(dbgs() << "Starting scheduling stage: " << StageID << "\n");
  return true;
}

bool UnclusteredHighRPStage::initGCNSchedStage() {
  if (DisableUnclusterHighRP)
    return false;

  if (!GCNSchedStage::initGCNSchedStage())
    return false;

  if (DAG.RegionsWithHighRP.none() && DAG.RegionsWithExcessRP.none())
    return false;

  SavedMutations.swap(DAG.Mutations);
  DAG.addMutation(
      createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::PreRAReentry));

  InitialOccupancy = DAG.MinOccupancy;
  // Aggressively try to reduce register pressure in the unclustered high RP
  // stage. Temporarily increase the occupancy target in the region.
  S.SGPRLimitBias = S.HighRPSGPRBias;
  S.VGPRLimitBias = S.HighRPVGPRBias;
  if (MFI.getMaxWavesPerEU() > DAG.MinOccupancy)
    MFI.increaseOccupancy(MF, ++DAG.MinOccupancy);

  LLVM_DEBUG(
      dbgs()
      << "Retrying function scheduling without clustering. "
         "Aggressively try to reduce register pressure to achieve occupancy "
      << DAG.MinOccupancy << ".\n");

  return true;
}

bool ClusteredLowOccStage::initGCNSchedStage() {
  if (DisableClusteredLowOccupancy)
    return false;

  if (!GCNSchedStage::initGCNSchedStage())
    return false;

  // Don't bother trying to improve ILP in lower RP regions if occupancy has
  // not been dropped. All regions will have already been scheduled with the
  // ideal occupancy targets.
  if (DAG.StartingOccupancy <= DAG.MinOccupancy)
    return false;

  LLVM_DEBUG(
      dbgs() << "Retrying function scheduling with lowest recorded occupancy "
             << DAG.MinOccupancy << ".\n");
  return true;
}

bool PreRARematStage::initGCNSchedStage() {
  if (!GCNSchedStage::initGCNSchedStage())
    return false;

  if (DAG.RegionsWithMinOcc.none() || DAG.Regions.size() == 1)
    return false;

  const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
  // Check maximum occupancy.
  if (ST.computeOccupancy(MF.getFunction(), MFI.getLDSSize()) ==
      DAG.MinOccupancy)
    return false;

  // FIXME: This pass will invalidate cached MBBLiveIns for regions in between
  // the defs and the region we sink the def to. Cached pressure for regions
  // where a def is sunk from will also be invalidated. This will need to be
  // fixed if there is another pass after this pass.
  assert(!S.hasNextStage());

  collectRematerializableInstructions();
  if (RematerializableInsts.empty() || !sinkTriviallyRematInsts(ST, TII))
    return false;

  LLVM_DEBUG(
      dbgs() << "Retrying function scheduling with improved occupancy of "
             << DAG.MinOccupancy << " from rematerializing\n");
  return true;
}

void GCNSchedStage::finalizeGCNSchedStage() {
  DAG.finishBlock();
  LLVM_DEBUG(dbgs() << "Ending scheduling stage: " << StageID << "\n");
}

void UnclusteredHighRPStage::finalizeGCNSchedStage() {
  SavedMutations.swap(DAG.Mutations);
  S.SGPRLimitBias = S.VGPRLimitBias = 0;
  if (DAG.MinOccupancy > InitialOccupancy) {
    for (unsigned IDX = 0; IDX < DAG.Pressure.size(); ++IDX)
      DAG.RegionsWithMinOcc[IDX] =
          DAG.Pressure[IDX].getOccupancy(DAG.ST) == DAG.MinOccupancy;

    LLVM_DEBUG(dbgs() << StageID
                      << " stage successfully increased occupancy to "
                      << DAG.MinOccupancy << '\n');
  }

  GCNSchedStage::finalizeGCNSchedStage();
}

bool GCNSchedStage::initGCNRegion() {
  // Check whether this new region is also a new block.
  if (DAG.RegionBegin->getParent() != CurrentMBB)
    setupNewBlock();

  unsigned NumRegionInstrs = std::distance(DAG.begin(), DAG.end());
  DAG.enterRegion(CurrentMBB, DAG.begin(), DAG.end(), NumRegionInstrs);

  // Skip empty scheduling regions (0 or 1 schedulable instructions).
  if (DAG.begin() == DAG.end() || DAG.begin() == std::prev(DAG.end()))
    return false;

  LLVM_DEBUG(dbgs() << "********** MI Scheduling **********\n");
  LLVM_DEBUG(dbgs() << MF.getName() << ":" << printMBBReference(*CurrentMBB)
                    << " " << CurrentMBB->getName()
                    << "\n From: " << *DAG.begin() << " To: ";
             if (DAG.RegionEnd != CurrentMBB->end()) dbgs() << *DAG.RegionEnd;
             else dbgs() << "End";
             dbgs() << " RegionInstrs: " << NumRegionInstrs << '\n');

  // Save original instruction order before scheduling for possible revert.
  Unsched.clear();
  Unsched.reserve(DAG.NumRegionInstrs);
  if (StageID == GCNSchedStageID::OccInitialSchedule ||
      StageID == GCNSchedStageID::ILPInitialSchedule) {
    for (auto &I : DAG) {
      Unsched.push_back(&I);
      if (I.getOpcode() == AMDGPU::SCHED_GROUP_BARRIER ||
          I.getOpcode() == AMDGPU::IGLP_OPT)
        DAG.RegionsWithIGLPInstrs[RegionIdx] = true;
    }
  } else {
    for (auto &I : DAG)
      Unsched.push_back(&I);
  }

  PressureBefore = DAG.Pressure[RegionIdx];

  LLVM_DEBUG(
      dbgs() << "Pressure before scheduling:\nRegion live-ins:"
             << print(DAG.LiveIns[RegionIdx], DAG.MRI)
             << "Region live-in pressure: "
             << print(llvm::getRegPressure(DAG.MRI, DAG.LiveIns[RegionIdx]))
             << "Region register pressure: " << print(PressureBefore));

  S.HasHighPressure = false;
  S.KnownExcessRP = isRegionWithExcessRP();

  if (DAG.RegionsWithIGLPInstrs[RegionIdx] &&
      StageID != GCNSchedStageID::UnclusteredHighRPReschedule) {
    SavedMutations.clear();
    SavedMutations.swap(DAG.Mutations);
    bool IsInitialStage = StageID == GCNSchedStageID::OccInitialSchedule ||
                          StageID == GCNSchedStageID::ILPInitialSchedule;
    DAG.addMutation(createIGroupLPDAGMutation(
        IsInitialStage ? AMDGPU::SchedulingPhase::Initial
                       : AMDGPU::SchedulingPhase::PreRAReentry));
  }

  return true;
}

bool UnclusteredHighRPStage::initGCNRegion() {
  // Only reschedule regions with the minimum occupancy or regions that may
  // have spilling (excess register pressure).
  if ((!DAG.RegionsWithMinOcc[RegionIdx] ||
       DAG.MinOccupancy <= InitialOccupancy) &&
      !DAG.RegionsWithExcessRP[RegionIdx])
    return false;

  return GCNSchedStage::initGCNRegion();
}

bool ClusteredLowOccStage::initGCNRegion() {
  // We may need to reschedule this region if it wasn't rescheduled in the last
  // stage, or if we found it was testing critical register pressure limits in
  // the unclustered reschedule stage. The latter is because we may not have
  // been able to raise the min occupancy in the previous stage, so the region
  // may be overly constrained even if it was already rescheduled.
  if (!DAG.RegionsWithHighRP[RegionIdx])
    return false;

  return GCNSchedStage::initGCNRegion();
}

bool PreRARematStage::initGCNRegion() {
  if (!DAG.RescheduleRegions[RegionIdx])
    return false;

  return GCNSchedStage::initGCNRegion();
}

void GCNSchedStage::setupNewBlock() {
  if (CurrentMBB)
    DAG.finishBlock();

  CurrentMBB = DAG.RegionBegin->getParent();
  DAG.startBlock(CurrentMBB);
  // Get real RP for the region if it hasn't been calculated before. After the
  // initial schedule stage real RP will be collected after scheduling.
  if (StageID == GCNSchedStageID::OccInitialSchedule ||
      StageID == GCNSchedStageID::ILPInitialSchedule)
    DAG.computeBlockPressure(RegionIdx, CurrentMBB);
}

void GCNSchedStage::finalizeGCNRegion() {
  DAG.Regions[RegionIdx] = std::pair(DAG.RegionBegin, DAG.RegionEnd);
  DAG.RescheduleRegions[RegionIdx] = false;
  if (S.HasHighPressure)
    DAG.RegionsWithHighRP[RegionIdx] = true;

  // Revert scheduling if we have dropped occupancy or there is some other
  // reason that the original schedule is better.
  checkScheduling();

  if (DAG.RegionsWithIGLPInstrs[RegionIdx] &&
      StageID != GCNSchedStageID::UnclusteredHighRPReschedule)
    SavedMutations.swap(DAG.Mutations);

  DAG.exitRegion();
  RegionIdx++;
}

void GCNSchedStage::checkScheduling() {
  // Check the results of scheduling.
  PressureAfter = DAG.getRealRegPressure(RegionIdx);
  LLVM_DEBUG(dbgs() << "Pressure after scheduling: " << print(PressureAfter));
  LLVM_DEBUG(dbgs() << "Region: " << RegionIdx << ".\n");

  if (PressureAfter.getSGPRNum() <= S.SGPRCriticalLimit &&
      PressureAfter.getVGPRNum(ST.hasGFX90AInsts()) <= S.VGPRCriticalLimit) {
    DAG.Pressure[RegionIdx] = PressureAfter;
    DAG.RegionsWithMinOcc[RegionIdx] =
        PressureAfter.getOccupancy(ST) == DAG.MinOccupancy;

    // Early out if we have achieved the occupancy target.
    LLVM_DEBUG(dbgs() << "Pressure in desired limits, done.\n");
    return;
  }

  unsigned TargetOccupancy =
      std::min(S.getTargetOccupancy(), ST.getOccupancyWithLocalMemSize(MF));
  unsigned WavesAfter =
      std::min(TargetOccupancy, PressureAfter.getOccupancy(ST));
  unsigned WavesBefore =
      std::min(TargetOccupancy, PressureBefore.getOccupancy(ST));
  LLVM_DEBUG(dbgs() << "Occupancy before scheduling: " << WavesBefore
                    << ", after " << WavesAfter << ".\n");

  // We may not be able to keep the current target occupancy because of the
  // just scheduled region. We might still be able to revert scheduling if the
  // occupancy before was higher, or if the current schedule has register
  // pressure higher than the excess limits, which could lead to more spilling.
  unsigned NewOccupancy = std::max(WavesAfter, WavesBefore);

  // Allow memory bound functions to drop to 4 waves if not limited by an
  // attribute.
  if (WavesAfter < WavesBefore && WavesAfter < DAG.MinOccupancy &&
      WavesAfter >= MFI.getMinAllowedOccupancy()) {
    LLVM_DEBUG(dbgs() << "Function is memory bound, allow occupancy drop up to "
                      << MFI.getMinAllowedOccupancy() << " waves\n");
    NewOccupancy = WavesAfter;
  }

  if (NewOccupancy < DAG.MinOccupancy) {
    DAG.MinOccupancy = NewOccupancy;
    MFI.limitOccupancy(DAG.MinOccupancy);
    DAG.RegionsWithMinOcc.reset();
    LLVM_DEBUG(dbgs() << "Occupancy lowered for the function to "
                      << DAG.MinOccupancy << ".\n");
  }
  // The maximum number of arch VGPRs on a non-unified register file, or the
  // maximum VGPR + AGPR in the unified register file case.
  unsigned MaxVGPRs = ST.getMaxNumVGPRs(MF);
  // The maximum number of arch VGPRs for both unified and non-unified register
  // files.
  unsigned MaxArchVGPRs = std::min(MaxVGPRs, ST.getAddressableNumArchVGPRs());
  unsigned MaxSGPRs = ST.getMaxNumSGPRs(MF);

  if (PressureAfter.getVGPRNum(ST.hasGFX90AInsts()) > MaxVGPRs ||
      PressureAfter.getVGPRNum(false) > MaxArchVGPRs ||
      PressureAfter.getAGPRNum() > MaxArchVGPRs ||
      PressureAfter.getSGPRNum() > MaxSGPRs) {
    DAG.RescheduleRegions[RegionIdx] = true;
    DAG.RegionsWithHighRP[RegionIdx] = true;
    DAG.RegionsWithExcessRP[RegionIdx] = true;
  }

  // Revert if this region's schedule would cause a drop in occupancy or
  // spilling.
  if (shouldRevertScheduling(WavesAfter)) {
    revertScheduling();
  } else {
    DAG.Pressure[RegionIdx] = PressureAfter;
    DAG.RegionsWithMinOcc[RegionIdx] =
        PressureAfter.getOccupancy(ST) == DAG.MinOccupancy;
  }
}

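// Compute the cycle at which SU becomes ready: the maximum, over its register
// dependence predecessors, of the predecessor's ready cycle plus the defining
// instruction's latency. The result is memoized in ReadyCycles, keyed by
// SUnit number.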
unsigned
GCNSchedStage::computeSUnitReadyCycle(const SUnit &SU, unsigned CurrCycle,
                                      DenseMap<unsigned, unsigned> &ReadyCycles,
                                      const TargetSchedModel &SM) {
  unsigned ReadyCycle = CurrCycle;
  for (auto &D : SU.Preds) {
    if (D.isAssignedRegDep()) {
      MachineInstr *DefMI = D.getSUnit()->getInstr();
      unsigned Latency = SM.computeInstrLatency(DefMI);
      unsigned DefReady = ReadyCycles[DAG.getSUnit(DefMI)->NodeNum];
      ReadyCycle = std::max(ReadyCycle, DefReady + Latency);
    }
  }
  ReadyCycles[SU.NodeNum] = ReadyCycle;
  return ReadyCycle;
}

#ifndef NDEBUG
struct EarlierIssuingCycle {
  bool operator()(std::pair<MachineInstr *, unsigned> A,
                  std::pair<MachineInstr *, unsigned> B) const {
    return A.second < B.second;
  }
};

static void printScheduleModel(std::set<std::pair<MachineInstr *, unsigned>,
                                        EarlierIssuingCycle> &ReadyCycles) {
  if (ReadyCycles.empty())
    return;
  unsigned BBNum = ReadyCycles.begin()->first->getParent()->getNumber();
  dbgs() << "\n################## Schedule time ReadyCycles for MBB : " << BBNum
         << " ##################\n# Cycle #\t\t\tInstruction\n";
  unsigned IPrev = 1;
  for (auto &I : ReadyCycles) {
    if (I.second > IPrev + 1)
      dbgs() << "****************************** BUBBLE OF " << I.second - IPrev
             << " CYCLES DETECTED ******************************\n\n";
    dbgs() << "[ " << I.second << " ] : " << *I.first << "\n";
    IPrev = I.second;
  }
}
#endif

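// Walk the schedule in order, accumulating "bubbles": cycles in which an
// instruction's operands are not yet ready. The resulting metric relates the
// bubble count to the total schedule length (SumBubbles scaled by
// ScheduleMetrics::ScaleFactor over CurrCycle, as in the debug output below),
// so a lower value means a denser schedule.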
ScheduleMetrics
GCNSchedStage::getScheduleMetrics(const std::vector<SUnit> &InputSchedule) {
#ifndef NDEBUG
  std::set<std::pair<MachineInstr *, unsigned>, EarlierIssuingCycle>
      ReadyCyclesSorted;
#endif
  const TargetSchedModel &SM = ST.getInstrInfo()->getSchedModel();
  unsigned SumBubbles = 0;
  DenseMap<unsigned, unsigned> ReadyCycles;
  unsigned CurrCycle = 0;
  for (auto &SU : InputSchedule) {
    unsigned ReadyCycle =
        computeSUnitReadyCycle(SU, CurrCycle, ReadyCycles, SM);
    SumBubbles += ReadyCycle - CurrCycle;
#ifndef NDEBUG
    ReadyCyclesSorted.insert(std::make_pair(SU.getInstr(), ReadyCycle));
#endif
    CurrCycle = ++ReadyCycle;
  }
#ifndef NDEBUG
  LLVM_DEBUG(
      printScheduleModel(ReadyCyclesSorted);
      dbgs() << "\n\t"
             << "Metric: "
             << (SumBubbles
                     ? (SumBubbles * ScheduleMetrics::ScaleFactor) / CurrCycle
                     : 1)
             << "\n\n");
#endif

  return ScheduleMetrics(CurrCycle, SumBubbles);
}

ScheduleMetrics
GCNSchedStage::getScheduleMetrics(const GCNScheduleDAGMILive &DAG) {
#ifndef NDEBUG
  std::set<std::pair<MachineInstr *, unsigned>, EarlierIssuingCycle>
      ReadyCyclesSorted;
#endif
  const TargetSchedModel &SM = ST.getInstrInfo()->getSchedModel();
  unsigned SumBubbles = 0;
  DenseMap<unsigned, unsigned> ReadyCycles;
  unsigned CurrCycle = 0;
  for (auto &MI : DAG) {
    SUnit *SU = DAG.getSUnit(&MI);
    if (!SU)
      continue;
    unsigned ReadyCycle =
        computeSUnitReadyCycle(*SU, CurrCycle, ReadyCycles, SM);
    SumBubbles += ReadyCycle - CurrCycle;
#ifndef NDEBUG
    ReadyCyclesSorted.insert(std::make_pair(SU->getInstr(), ReadyCycle));
#endif
    CurrCycle = ++ReadyCycle;
  }
#ifndef NDEBUG
  LLVM_DEBUG(
      printScheduleModel(ReadyCyclesSorted);
      dbgs() << "\n\t"
             << "Metric: "
             << (SumBubbles
                     ? (SumBubbles * ScheduleMetrics::ScaleFactor) / CurrCycle
                     : 1)
             << "\n\n");
#endif

  return ScheduleMetrics(CurrCycle, SumBubbles);
}

bool GCNSchedStage::shouldRevertScheduling(unsigned WavesAfter) {
  if (WavesAfter < DAG.MinOccupancy)
    return true;

  return false;
}

bool OccInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) {
  if (PressureAfter == PressureBefore)
    return false;

  if (GCNSchedStage::shouldRevertScheduling(WavesAfter))
    return true;

  if (mayCauseSpilling(WavesAfter))
    return true;

  return false;
}

bool UnclusteredHighRPStage::shouldRevertScheduling(unsigned WavesAfter) {
  // If RP is not reduced in the unclustered reschedule stage, revert to the
  // old schedule.
  if ((WavesAfter <= PressureBefore.getOccupancy(ST) &&
       mayCauseSpilling(WavesAfter)) ||
      GCNSchedStage::shouldRevertScheduling(WavesAfter)) {
    LLVM_DEBUG(dbgs() << "Unclustered reschedule did not help.\n");
    return true;
  }

  // Do not attempt to relax the schedule even more if we are already spilling.
  if (isRegionWithExcessRP())
    return false;

  LLVM_DEBUG(
      dbgs()
      << "\n\t *** In shouldRevertScheduling ***\n"
      << " *********** BEFORE UnclusteredHighRPStage ***********\n");
  ScheduleMetrics MBefore = getScheduleMetrics(DAG.SUnits);
  LLVM_DEBUG(
      dbgs()
      << "\n *********** AFTER UnclusteredHighRPStage ***********\n");
  ScheduleMetrics MAfter = getScheduleMetrics(DAG);
  unsigned OldMetric = MBefore.getMetric();
  unsigned NewMetric = MAfter.getMetric();
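  // The new schedule is kept only if the weighted "profit" at least breaks
  // even: Profit = (WavesAfter / WavesBefore) * ((OldMetric + Bias) /
  // NewMetric), evaluated in fixed point with ScaleFactor = 100. As an
  // illustrative example with WavesBefore = WavesAfter = 4, OldMetric = 50,
  // NewMetric = 70, and the default bias of 10:
  // Profit = (100 * 6000 / 70) / 100 = 85, which is below ScaleFactor, so the
  // reschedule would be reverted.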
  unsigned WavesBefore =
      std::min(S.getTargetOccupancy(), PressureBefore.getOccupancy(ST));
  unsigned Profit =
      ((WavesAfter * ScheduleMetrics::ScaleFactor) / WavesBefore *
       ((OldMetric + ScheduleMetricBias) * ScheduleMetrics::ScaleFactor) /
       NewMetric) /
      ScheduleMetrics::ScaleFactor;
  LLVM_DEBUG(dbgs() << "\tMetric before " << MBefore << "\tMetric after "
                    << MAfter << "Profit: " << Profit << "\n");
  return Profit < ScheduleMetrics::ScaleFactor;
}

bool ClusteredLowOccStage::shouldRevertScheduling(unsigned WavesAfter) {
  if (PressureAfter == PressureBefore)
    return false;

  if (GCNSchedStage::shouldRevertScheduling(WavesAfter))
    return true;

  if (mayCauseSpilling(WavesAfter))
    return true;

  return false;
}

bool PreRARematStage::shouldRevertScheduling(unsigned WavesAfter) {
  if (GCNSchedStage::shouldRevertScheduling(WavesAfter))
    return true;

  if (mayCauseSpilling(WavesAfter))
    return true;

  return false;
}

bool ILPInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) {
  if (mayCauseSpilling(WavesAfter))
    return true;

  return false;
}

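// Heuristic: the schedule is assumed to risk spilling when occupancy is at
// the function's minimum waves per EU, the region already exceeds a pressure
// limit, and the new pressure is not strictly less than the old one.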
bool GCNSchedStage::mayCauseSpilling(unsigned WavesAfter) {
  if (WavesAfter <= MFI.getMinWavesPerEU() && isRegionWithExcessRP() &&
      !PressureAfter.less(MF, PressureBefore)) {
    LLVM_DEBUG(dbgs() << "New pressure will result in more spilling.\n");
    return true;
  }

  return false;
}

void GCNSchedStage::revertScheduling() {
  DAG.RegionsWithMinOcc[RegionIdx] =
      PressureBefore.getOccupancy(ST) == DAG.MinOccupancy;
  LLVM_DEBUG(dbgs() << "Attempting to revert scheduling.\n");
  DAG.RescheduleRegions[RegionIdx] =
      S.hasNextStage() &&
      S.getNextStage() != GCNSchedStageID::UnclusteredHighRPReschedule;
  DAG.RegionEnd = DAG.RegionBegin;
  int SkippedDebugInstr = 0;
  for (MachineInstr *MI : Unsched) {
    if (MI->isDebugInstr()) {
      ++SkippedDebugInstr;
      continue;
    }

    if (MI->getIterator() != DAG.RegionEnd) {
      DAG.BB->remove(MI);
      DAG.BB->insert(DAG.RegionEnd, MI);
      if (!MI->isDebugInstr())
        DAG.LIS->handleMove(*MI, true);
    }

    // Reset read-undef flags and update them later.
    for (auto &Op : MI->all_defs())
      Op.setIsUndef(false);
    RegisterOperands RegOpers;
    RegOpers.collect(*MI, *DAG.TRI, DAG.MRI, DAG.ShouldTrackLaneMasks, false);
    if (!MI->isDebugInstr()) {
      if (DAG.ShouldTrackLaneMasks) {
        // Adjust liveness and add missing dead+read-undef flags.
        SlotIndex SlotIdx = DAG.LIS->getInstructionIndex(*MI).getRegSlot();
        RegOpers.adjustLaneLiveness(*DAG.LIS, DAG.MRI, SlotIdx, MI);
      } else {
        // Adjust for missing dead-def flags.
        RegOpers.detectDeadDefs(*MI, *DAG.LIS);
      }
    }
    DAG.RegionEnd = MI->getIterator();
    ++DAG.RegionEnd;
    LLVM_DEBUG(dbgs() << "Scheduling " << *MI);
  }

  // After reverting the schedule, debug instrs will now be at the end of the
  // block and RegionEnd will point to the first debug instr. Increment
  // RegionEnd past debug instrs to the actual end of the scheduling region.
  while (SkippedDebugInstr-- > 0)
    ++DAG.RegionEnd;

  // If Unsched.front() instruction is a debug instruction, this will actually
  // shrink the region since we moved all debug instructions to the end of the
  // block. Find the first instruction that is not a debug instruction.
  DAG.RegionBegin = Unsched.front()->getIterator();
  if (DAG.RegionBegin->isDebugInstr()) {
    for (MachineInstr *MI : Unsched) {
      if (MI->isDebugInstr())
        continue;
      DAG.RegionBegin = MI->getIterator();
      break;
    }
  }

  // Then move the debug instructions back into their correct place and set
  // RegionBegin and RegionEnd if needed.
  DAG.placeDebugValues();

  DAG.Regions[RegionIdx] = std::pair(DAG.RegionBegin, DAG.RegionEnd);
}

void PreRARematStage::collectRematerializableInstructions() {
  const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(DAG.TRI);
  for (unsigned I = 0, E = DAG.MRI.getNumVirtRegs(); I != E; ++I) {
    Register Reg = Register::index2VirtReg(I);
    if (!DAG.LIS->hasInterval(Reg))
      continue;

    // TODO: Handle AGPR and SGPR rematerialization.
    if (!SRI->isVGPRClass(DAG.MRI.getRegClass(Reg)) ||
        !DAG.MRI.hasOneDef(Reg) || !DAG.MRI.hasOneNonDBGUse(Reg))
      continue;

    MachineOperand *Op = DAG.MRI.getOneDef(Reg);
    MachineInstr *Def = Op->getParent();
    if (Op->getSubReg() != 0 || !isTriviallyReMaterializable(*Def))
      continue;

    MachineInstr *UseI = &*DAG.MRI.use_instr_nodbg_begin(Reg);
    if (Def->getParent() == UseI->getParent())
      continue;

    // We are only collecting defs that are defined in another block and are
    // live-through or used inside regions at MinOccupancy. This means that the
    // register must be in the live-in set for the region.
    bool AddedToRematList = false;
    for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
      auto It = DAG.LiveIns[I].find(Reg);
      if (It != DAG.LiveIns[I].end() && !It->second.none()) {
        if (DAG.RegionsWithMinOcc[I]) {
          RematerializableInsts[I][Def] = UseI;
          AddedToRematList = true;
        }

        // Collect regions with rematerializable reg as live-in to avoid
        // searching later when updating RP.
        RematDefToLiveInRegions[Def].push_back(I);
      }
    }
    if (!AddedToRematList)
      RematDefToLiveInRegions.erase(Def);
  }
}
1326 | |
1327 | bool PreRARematStage::sinkTriviallyRematInsts(const GCNSubtarget &ST, |
1328 | const TargetInstrInfo *TII) { |
1329 | // Temporary copies of cached variables we will be modifying and replacing if |
1330 | // sinking succeeds. |
1331 | SmallVector< |
1332 | std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>, 32> |
1333 | NewRegions; |
1334 | DenseMap<unsigned, GCNRPTracker::LiveRegSet> NewLiveIns; |
1335 | DenseMap<unsigned, GCNRegPressure> NewPressure; |
1336 | BitVector NewRescheduleRegions; |
1337 | LiveIntervals *LIS = DAG.LIS; |
1338 | |
1339 | NewRegions.resize(N: DAG.Regions.size()); |
1340 | NewRescheduleRegions.resize(N: DAG.Regions.size()); |
1341 | |
1342 | // Collect only regions that has a rematerializable def as a live-in. |
1343 | SmallSet<unsigned, 16> ImpactedRegions; |
1344 | for (const auto &It : RematDefToLiveInRegions) |
1345 | ImpactedRegions.insert(I: It.second.begin(), E: It.second.end()); |
1346 | |
1347 | // Make copies of register pressure and live-ins cache that will be updated |
1348 | // as we rematerialize. |
1349 | for (auto Idx : ImpactedRegions) { |
1350 | NewPressure[Idx] = DAG.Pressure[Idx]; |
1351 | NewLiveIns[Idx] = DAG.LiveIns[Idx]; |
1352 | } |
1353 | NewRegions = DAG.Regions; |
1354 | NewRescheduleRegions.reset(); |
1355 | |
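  // Map each rematerialized instruction to the def it replaces so that
  // sinking can be rolled back if occupancy does not improve.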
1356 | DenseMap<MachineInstr *, MachineInstr *> InsertedMIToOldDef; |
1357 | bool Improved = false; |
1358 | for (auto I : ImpactedRegions) { |
1359 | if (!DAG.RegionsWithMinOcc[I]) |
1360 | continue; |
1361 | |
1362 | Improved = false; |
    int VGPRUsage = NewPressure[I].getVGPRNum(ST.hasGFX90AInsts());
1364 | int SGPRUsage = NewPressure[I].getSGPRNum(); |
1365 | |
1366 | // TODO: Handle occupancy drop due to AGPR and SGPR. |
    // Check if the occupancy drop is caused by VGPR usage rather than SGPR
    // usage.
    if (ST.getOccupancyWithNumSGPRs(SGPRUsage) == DAG.MinOccupancy)
1369 | break; |
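    // If a region is SGPR-limited, sinking VGPR defs cannot raise occupancy,
    // so the whole attempt is abandoned rather than just skipping the region.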
1370 | |
1371 | // The occupancy of this region could have been improved by a previous |
1372 | // iteration's sinking of defs. |
1373 | if (NewPressure[I].getOccupancy(ST) > DAG.MinOccupancy) { |
1374 | NewRescheduleRegions[I] = true; |
1375 | Improved = true; |
1376 | continue; |
1377 | } |
1378 | |
    // First check if we have enough trivially rematerializable instructions to
    // improve occupancy. Optimistically assume all instructions we are able to
    // sink decrease RP.
1382 | int TotalSinkableRegs = 0; |
1383 | for (const auto &It : RematerializableInsts[I]) { |
1384 | MachineInstr *Def = It.first; |
      Register DefReg = Def->getOperand(0).getReg();
1386 | TotalSinkableRegs += |
          SIRegisterInfo::getNumCoveredRegs(NewLiveIns[I][DefReg]);
1388 | } |
1389 | int VGPRsAfterSink = VGPRUsage - TotalSinkableRegs; |
    unsigned OptimisticOccupancy = ST.getOccupancyWithNumVGPRs(VGPRsAfterSink);
    // If we cannot improve occupancy even in the most optimistic scenario,
    // then do not attempt to sink any instructions.
1393 | if (OptimisticOccupancy <= DAG.MinOccupancy) |
1394 | break; |
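    // For example (hypothetical numbers): with VGPRUsage = 168 and sinkable
    // defs covering 44 registers, VGPRsAfterSink = 124; sinking is attempted
    // only if the target maps 124 VGPRs to more waves than DAG.MinOccupancy.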
1395 | |
1396 | unsigned ImproveOccupancy = 0; |
1397 | SmallVector<MachineInstr *, 4> SinkedDefs; |
1398 | for (auto &It : RematerializableInsts[I]) { |
1399 | MachineInstr *Def = It.first; |
1400 | MachineBasicBlock::iterator InsertPos = |
1401 | MachineBasicBlock::iterator(It.second); |
      Register Reg = Def->getOperand(0).getReg();
1403 | // Rematerialize MI to its use block. Since we are only rematerializing |
1404 | // instructions that do not have any virtual reg uses, we do not need to |
1405 | // call LiveRangeEdit::allUsesAvailableAt() and |
1406 | // LiveRangeEdit::canRematerializeAt(). |
      TII->reMaterialize(*InsertPos->getParent(), InsertPos, Reg,
                         Def->getOperand(0).getSubReg(), *Def, *DAG.TRI);
      MachineInstr *NewMI = &*std::prev(InsertPos);
      LIS->InsertMachineInstrInMaps(*NewMI);
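      // Reg's cached live interval is stale now that its def has moved;
      // recompute it from scratch.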
1411 | LIS->removeInterval(Reg); |
1412 | LIS->createAndComputeVirtRegInterval(Reg); |
1413 | InsertedMIToOldDef[NewMI] = Def; |
1414 | |
      // Update region boundaries in the scheduling region we sank from, since
      // we may sink an instruction that was at the beginning or end of its
      // region.
      DAG.updateRegionBoundaries(NewRegions, Def, /*NewMI =*/nullptr,
                                 /*Removing =*/true);
1419 | |
      // Update region boundaries in the region we sank to.
      DAG.updateRegionBoundaries(NewRegions, InsertPos, NewMI);
1422 | |
1423 | LaneBitmask PrevMask = NewLiveIns[I][Reg]; |
      // FIXME: Also update cached pressure for where the def was sunk from.
      // Update RP for all regions that have this reg as a live-in and remove
      // the reg from all regions as a live-in.
1427 | for (auto Idx : RematDefToLiveInRegions[Def]) { |
        NewLiveIns[Idx].erase(Reg);
1429 | if (InsertPos->getParent() != DAG.Regions[Idx].first->getParent()) { |
1430 | // Def is live-through and not used in this block. |
          NewPressure[Idx].inc(Reg, PrevMask, LaneBitmask::getNone(), DAG.MRI);
1432 | } else { |
1433 | // Def is used and rematerialized into this block. |
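        // The rematerialized def now lies inside this region, so the cached
        // max pressure cannot be adjusted incrementally; recompute it by
        // walking the region with a downward RP tracker.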
1434 | GCNDownwardRPTracker RPT(*LIS); |
        auto *NonDbgMI = &*skipDebugInstructionsForward(
            NewRegions[Idx].first, NewRegions[Idx].second);
        RPT.reset(*NonDbgMI, &NewLiveIns[Idx]);
        RPT.advance(NewRegions[Idx].second);
1439 | NewPressure[Idx] = RPT.moveMaxPressure(); |
1440 | } |
1441 | } |
1442 | |
      SinkedDefs.push_back(Def);
1444 | ImproveOccupancy = NewPressure[I].getOccupancy(ST); |
1445 | if (ImproveOccupancy > DAG.MinOccupancy) |
1446 | break; |
1447 | } |
1448 | |
    // Remove defs we just sank from all regions' lists of sinkable defs.
1450 | for (auto &Def : SinkedDefs) |
1451 | for (auto TrackedIdx : RematDefToLiveInRegions[Def]) |
        RematerializableInsts[TrackedIdx].erase(Def);
1453 | |
1454 | if (ImproveOccupancy <= DAG.MinOccupancy) |
1455 | break; |
1456 | |
1457 | NewRescheduleRegions[I] = true; |
1458 | Improved = true; |
1459 | } |
1460 | |
1461 | if (!Improved) { |
    // Occupancy was not improved for every region that was at MinOccupancy.
    // Undo sinking and remove newly rematerialized instructions.
1464 | for (auto &Entry : InsertedMIToOldDef) { |
1465 | MachineInstr *MI = Entry.first; |
1466 | MachineInstr *OldMI = Entry.second; |
      Register Reg = MI->getOperand(0).getReg();
      LIS->RemoveMachineInstrFromMaps(*MI);
1469 | MI->eraseFromParent(); |
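      // The original def may have been marked dead while its rematerialized
      // copy existed; clear the dead flag before recomputing the interval.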
1470 | OldMI->clearRegisterDeads(Reg); |
1471 | LIS->removeInterval(Reg); |
1472 | LIS->createAndComputeVirtRegInterval(Reg); |
1473 | } |
1474 | return false; |
1475 | } |
1476 | |
1477 | // Occupancy was improved for all regions. |
1478 | for (auto &Entry : InsertedMIToOldDef) { |
1479 | MachineInstr *MI = Entry.first; |
1480 | MachineInstr *OldMI = Entry.second; |
1481 | |
1482 | // Remove OldMI from BBLiveInMap since we are sinking it from its MBB. |
    DAG.BBLiveInMap.erase(OldMI);
1484 | |
    // Remove OldMI and update LIS.
    Register Reg = MI->getOperand(0).getReg();
    LIS->RemoveMachineInstrFromMaps(*OldMI);
1488 | OldMI->eraseFromParent(); |
1489 | LIS->removeInterval(Reg); |
1490 | LIS->createAndComputeVirtRegInterval(Reg); |
1491 | } |
1492 | |
1493 | // Update live-ins, register pressure, and regions caches. |
1494 | for (auto Idx : ImpactedRegions) { |
1495 | DAG.LiveIns[Idx] = NewLiveIns[Idx]; |
1496 | DAG.Pressure[Idx] = NewPressure[Idx]; |
    DAG.MBBLiveIns.erase(DAG.Regions[Idx].first->getParent());
1498 | } |
1499 | DAG.Regions = NewRegions; |
1500 | DAG.RescheduleRegions = NewRescheduleRegions; |
1501 | |
1502 | SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>(); |
  MFI.increaseOccupancy(MF, ++DAG.MinOccupancy);
1504 | |
1505 | return true; |
1506 | } |
1507 | |
1508 | // Copied from MachineLICM |
1509 | bool PreRARematStage::isTriviallyReMaterializable(const MachineInstr &MI) { |
1510 | if (!DAG.TII->isTriviallyReMaterializable(MI)) |
1511 | return false; |
1512 | |
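  // Reject any virtual register uses: without LiveRangeEdit we cannot verify
  // that those values are still available at the insertion point.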
1513 | for (const MachineOperand &MO : MI.all_uses()) |
1514 | if (MO.getReg().isVirtual()) |
1515 | return false; |
1516 | |
1517 | return true; |
1518 | } |
1519 | |
1520 | // When removing, we will have to check both beginning and ending of the region. |
1521 | // When inserting, we will only have to check if we are inserting NewMI in front |
1522 | // of a scheduling region and do not need to check the ending since we will only |
1523 | // ever be inserting before an already existing MI. |
1524 | void GCNScheduleDAGMILive::updateRegionBoundaries( |
1525 | SmallVectorImpl<std::pair<MachineBasicBlock::iterator, |
1526 | MachineBasicBlock::iterator>> &RegionBoundaries, |
1527 | MachineBasicBlock::iterator MI, MachineInstr *NewMI, bool Removing) { |
1528 | unsigned I = 0, E = RegionBoundaries.size(); |
  // Search for the first region of the block where MI is located.
1530 | while (I != E && MI->getParent() != RegionBoundaries[I].first->getParent()) |
1531 | ++I; |
1532 | |
1533 | for (; I != E; ++I) { |
1534 | if (MI->getParent() != RegionBoundaries[I].first->getParent()) |
1535 | return; |
1536 | |
1537 | if (Removing && MI == RegionBoundaries[I].first && |
1538 | MI == RegionBoundaries[I].second) { |
      // MI is in a region of size 1; after removing it, the region will have
      // size 0. Set RegionBegin and RegionEnd to the past-the-end iterator of
      // the block.
1541 | RegionBoundaries[I] = |
1542 | std::pair(MI->getParent()->end(), MI->getParent()->end()); |
1543 | return; |
1544 | } |
1545 | if (MI == RegionBoundaries[I].first) { |
1546 | if (Removing) |
        RegionBoundaries[I] =
            std::pair(std::next(MI), RegionBoundaries[I].second);
1549 | else |
        // Inserted NewMI in front of the region; set new RegionBegin to NewMI.
1551 | RegionBoundaries[I] = std::pair(MachineBasicBlock::iterator(NewMI), |
1552 | RegionBoundaries[I].second); |
1553 | return; |
1554 | } |
1555 | if (Removing && MI == RegionBoundaries[I].second) { |
      RegionBoundaries[I] = std::pair(RegionBoundaries[I].first, std::prev(MI));
1557 | return; |
1558 | } |
1559 | } |
1560 | } |
1561 | |
1562 | static bool hasIGLPInstrs(ScheduleDAGInstrs *DAG) { |
1563 | return std::any_of( |
      DAG->begin(), DAG->end(), [](MachineBasicBlock::iterator MI) {
1565 | unsigned Opc = MI->getOpcode(); |
1566 | return Opc == AMDGPU::SCHED_GROUP_BARRIER || Opc == AMDGPU::IGLP_OPT; |
1567 | }); |
1568 | } |
1569 | |
1570 | GCNPostScheduleDAGMILive::GCNPostScheduleDAGMILive( |
1571 | MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S, |
1572 | bool RemoveKillFlags) |
1573 | : ScheduleDAGMI(C, std::move(S), RemoveKillFlags) {} |
1574 | |
1575 | void GCNPostScheduleDAGMILive::schedule() { |
  HasIGLPInstrs = hasIGLPInstrs(this);
1577 | if (HasIGLPInstrs) { |
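    // Swap out the current mutations and apply the IGLP mutation for this
    // scheduling pass; finalizeSchedule() swaps the originals back in.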
1578 | SavedMutations.clear(); |
    SavedMutations.swap(Mutations);
    addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::PostRA));
1581 | } |
1582 | |
1583 | ScheduleDAGMI::schedule(); |
1584 | } |
1585 | |
1586 | void GCNPostScheduleDAGMILive::finalizeSchedule() { |
1587 | if (HasIGLPInstrs) |
    SavedMutations.swap(Mutations);
1589 | |
1590 | ScheduleDAGMI::finalizeSchedule(); |
1591 | } |
1592 | |