1 | //===-- R600MachineScheduler.cpp - R600 Scheduler Interface -*- C++ -*-----===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | /// \file |
10 | /// R600 Machine Scheduler interface |
11 | // |
12 | //===----------------------------------------------------------------------===// |
13 | |
14 | #include "R600MachineScheduler.h" |
15 | #include "MCTargetDesc/R600MCTargetDesc.h" |
16 | #include "R600Subtarget.h" |
17 | |
18 | using namespace llvm; |
19 | |
20 | #define DEBUG_TYPE "machine-scheduler" |
21 | |
22 | void R600SchedStrategy::initialize(ScheduleDAGMI *dag) { |
23 | assert(dag->hasVRegLiveness() && "R600SchedStrategy needs vreg liveness" ); |
24 | DAG = static_cast<ScheduleDAGMILive*>(dag); |
25 | const R600Subtarget &ST = DAG->MF.getSubtarget<R600Subtarget>(); |
26 | TII = static_cast<const R600InstrInfo*>(DAG->TII); |
27 | TRI = static_cast<const R600RegisterInfo*>(DAG->TRI); |
28 | VLIW5 = !ST.hasCaymanISA(); |
29 | MRI = &DAG->MRI; |
30 | CurInstKind = IDOther; |
31 | CurEmitted = 0; |
32 | OccupiedSlotsMask = 31; |
33 | InstKindLimit[IDAlu] = TII->getMaxAlusPerClause(); |
34 | InstKindLimit[IDOther] = 32; |
35 | InstKindLimit[IDFetch] = ST.getTexVTXClauseSize(); |
36 | AluInstCount = 0; |
37 | FetchInstCount = 0; |
38 | } |
39 | |
40 | void R600SchedStrategy::MoveUnits(std::vector<SUnit *> &QSrc, |
41 | std::vector<SUnit *> &QDst) |
42 | { |
43 | llvm::append_range(C&: QDst, R&: QSrc); |
44 | QSrc.clear(); |
45 | } |
46 | |
47 | static unsigned getWFCountLimitedByGPR(unsigned GPRCount) { |
48 | assert (GPRCount && "GPRCount cannot be 0" ); |
49 | return 248 / GPRCount; |
50 | } |
51 | |
52 | SUnit* R600SchedStrategy::pickNode(bool &IsTopNode) { |
53 | SUnit *SU = nullptr; |
54 | NextInstKind = IDOther; |
55 | |
56 | IsTopNode = false; |
57 | |
58 | // check if we might want to switch current clause type |
59 | bool AllowSwitchToAlu = (CurEmitted >= InstKindLimit[CurInstKind]) || |
60 | (Available[CurInstKind].empty()); |
61 | bool AllowSwitchFromAlu = (CurEmitted >= InstKindLimit[CurInstKind]) && |
62 | (!Available[IDFetch].empty() || !Available[IDOther].empty()); |
63 | |
64 | if (CurInstKind == IDAlu && !Available[IDFetch].empty()) { |
65 | // We use the heuristic provided by AMD Accelerated Parallel Processing |
66 | // OpenCL Programming Guide : |
67 | // The approx. number of WF that allows TEX inst to hide ALU inst is : |
68 | // 500 (cycles for TEX) / (AluFetchRatio * 8 (cycles for ALU)) |
69 | float ALUFetchRationEstimate = |
70 | (AluInstCount + AvailablesAluCount() + Pending[IDAlu].size()) / |
71 | (FetchInstCount + Available[IDFetch].size()); |
72 | if (ALUFetchRationEstimate == 0) { |
73 | AllowSwitchFromAlu = true; |
74 | } else { |
75 | unsigned NeededWF = 62.5f / ALUFetchRationEstimate; |
76 | LLVM_DEBUG(dbgs() << NeededWF << " approx. Wavefronts Required\n" ); |
77 | // We assume the local GPR requirements to be "dominated" by the requirement |
78 | // of the TEX clause (which consumes 128 bits regs) ; ALU inst before and |
79 | // after TEX are indeed likely to consume or generate values from/for the |
80 | // TEX clause. |
81 | // Available[IDFetch].size() * 2 : GPRs required in the Fetch clause |
82 | // We assume that fetch instructions are either TnXYZW = TEX TnXYZW (need |
83 | // one GPR) or TmXYZW = TnXYZW (need 2 GPR). |
84 | // (TODO : use RegisterPressure) |
85 | // If we are going too use too many GPR, we flush Fetch instruction to lower |
86 | // register pressure on 128 bits regs. |
87 | unsigned NearRegisterRequirement = 2 * Available[IDFetch].size(); |
88 | if (NeededWF > getWFCountLimitedByGPR(GPRCount: NearRegisterRequirement)) |
89 | AllowSwitchFromAlu = true; |
90 | } |
91 | } |
92 | |
93 | if (!SU && ((AllowSwitchToAlu && CurInstKind != IDAlu) || |
94 | (!AllowSwitchFromAlu && CurInstKind == IDAlu))) { |
95 | // try to pick ALU |
96 | SU = pickAlu(); |
97 | if (!SU && !PhysicalRegCopy.empty()) { |
98 | SU = PhysicalRegCopy.front(); |
99 | PhysicalRegCopy.erase(position: PhysicalRegCopy.begin()); |
100 | } |
101 | if (SU) { |
102 | if (CurEmitted >= InstKindLimit[IDAlu]) |
103 | CurEmitted = 0; |
104 | NextInstKind = IDAlu; |
105 | } |
106 | } |
107 | |
108 | if (!SU) { |
109 | // try to pick FETCH |
110 | SU = pickOther(QID: IDFetch); |
111 | if (SU) |
112 | NextInstKind = IDFetch; |
113 | } |
114 | |
115 | // try to pick other |
116 | if (!SU) { |
117 | SU = pickOther(QID: IDOther); |
118 | if (SU) |
119 | NextInstKind = IDOther; |
120 | } |
121 | |
122 | LLVM_DEBUG(if (SU) { |
123 | dbgs() << " ** Pick node **\n" ; |
124 | DAG->dumpNode(*SU); |
125 | } else { |
126 | dbgs() << "NO NODE \n" ; |
127 | for (const SUnit &S : DAG->SUnits) |
128 | if (!S.isScheduled) |
129 | DAG->dumpNode(S); |
130 | }); |
131 | |
132 | return SU; |
133 | } |
134 | |
135 | void R600SchedStrategy::schedNode(SUnit *SU, bool IsTopNode) { |
136 | if (NextInstKind != CurInstKind) { |
137 | LLVM_DEBUG(dbgs() << "Instruction Type Switch\n" ); |
138 | if (NextInstKind != IDAlu) |
139 | OccupiedSlotsMask |= 31; |
140 | CurEmitted = 0; |
141 | CurInstKind = NextInstKind; |
142 | } |
143 | |
144 | if (CurInstKind == IDAlu) { |
145 | AluInstCount ++; |
146 | switch (getAluKind(SU)) { |
147 | case AluT_XYZW: |
148 | CurEmitted += 4; |
149 | break; |
150 | case AluDiscarded: |
151 | break; |
152 | default: { |
153 | ++CurEmitted; |
154 | for (MachineInstr::mop_iterator It = SU->getInstr()->operands_begin(), |
155 | E = SU->getInstr()->operands_end(); It != E; ++It) { |
156 | MachineOperand &MO = *It; |
157 | if (MO.isReg() && MO.getReg() == R600::ALU_LITERAL_X) |
158 | ++CurEmitted; |
159 | } |
160 | } |
161 | } |
162 | } else { |
163 | ++CurEmitted; |
164 | } |
165 | |
166 | LLVM_DEBUG(dbgs() << CurEmitted << " Instructions Emitted in this clause\n" ); |
167 | |
168 | if (CurInstKind != IDFetch) { |
169 | MoveUnits(QSrc&: Pending[IDFetch], QDst&: Available[IDFetch]); |
170 | } else |
171 | FetchInstCount++; |
172 | } |
173 | |
174 | static bool |
175 | isPhysicalRegCopy(MachineInstr *MI) { |
176 | if (MI->getOpcode() != R600::COPY) |
177 | return false; |
178 | |
179 | return !MI->getOperand(i: 1).getReg().isVirtual(); |
180 | } |
181 | |
182 | void R600SchedStrategy::releaseTopNode(SUnit *SU) { |
183 | LLVM_DEBUG(dbgs() << "Top Releasing " ; DAG->dumpNode(*SU)); |
184 | } |
185 | |
186 | void R600SchedStrategy::releaseBottomNode(SUnit *SU) { |
187 | LLVM_DEBUG(dbgs() << "Bottom Releasing " ; DAG->dumpNode(*SU)); |
188 | if (isPhysicalRegCopy(MI: SU->getInstr())) { |
189 | PhysicalRegCopy.push_back(x: SU); |
190 | return; |
191 | } |
192 | |
193 | int IK = getInstKind(SU); |
194 | |
195 | // There is no export clause, we can schedule one as soon as its ready |
196 | if (IK == IDOther) |
197 | Available[IDOther].push_back(x: SU); |
198 | else |
199 | Pending[IK].push_back(x: SU); |
200 | |
201 | } |
202 | |
203 | bool R600SchedStrategy::regBelongsToClass(Register Reg, |
204 | const TargetRegisterClass *RC) const { |
205 | if (!Reg.isVirtual()) { |
206 | return RC->contains(Reg); |
207 | } else { |
208 | return MRI->getRegClass(Reg) == RC; |
209 | } |
210 | } |
211 | |
212 | R600SchedStrategy::AluKind R600SchedStrategy::getAluKind(SUnit *SU) const { |
213 | MachineInstr *MI = SU->getInstr(); |
214 | |
215 | if (TII->isTransOnly(MI: *MI)) |
216 | return AluTrans; |
217 | |
218 | switch (MI->getOpcode()) { |
219 | case R600::PRED_X: |
220 | return AluPredX; |
221 | case R600::INTERP_PAIR_XY: |
222 | case R600::INTERP_PAIR_ZW: |
223 | case R600::INTERP_VEC_LOAD: |
224 | case R600::DOT_4: |
225 | return AluT_XYZW; |
226 | case R600::COPY: |
227 | if (MI->getOperand(i: 1).isUndef()) { |
228 | // MI will become a KILL, don't considers it in scheduling |
229 | return AluDiscarded; |
230 | } |
231 | break; |
232 | default: |
233 | break; |
234 | } |
235 | |
236 | // Does the instruction take a whole IG ? |
237 | // XXX: Is it possible to add a helper function in R600InstrInfo that can |
238 | // be used here and in R600PacketizerList::isSoloInstruction() ? |
239 | if(TII->isVector(MI: *MI) || |
240 | TII->isCubeOp(opcode: MI->getOpcode()) || |
241 | TII->isReductionOp(opcode: MI->getOpcode()) || |
242 | MI->getOpcode() == R600::GROUP_BARRIER) { |
243 | return AluT_XYZW; |
244 | } |
245 | |
246 | if (TII->isLDSInstr(Opcode: MI->getOpcode())) { |
247 | return AluT_X; |
248 | } |
249 | |
250 | // Is the result already assigned to a channel ? |
251 | unsigned DestSubReg = MI->getOperand(i: 0).getSubReg(); |
252 | switch (DestSubReg) { |
253 | case R600::sub0: |
254 | return AluT_X; |
255 | case R600::sub1: |
256 | return AluT_Y; |
257 | case R600::sub2: |
258 | return AluT_Z; |
259 | case R600::sub3: |
260 | return AluT_W; |
261 | default: |
262 | break; |
263 | } |
264 | |
265 | // Is the result already member of a X/Y/Z/W class ? |
266 | Register DestReg = MI->getOperand(i: 0).getReg(); |
267 | if (regBelongsToClass(Reg: DestReg, RC: &R600::R600_TReg32_XRegClass) || |
268 | regBelongsToClass(Reg: DestReg, RC: &R600::R600_AddrRegClass)) |
269 | return AluT_X; |
270 | if (regBelongsToClass(Reg: DestReg, RC: &R600::R600_TReg32_YRegClass)) |
271 | return AluT_Y; |
272 | if (regBelongsToClass(Reg: DestReg, RC: &R600::R600_TReg32_ZRegClass)) |
273 | return AluT_Z; |
274 | if (regBelongsToClass(Reg: DestReg, RC: &R600::R600_TReg32_WRegClass)) |
275 | return AluT_W; |
276 | if (regBelongsToClass(Reg: DestReg, RC: &R600::R600_Reg128RegClass)) |
277 | return AluT_XYZW; |
278 | |
279 | // LDS src registers cannot be used in the Trans slot. |
280 | if (TII->readsLDSSrcReg(MI: *MI)) |
281 | return AluT_XYZW; |
282 | |
283 | return AluAny; |
284 | } |
285 | |
286 | int R600SchedStrategy::getInstKind(SUnit* SU) { |
287 | int Opcode = SU->getInstr()->getOpcode(); |
288 | |
289 | if (TII->usesTextureCache(Opcode) || TII->usesVertexCache(Opcode)) |
290 | return IDFetch; |
291 | |
292 | if (TII->isALUInstr(Opcode)) { |
293 | return IDAlu; |
294 | } |
295 | |
296 | switch (Opcode) { |
297 | case R600::PRED_X: |
298 | case R600::COPY: |
299 | case R600::CONST_COPY: |
300 | case R600::INTERP_PAIR_XY: |
301 | case R600::INTERP_PAIR_ZW: |
302 | case R600::INTERP_VEC_LOAD: |
303 | case R600::DOT_4: |
304 | return IDAlu; |
305 | default: |
306 | return IDOther; |
307 | } |
308 | } |
309 | |
310 | SUnit *R600SchedStrategy::PopInst(std::vector<SUnit *> &Q, bool AnyALU) { |
311 | if (Q.empty()) |
312 | return nullptr; |
313 | for (std::vector<SUnit *>::reverse_iterator It = Q.rbegin(), E = Q.rend(); |
314 | It != E; ++It) { |
315 | SUnit *SU = *It; |
316 | InstructionsGroupCandidate.push_back(x: SU->getInstr()); |
317 | if (TII->fitsConstReadLimitations(InstructionsGroupCandidate) && |
318 | (!AnyALU || !TII->isVectorOnly(MI: *SU->getInstr()))) { |
319 | InstructionsGroupCandidate.pop_back(); |
320 | Q.erase(position: (It + 1).base()); |
321 | return SU; |
322 | } else { |
323 | InstructionsGroupCandidate.pop_back(); |
324 | } |
325 | } |
326 | return nullptr; |
327 | } |
328 | |
329 | void R600SchedStrategy::LoadAlu() { |
330 | std::vector<SUnit *> &QSrc = Pending[IDAlu]; |
331 | for (SUnit *SU : QSrc) { |
332 | AluKind AK = getAluKind(SU); |
333 | AvailableAlus[AK].push_back(x: SU); |
334 | } |
335 | QSrc.clear(); |
336 | } |
337 | |
338 | void R600SchedStrategy::PrepareNextSlot() { |
339 | LLVM_DEBUG(dbgs() << "New Slot\n" ); |
340 | assert(OccupiedSlotsMask && "Slot wasn't filled" ); |
341 | OccupiedSlotsMask = 0; |
342 | // if (HwGen == AMDGPUSubtarget::NORTHERN_ISLANDS) |
343 | // OccupiedSlotsMask |= 16; |
344 | InstructionsGroupCandidate.clear(); |
345 | LoadAlu(); |
346 | } |
347 | |
348 | void R600SchedStrategy::AssignSlot(MachineInstr* MI, unsigned Slot) { |
349 | int DstIndex = TII->getOperandIdx(MI->getOpcode(), R600::OpName::dst); |
350 | if (DstIndex == -1) { |
351 | return; |
352 | } |
353 | Register DestReg = MI->getOperand(i: DstIndex).getReg(); |
354 | // PressureRegister crashes if an operand is def and used in the same inst |
355 | // and we try to constraint its regclass |
356 | for (MachineInstr::mop_iterator It = MI->operands_begin(), |
357 | E = MI->operands_end(); It != E; ++It) { |
358 | MachineOperand &MO = *It; |
359 | if (MO.isReg() && !MO.isDef() && |
360 | MO.getReg() == DestReg) |
361 | return; |
362 | } |
363 | // Constrains the regclass of DestReg to assign it to Slot |
364 | switch (Slot) { |
365 | case 0: |
366 | MRI->constrainRegClass(Reg: DestReg, RC: &R600::R600_TReg32_XRegClass); |
367 | break; |
368 | case 1: |
369 | MRI->constrainRegClass(Reg: DestReg, RC: &R600::R600_TReg32_YRegClass); |
370 | break; |
371 | case 2: |
372 | MRI->constrainRegClass(Reg: DestReg, RC: &R600::R600_TReg32_ZRegClass); |
373 | break; |
374 | case 3: |
375 | MRI->constrainRegClass(Reg: DestReg, RC: &R600::R600_TReg32_WRegClass); |
376 | break; |
377 | } |
378 | } |
379 | |
380 | SUnit *R600SchedStrategy::AttemptFillSlot(unsigned Slot, bool AnyAlu) { |
381 | static const AluKind IndexToID[] = {AluT_X, AluT_Y, AluT_Z, AluT_W}; |
382 | SUnit *SlotedSU = PopInst(Q&: AvailableAlus[IndexToID[Slot]], AnyALU: AnyAlu); |
383 | if (SlotedSU) |
384 | return SlotedSU; |
385 | SUnit *UnslotedSU = PopInst(Q&: AvailableAlus[AluAny], AnyALU: AnyAlu); |
386 | if (UnslotedSU) |
387 | AssignSlot(MI: UnslotedSU->getInstr(), Slot); |
388 | return UnslotedSU; |
389 | } |
390 | |
391 | unsigned R600SchedStrategy::AvailablesAluCount() const { |
392 | return AvailableAlus[AluAny].size() + AvailableAlus[AluT_XYZW].size() + |
393 | AvailableAlus[AluT_X].size() + AvailableAlus[AluT_Y].size() + |
394 | AvailableAlus[AluT_Z].size() + AvailableAlus[AluT_W].size() + |
395 | AvailableAlus[AluTrans].size() + AvailableAlus[AluDiscarded].size() + |
396 | AvailableAlus[AluPredX].size(); |
397 | } |
398 | |
399 | SUnit* R600SchedStrategy::pickAlu() { |
400 | while (AvailablesAluCount() || !Pending[IDAlu].empty()) { |
401 | if (!OccupiedSlotsMask) { |
402 | // Bottom up scheduling : predX must comes first |
403 | if (!AvailableAlus[AluPredX].empty()) { |
404 | OccupiedSlotsMask |= 31; |
405 | return PopInst(Q&: AvailableAlus[AluPredX], AnyALU: false); |
406 | } |
407 | // Flush physical reg copies (RA will discard them) |
408 | if (!AvailableAlus[AluDiscarded].empty()) { |
409 | OccupiedSlotsMask |= 31; |
410 | return PopInst(Q&: AvailableAlus[AluDiscarded], AnyALU: false); |
411 | } |
412 | // If there is a T_XYZW alu available, use it |
413 | if (!AvailableAlus[AluT_XYZW].empty()) { |
414 | OccupiedSlotsMask |= 15; |
415 | return PopInst(Q&: AvailableAlus[AluT_XYZW], AnyALU: false); |
416 | } |
417 | } |
418 | bool TransSlotOccupied = OccupiedSlotsMask & 16; |
419 | if (!TransSlotOccupied && VLIW5) { |
420 | if (!AvailableAlus[AluTrans].empty()) { |
421 | OccupiedSlotsMask |= 16; |
422 | return PopInst(Q&: AvailableAlus[AluTrans], AnyALU: false); |
423 | } |
424 | SUnit *SU = AttemptFillSlot(Slot: 3, AnyAlu: true); |
425 | if (SU) { |
426 | OccupiedSlotsMask |= 16; |
427 | return SU; |
428 | } |
429 | } |
430 | for (int Chan = 3; Chan > -1; --Chan) { |
431 | bool isOccupied = OccupiedSlotsMask & (1 << Chan); |
432 | if (!isOccupied) { |
433 | SUnit *SU = AttemptFillSlot(Slot: Chan, AnyAlu: false); |
434 | if (SU) { |
435 | OccupiedSlotsMask |= (1 << Chan); |
436 | InstructionsGroupCandidate.push_back(x: SU->getInstr()); |
437 | return SU; |
438 | } |
439 | } |
440 | } |
441 | PrepareNextSlot(); |
442 | } |
443 | return nullptr; |
444 | } |
445 | |
446 | SUnit* R600SchedStrategy::pickOther(int QID) { |
447 | SUnit *SU = nullptr; |
448 | std::vector<SUnit *> &AQ = Available[QID]; |
449 | |
450 | if (AQ.empty()) { |
451 | MoveUnits(QSrc&: Pending[QID], QDst&: AQ); |
452 | } |
453 | if (!AQ.empty()) { |
454 | SU = AQ.back(); |
455 | AQ.pop_back(); |
456 | } |
457 | return SU; |
458 | } |
459 | |