1 | //===- SIFixSGPRCopies.cpp - Remove potential VGPR => SGPR copies ---------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | /// \file |
10 | /// Copies from VGPR to SGPR registers are illegal and the register coalescer |
11 | /// will sometimes generate these illegal copies in situations like this: |
12 | /// |
13 | /// Register Class <vsrc> is the union of <vgpr> and <sgpr> |
14 | /// |
15 | /// BB0: |
16 | /// %0 <sgpr> = SCALAR_INST |
17 | /// %1 <vsrc> = COPY %0 <sgpr> |
18 | /// ... |
19 | /// BRANCH %cond BB1, BB2 |
20 | /// BB1: |
21 | /// %2 <vgpr> = VECTOR_INST |
22 | /// %3 <vsrc> = COPY %2 <vgpr> |
23 | /// BB2: |
24 | /// %4 <vsrc> = PHI %1 <vsrc>, <%bb.0>, %3 <vsrc>, <%bb.1> |
25 | /// %5 <vgpr> = VECTOR_INST %4 <vsrc> |
26 | /// |
27 | /// |
28 | /// The coalescer will begin at BB0 and eliminate its copy, then the resulting |
29 | /// code will look like this: |
30 | /// |
31 | /// BB0: |
32 | /// %0 <sgpr> = SCALAR_INST |
33 | /// ... |
34 | /// BRANCH %cond BB1, BB2 |
35 | /// BB1: |
36 | /// %2 <vgpr> = VECTOR_INST |
37 | /// %3 <vsrc> = COPY %2 <vgpr> |
38 | /// BB2: |
39 | /// %4 <sgpr> = PHI %0 <sgpr>, <%bb.0>, %3 <vsrc>, <%bb.1> |
40 | /// %5 <vgpr> = VECTOR_INST %4 <sgpr> |
41 | /// |
42 | /// Now that the result of the PHI instruction is an SGPR, the register |
43 | /// allocator is now forced to constrain the register class of %3 to |
44 | /// <sgpr> so we end up with final code like this: |
45 | /// |
46 | /// BB0: |
47 | /// %0 <sgpr> = SCALAR_INST |
48 | /// ... |
49 | /// BRANCH %cond BB1, BB2 |
50 | /// BB1: |
51 | /// %2 <vgpr> = VECTOR_INST |
52 | /// %3 <sgpr> = COPY %2 <vgpr> |
53 | /// BB2: |
54 | /// %4 <sgpr> = PHI %0 <sgpr>, <%bb.0>, %3 <sgpr>, <%bb.1> |
55 | /// %5 <vgpr> = VECTOR_INST %4 <sgpr> |
56 | /// |
57 | /// Now this code contains an illegal copy from a VGPR to an SGPR. |
58 | /// |
59 | /// In order to avoid this problem, this pass searches for PHI instructions |
60 | /// which define a <vsrc> register and constrains its definition class to |
61 | /// <vgpr> if the user of the PHI's definition register is a vector instruction. |
62 | /// If the PHI's definition class is constrained to <vgpr> then the coalescer |
63 | /// will be unable to perform the COPY removal from the above example which |
64 | /// ultimately led to the creation of an illegal COPY. |
65 | //===----------------------------------------------------------------------===// |
66 | |
67 | #include "AMDGPU.h" |
68 | #include "GCNSubtarget.h" |
69 | #include "MCTargetDesc/AMDGPUMCTargetDesc.h" |
70 | #include "llvm/ADT/SetOperations.h" |
71 | #include "llvm/CodeGen/MachineDominators.h" |
72 | #include "llvm/InitializePasses.h" |
73 | #include "llvm/Target/TargetMachine.h" |
74 | |
75 | using namespace llvm; |
76 | |
77 | #define DEBUG_TYPE "si-fix-sgpr-copies" |
78 | |
79 | static cl::opt<bool> EnableM0Merge( |
80 | "amdgpu-enable-merge-m0" , |
81 | cl::desc("Merge and hoist M0 initializations" ), |
82 | cl::init(Val: true)); |
83 | |
84 | namespace { |
85 | |
86 | class V2SCopyInfo { |
87 | public: |
88 | // VGPR to SGPR copy being processed |
89 | MachineInstr *Copy; |
90 | // All SALU instructions reachable from this copy in SSA graph |
91 | SetVector<MachineInstr *> SChain; |
92 | // Number of SGPR to VGPR copies that are used to put the SALU computation |
93 | // results back to VALU. |
94 | unsigned NumSVCopies; |
95 | |
96 | unsigned Score; |
97 | // Actual count of v_readfirstlane_b32 |
98 | // which need to be inserted to keep SChain SALU |
99 | unsigned NumReadfirstlanes; |
100 | // Current score state. To speedup selection V2SCopyInfos for processing |
101 | bool NeedToBeConvertedToVALU = false; |
102 | // Unique ID. Used as a key for mapping to keep permanent order. |
103 | unsigned ID; |
104 | |
105 | // Count of another VGPR to SGPR copies that contribute to the |
106 | // current copy SChain |
107 | unsigned SiblingPenalty = 0; |
108 | SetVector<unsigned> Siblings; |
109 | V2SCopyInfo() : Copy(nullptr), ID(0){}; |
110 | V2SCopyInfo(unsigned Id, MachineInstr *C, unsigned Width) |
111 | : Copy(C), NumSVCopies(0), NumReadfirstlanes(Width / 32), ID(Id){}; |
112 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
113 | void dump() { |
114 | dbgs() << ID << " : " << *Copy << "\n\tS:" << SChain.size() |
115 | << "\n\tSV:" << NumSVCopies << "\n\tSP: " << SiblingPenalty |
116 | << "\nScore: " << Score << "\n" ; |
117 | } |
118 | #endif |
119 | }; |
120 | |
121 | class SIFixSGPRCopies : public MachineFunctionPass { |
122 | MachineDominatorTree *MDT; |
123 | SmallVector<MachineInstr*, 4> SCCCopies; |
124 | SmallVector<MachineInstr*, 4> RegSequences; |
125 | SmallVector<MachineInstr*, 4> PHINodes; |
126 | SmallVector<MachineInstr*, 4> S2VCopies; |
127 | unsigned NextVGPRToSGPRCopyID; |
128 | MapVector<unsigned, V2SCopyInfo> V2SCopies; |
129 | DenseMap<MachineInstr *, SetVector<unsigned>> SiblingPenalty; |
130 | |
131 | public: |
132 | static char ID; |
133 | |
134 | MachineRegisterInfo *MRI; |
135 | const SIRegisterInfo *TRI; |
136 | const SIInstrInfo *TII; |
137 | |
138 | SIFixSGPRCopies() : MachineFunctionPass(ID), NextVGPRToSGPRCopyID(0) {} |
139 | |
140 | bool runOnMachineFunction(MachineFunction &MF) override; |
141 | void fixSCCCopies(MachineFunction &MF); |
142 | void prepareRegSequenceAndPHIs(MachineFunction &MF); |
143 | unsigned getNextVGPRToSGPRCopyId() { return ++NextVGPRToSGPRCopyID; } |
144 | bool needToBeConvertedToVALU(V2SCopyInfo *I); |
145 | void analyzeVGPRToSGPRCopy(MachineInstr *MI); |
146 | void lowerVGPR2SGPRCopies(MachineFunction &MF); |
147 | // Handles copies which source register is: |
148 | // 1. Physical register |
149 | // 2. AGPR |
150 | // 3. Defined by the instruction the merely moves the immediate |
151 | bool lowerSpecialCase(MachineInstr &MI, MachineBasicBlock::iterator &I); |
152 | |
153 | void processPHINode(MachineInstr &MI); |
154 | |
155 | // Check if MO is an immediate materialized into a VGPR, and if so replace it |
156 | // with an SGPR immediate. The VGPR immediate is also deleted if it does not |
157 | // have any other uses. |
158 | bool tryMoveVGPRConstToSGPR(MachineOperand &MO, Register NewDst, |
159 | MachineBasicBlock *BlockToInsertTo, |
160 | MachineBasicBlock::iterator PointToInsertTo); |
161 | |
162 | StringRef getPassName() const override { return "SI Fix SGPR copies" ; } |
163 | |
164 | void getAnalysisUsage(AnalysisUsage &AU) const override { |
165 | AU.addRequired<MachineDominatorTree>(); |
166 | AU.addPreserved<MachineDominatorTree>(); |
167 | AU.setPreservesCFG(); |
168 | MachineFunctionPass::getAnalysisUsage(AU); |
169 | } |
170 | }; |
171 | |
172 | } // end anonymous namespace |
173 | |
174 | INITIALIZE_PASS_BEGIN(SIFixSGPRCopies, DEBUG_TYPE, |
175 | "SI Fix SGPR copies" , false, false) |
176 | INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) |
177 | INITIALIZE_PASS_END(SIFixSGPRCopies, DEBUG_TYPE, |
178 | "SI Fix SGPR copies" , false, false) |
179 | |
180 | char SIFixSGPRCopies::ID = 0; |
181 | |
182 | char &llvm::SIFixSGPRCopiesID = SIFixSGPRCopies::ID; |
183 | |
184 | FunctionPass *llvm::createSIFixSGPRCopiesPass() { |
185 | return new SIFixSGPRCopies(); |
186 | } |
187 | |
188 | static std::pair<const TargetRegisterClass *, const TargetRegisterClass *> |
189 | getCopyRegClasses(const MachineInstr &Copy, |
190 | const SIRegisterInfo &TRI, |
191 | const MachineRegisterInfo &MRI) { |
192 | Register DstReg = Copy.getOperand(i: 0).getReg(); |
193 | Register SrcReg = Copy.getOperand(i: 1).getReg(); |
194 | |
195 | const TargetRegisterClass *SrcRC = SrcReg.isVirtual() |
196 | ? MRI.getRegClass(Reg: SrcReg) |
197 | : TRI.getPhysRegBaseClass(SrcReg); |
198 | |
199 | // We don't really care about the subregister here. |
200 | // SrcRC = TRI.getSubRegClass(SrcRC, Copy.getOperand(1).getSubReg()); |
201 | |
202 | const TargetRegisterClass *DstRC = DstReg.isVirtual() |
203 | ? MRI.getRegClass(Reg: DstReg) |
204 | : TRI.getPhysRegBaseClass(DstReg); |
205 | |
206 | return std::pair(SrcRC, DstRC); |
207 | } |
208 | |
209 | static bool isVGPRToSGPRCopy(const TargetRegisterClass *SrcRC, |
210 | const TargetRegisterClass *DstRC, |
211 | const SIRegisterInfo &TRI) { |
212 | return SrcRC != &AMDGPU::VReg_1RegClass && TRI.isSGPRClass(RC: DstRC) && |
213 | TRI.hasVectorRegisters(RC: SrcRC); |
214 | } |
215 | |
216 | static bool isSGPRToVGPRCopy(const TargetRegisterClass *SrcRC, |
217 | const TargetRegisterClass *DstRC, |
218 | const SIRegisterInfo &TRI) { |
219 | return DstRC != &AMDGPU::VReg_1RegClass && TRI.isSGPRClass(RC: SrcRC) && |
220 | TRI.hasVectorRegisters(RC: DstRC); |
221 | } |
222 | |
223 | static bool tryChangeVGPRtoSGPRinCopy(MachineInstr &MI, |
224 | const SIRegisterInfo *TRI, |
225 | const SIInstrInfo *TII) { |
226 | MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); |
227 | auto &Src = MI.getOperand(i: 1); |
228 | Register DstReg = MI.getOperand(i: 0).getReg(); |
229 | Register SrcReg = Src.getReg(); |
230 | if (!SrcReg.isVirtual() || !DstReg.isVirtual()) |
231 | return false; |
232 | |
233 | for (const auto &MO : MRI.reg_nodbg_operands(Reg: DstReg)) { |
234 | const auto *UseMI = MO.getParent(); |
235 | if (UseMI == &MI) |
236 | continue; |
237 | if (MO.isDef() || UseMI->getParent() != MI.getParent() || |
238 | UseMI->getOpcode() <= TargetOpcode::GENERIC_OP_END) |
239 | return false; |
240 | |
241 | unsigned OpIdx = MO.getOperandNo(); |
242 | if (OpIdx >= UseMI->getDesc().getNumOperands() || |
243 | !TII->isOperandLegal(MI: *UseMI, OpIdx, MO: &Src)) |
244 | return false; |
245 | } |
246 | // Change VGPR to SGPR destination. |
247 | MRI.setRegClass(Reg: DstReg, RC: TRI->getEquivalentSGPRClass(VRC: MRI.getRegClass(Reg: DstReg))); |
248 | return true; |
249 | } |
250 | |
251 | // Distribute an SGPR->VGPR copy of a REG_SEQUENCE into a VGPR REG_SEQUENCE. |
252 | // |
253 | // SGPRx = ... |
254 | // SGPRy = REG_SEQUENCE SGPRx, sub0 ... |
255 | // VGPRz = COPY SGPRy |
256 | // |
257 | // ==> |
258 | // |
259 | // VGPRx = COPY SGPRx |
260 | // VGPRz = REG_SEQUENCE VGPRx, sub0 |
261 | // |
262 | // This exposes immediate folding opportunities when materializing 64-bit |
263 | // immediates. |
264 | static bool foldVGPRCopyIntoRegSequence(MachineInstr &MI, |
265 | const SIRegisterInfo *TRI, |
266 | const SIInstrInfo *TII, |
267 | MachineRegisterInfo &MRI) { |
268 | assert(MI.isRegSequence()); |
269 | |
270 | Register DstReg = MI.getOperand(i: 0).getReg(); |
271 | if (!TRI->isSGPRClass(RC: MRI.getRegClass(Reg: DstReg))) |
272 | return false; |
273 | |
274 | if (!MRI.hasOneUse(RegNo: DstReg)) |
275 | return false; |
276 | |
277 | MachineInstr &CopyUse = *MRI.use_instr_begin(RegNo: DstReg); |
278 | if (!CopyUse.isCopy()) |
279 | return false; |
280 | |
281 | // It is illegal to have vreg inputs to a physreg defining reg_sequence. |
282 | if (CopyUse.getOperand(i: 0).getReg().isPhysical()) |
283 | return false; |
284 | |
285 | const TargetRegisterClass *SrcRC, *DstRC; |
286 | std::tie(args&: SrcRC, args&: DstRC) = getCopyRegClasses(Copy: CopyUse, TRI: *TRI, MRI); |
287 | |
288 | if (!isSGPRToVGPRCopy(SrcRC, DstRC, TRI: *TRI)) |
289 | return false; |
290 | |
291 | if (tryChangeVGPRtoSGPRinCopy(MI&: CopyUse, TRI, TII)) |
292 | return true; |
293 | |
294 | // TODO: Could have multiple extracts? |
295 | unsigned SubReg = CopyUse.getOperand(i: 1).getSubReg(); |
296 | if (SubReg != AMDGPU::NoSubRegister) |
297 | return false; |
298 | |
299 | MRI.setRegClass(Reg: DstReg, RC: DstRC); |
300 | |
301 | // SGPRx = ... |
302 | // SGPRy = REG_SEQUENCE SGPRx, sub0 ... |
303 | // VGPRz = COPY SGPRy |
304 | |
305 | // => |
306 | // VGPRx = COPY SGPRx |
307 | // VGPRz = REG_SEQUENCE VGPRx, sub0 |
308 | |
309 | MI.getOperand(i: 0).setReg(CopyUse.getOperand(i: 0).getReg()); |
310 | bool IsAGPR = TRI->isAGPRClass(RC: DstRC); |
311 | |
312 | for (unsigned I = 1, N = MI.getNumOperands(); I != N; I += 2) { |
313 | const TargetRegisterClass *SrcRC = |
314 | TRI->getRegClassForOperandReg(MRI, MO: MI.getOperand(i: I)); |
315 | assert(TRI->isSGPRClass(SrcRC) && |
316 | "Expected SGPR REG_SEQUENCE to only have SGPR inputs" ); |
317 | const TargetRegisterClass *NewSrcRC = TRI->getEquivalentVGPRClass(SRC: SrcRC); |
318 | |
319 | Register TmpReg = MRI.createVirtualRegister(RegClass: NewSrcRC); |
320 | |
321 | BuildMI(*MI.getParent(), &MI, MI.getDebugLoc(), TII->get(AMDGPU::COPY), |
322 | TmpReg) |
323 | .add(MI.getOperand(i: I)); |
324 | |
325 | if (IsAGPR) { |
326 | const TargetRegisterClass *NewSrcRC = TRI->getEquivalentAGPRClass(SRC: SrcRC); |
327 | Register TmpAReg = MRI.createVirtualRegister(RegClass: NewSrcRC); |
328 | unsigned Opc = NewSrcRC == &AMDGPU::AGPR_32RegClass ? |
329 | AMDGPU::V_ACCVGPR_WRITE_B32_e64 : AMDGPU::COPY; |
330 | BuildMI(*MI.getParent(), &MI, MI.getDebugLoc(), TII->get(Opc), |
331 | TmpAReg) |
332 | .addReg(TmpReg, RegState::Kill); |
333 | TmpReg = TmpAReg; |
334 | } |
335 | |
336 | MI.getOperand(i: I).setReg(TmpReg); |
337 | } |
338 | |
339 | CopyUse.eraseFromParent(); |
340 | return true; |
341 | } |
342 | |
343 | static bool isSafeToFoldImmIntoCopy(const MachineInstr *Copy, |
344 | const MachineInstr *MoveImm, |
345 | const SIInstrInfo *TII, |
346 | unsigned &SMovOp, |
347 | int64_t &Imm) { |
348 | if (Copy->getOpcode() != AMDGPU::COPY) |
349 | return false; |
350 | |
351 | if (!MoveImm->isMoveImmediate()) |
352 | return false; |
353 | |
354 | const MachineOperand *ImmOp = |
355 | TII->getNamedOperand(*MoveImm, AMDGPU::OpName::src0); |
356 | if (!ImmOp->isImm()) |
357 | return false; |
358 | |
359 | // FIXME: Handle copies with sub-regs. |
360 | if (Copy->getOperand(i: 1).getSubReg()) |
361 | return false; |
362 | |
363 | switch (MoveImm->getOpcode()) { |
364 | default: |
365 | return false; |
366 | case AMDGPU::V_MOV_B32_e32: |
367 | SMovOp = AMDGPU::S_MOV_B32; |
368 | break; |
369 | case AMDGPU::V_MOV_B64_PSEUDO: |
370 | SMovOp = AMDGPU::S_MOV_B64_IMM_PSEUDO; |
371 | break; |
372 | } |
373 | Imm = ImmOp->getImm(); |
374 | return true; |
375 | } |
376 | |
377 | template <class UnaryPredicate> |
378 | bool searchPredecessors(const MachineBasicBlock *MBB, |
379 | const MachineBasicBlock *CutOff, |
380 | UnaryPredicate Predicate) { |
381 | if (MBB == CutOff) |
382 | return false; |
383 | |
384 | DenseSet<const MachineBasicBlock *> Visited; |
385 | SmallVector<MachineBasicBlock *, 4> Worklist(MBB->predecessors()); |
386 | |
387 | while (!Worklist.empty()) { |
388 | MachineBasicBlock *MBB = Worklist.pop_back_val(); |
389 | |
390 | if (!Visited.insert(V: MBB).second) |
391 | continue; |
392 | if (MBB == CutOff) |
393 | continue; |
394 | if (Predicate(MBB)) |
395 | return true; |
396 | |
397 | Worklist.append(in_start: MBB->pred_begin(), in_end: MBB->pred_end()); |
398 | } |
399 | |
400 | return false; |
401 | } |
402 | |
403 | // Checks if there is potential path From instruction To instruction. |
404 | // If CutOff is specified and it sits in between of that path we ignore |
405 | // a higher portion of the path and report it is not reachable. |
406 | static bool isReachable(const MachineInstr *From, |
407 | const MachineInstr *To, |
408 | const MachineBasicBlock *CutOff, |
409 | MachineDominatorTree &MDT) { |
410 | if (MDT.dominates(A: From, B: To)) |
411 | return true; |
412 | |
413 | const MachineBasicBlock *MBBFrom = From->getParent(); |
414 | const MachineBasicBlock *MBBTo = To->getParent(); |
415 | |
416 | // Do predecessor search. |
417 | // We should almost never get here since we do not usually produce M0 stores |
418 | // other than -1. |
419 | return searchPredecessors(MBB: MBBTo, CutOff, Predicate: [MBBFrom] |
420 | (const MachineBasicBlock *MBB) { return MBB == MBBFrom; }); |
421 | } |
422 | |
423 | // Return the first non-prologue instruction in the block. |
424 | static MachineBasicBlock::iterator |
425 | getFirstNonPrologue(MachineBasicBlock *MBB, const TargetInstrInfo *TII) { |
426 | MachineBasicBlock::iterator I = MBB->getFirstNonPHI(); |
427 | while (I != MBB->end() && TII->isBasicBlockPrologue(MI: *I)) |
428 | ++I; |
429 | |
430 | return I; |
431 | } |
432 | |
433 | // Hoist and merge identical SGPR initializations into a common predecessor. |
434 | // This is intended to combine M0 initializations, but can work with any |
435 | // SGPR. A VGPR cannot be processed since we cannot guarantee vector |
436 | // executioon. |
437 | static bool hoistAndMergeSGPRInits(unsigned Reg, |
438 | const MachineRegisterInfo &MRI, |
439 | const TargetRegisterInfo *TRI, |
440 | MachineDominatorTree &MDT, |
441 | const TargetInstrInfo *TII) { |
442 | // List of inits by immediate value. |
443 | using InitListMap = std::map<unsigned, std::list<MachineInstr *>>; |
444 | InitListMap Inits; |
445 | // List of clobbering instructions. |
446 | SmallVector<MachineInstr*, 8> Clobbers; |
447 | // List of instructions marked for deletion. |
448 | SmallSet<MachineInstr*, 8> MergedInstrs; |
449 | |
450 | bool Changed = false; |
451 | |
452 | for (auto &MI : MRI.def_instructions(Reg)) { |
453 | MachineOperand *Imm = nullptr; |
454 | for (auto &MO : MI.operands()) { |
455 | if ((MO.isReg() && ((MO.isDef() && MO.getReg() != Reg) || !MO.isDef())) || |
456 | (!MO.isImm() && !MO.isReg()) || (MO.isImm() && Imm)) { |
457 | Imm = nullptr; |
458 | break; |
459 | } else if (MO.isImm()) |
460 | Imm = &MO; |
461 | } |
462 | if (Imm) |
463 | Inits[Imm->getImm()].push_front(x: &MI); |
464 | else |
465 | Clobbers.push_back(Elt: &MI); |
466 | } |
467 | |
468 | for (auto &Init : Inits) { |
469 | auto &Defs = Init.second; |
470 | |
471 | for (auto I1 = Defs.begin(), E = Defs.end(); I1 != E; ) { |
472 | MachineInstr *MI1 = *I1; |
473 | |
474 | for (auto I2 = std::next(x: I1); I2 != E; ) { |
475 | MachineInstr *MI2 = *I2; |
476 | |
477 | // Check any possible interference |
478 | auto interferes = [&](MachineBasicBlock::iterator From, |
479 | MachineBasicBlock::iterator To) -> bool { |
480 | |
481 | assert(MDT.dominates(&*To, &*From)); |
482 | |
483 | auto interferes = [&MDT, From, To](MachineInstr* &Clobber) -> bool { |
484 | const MachineBasicBlock *MBBFrom = From->getParent(); |
485 | const MachineBasicBlock *MBBTo = To->getParent(); |
486 | bool MayClobberFrom = isReachable(From: Clobber, To: &*From, CutOff: MBBTo, MDT); |
487 | bool MayClobberTo = isReachable(From: Clobber, To: &*To, CutOff: MBBTo, MDT); |
488 | if (!MayClobberFrom && !MayClobberTo) |
489 | return false; |
490 | if ((MayClobberFrom && !MayClobberTo) || |
491 | (!MayClobberFrom && MayClobberTo)) |
492 | return true; |
493 | // Both can clobber, this is not an interference only if both are |
494 | // dominated by Clobber and belong to the same block or if Clobber |
495 | // properly dominates To, given that To >> From, so it dominates |
496 | // both and located in a common dominator. |
497 | return !((MBBFrom == MBBTo && |
498 | MDT.dominates(A: Clobber, B: &*From) && |
499 | MDT.dominates(A: Clobber, B: &*To)) || |
500 | MDT.properlyDominates(A: Clobber->getParent(), B: MBBTo)); |
501 | }; |
502 | |
503 | return (llvm::any_of(Range&: Clobbers, P: interferes)) || |
504 | (llvm::any_of(Range&: Inits, P: [&](InitListMap::value_type &C) { |
505 | return C.first != Init.first && |
506 | llvm::any_of(Range&: C.second, P: interferes); |
507 | })); |
508 | }; |
509 | |
510 | if (MDT.dominates(A: MI1, B: MI2)) { |
511 | if (!interferes(MI2, MI1)) { |
512 | LLVM_DEBUG(dbgs() |
513 | << "Erasing from " |
514 | << printMBBReference(*MI2->getParent()) << " " << *MI2); |
515 | MergedInstrs.insert(Ptr: MI2); |
516 | Changed = true; |
517 | ++I2; |
518 | continue; |
519 | } |
520 | } else if (MDT.dominates(A: MI2, B: MI1)) { |
521 | if (!interferes(MI1, MI2)) { |
522 | LLVM_DEBUG(dbgs() |
523 | << "Erasing from " |
524 | << printMBBReference(*MI1->getParent()) << " " << *MI1); |
525 | MergedInstrs.insert(Ptr: MI1); |
526 | Changed = true; |
527 | ++I1; |
528 | break; |
529 | } |
530 | } else { |
531 | auto *MBB = MDT.findNearestCommonDominator(A: MI1->getParent(), |
532 | B: MI2->getParent()); |
533 | if (!MBB) { |
534 | ++I2; |
535 | continue; |
536 | } |
537 | |
538 | MachineBasicBlock::iterator I = getFirstNonPrologue(MBB, TII); |
539 | if (!interferes(MI1, I) && !interferes(MI2, I)) { |
540 | LLVM_DEBUG(dbgs() |
541 | << "Erasing from " |
542 | << printMBBReference(*MI1->getParent()) << " " << *MI1 |
543 | << "and moving from " |
544 | << printMBBReference(*MI2->getParent()) << " to " |
545 | << printMBBReference(*I->getParent()) << " " << *MI2); |
546 | I->getParent()->splice(Where: I, Other: MI2->getParent(), From: MI2); |
547 | MergedInstrs.insert(Ptr: MI1); |
548 | Changed = true; |
549 | ++I1; |
550 | break; |
551 | } |
552 | } |
553 | ++I2; |
554 | } |
555 | ++I1; |
556 | } |
557 | } |
558 | |
559 | // Remove initializations that were merged into another. |
560 | for (auto &Init : Inits) { |
561 | auto &Defs = Init.second; |
562 | auto I = Defs.begin(); |
563 | while (I != Defs.end()) { |
564 | if (MergedInstrs.count(Ptr: *I)) { |
565 | (*I)->eraseFromParent(); |
566 | I = Defs.erase(position: I); |
567 | } else |
568 | ++I; |
569 | } |
570 | } |
571 | |
572 | // Try to schedule SGPR initializations as early as possible in the MBB. |
573 | for (auto &Init : Inits) { |
574 | auto &Defs = Init.second; |
575 | for (auto *MI : Defs) { |
576 | auto MBB = MI->getParent(); |
577 | MachineInstr &BoundaryMI = *getFirstNonPrologue(MBB, TII); |
578 | MachineBasicBlock::reverse_iterator B(BoundaryMI); |
579 | // Check if B should actually be a boundary. If not set the previous |
580 | // instruction as the boundary instead. |
581 | if (!TII->isBasicBlockPrologue(MI: *B)) |
582 | B++; |
583 | |
584 | auto R = std::next(x: MI->getReverseIterator()); |
585 | const unsigned Threshold = 50; |
586 | // Search until B or Threshold for a place to insert the initialization. |
587 | for (unsigned I = 0; R != B && I < Threshold; ++R, ++I) |
588 | if (R->readsRegister(Reg, TRI) || R->definesRegister(Reg, TRI) || |
589 | TII->isSchedulingBoundary(MI: *R, MBB, MF: *MBB->getParent())) |
590 | break; |
591 | |
592 | // Move to directly after R. |
593 | if (&*--R != MI) |
594 | MBB->splice(Where: *R, Other: MBB, From: MI); |
595 | } |
596 | } |
597 | |
598 | if (Changed) |
599 | MRI.clearKillFlags(Reg); |
600 | |
601 | return Changed; |
602 | } |
603 | |
604 | bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { |
605 | // Only need to run this in SelectionDAG path. |
606 | if (MF.getProperties().hasProperty( |
607 | P: MachineFunctionProperties::Property::Selected)) |
608 | return false; |
609 | |
610 | const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); |
611 | MRI = &MF.getRegInfo(); |
612 | TRI = ST.getRegisterInfo(); |
613 | TII = ST.getInstrInfo(); |
614 | MDT = &getAnalysis<MachineDominatorTree>(); |
615 | |
616 | |
617 | for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); |
618 | BI != BE; ++BI) { |
619 | MachineBasicBlock *MBB = &*BI; |
620 | for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E; |
621 | ++I) { |
622 | MachineInstr &MI = *I; |
623 | |
624 | switch (MI.getOpcode()) { |
625 | default: |
626 | continue; |
627 | case AMDGPU::COPY: |
628 | case AMDGPU::WQM: |
629 | case AMDGPU::STRICT_WQM: |
630 | case AMDGPU::SOFT_WQM: |
631 | case AMDGPU::STRICT_WWM: { |
632 | const TargetRegisterClass *SrcRC, *DstRC; |
633 | std::tie(args&: SrcRC, args&: DstRC) = getCopyRegClasses(Copy: MI, TRI: *TRI, MRI: *MRI); |
634 | |
635 | if (isSGPRToVGPRCopy(SrcRC, DstRC, TRI: *TRI)) { |
636 | // Since VGPR to SGPR copies affect VGPR to SGPR copy |
637 | // score and, hence the lowering decision, let's try to get rid of |
638 | // them as early as possible |
639 | if (tryChangeVGPRtoSGPRinCopy(MI, TRI, TII)) |
640 | continue; |
641 | |
642 | // Collect those not changed to try them after VGPR to SGPR copies |
643 | // lowering as there will be more opportunities. |
644 | S2VCopies.push_back(Elt: &MI); |
645 | } |
646 | if (!isVGPRToSGPRCopy(SrcRC, DstRC, TRI: *TRI)) |
647 | continue; |
648 | if (lowerSpecialCase(MI, I)) |
649 | continue; |
650 | |
651 | analyzeVGPRToSGPRCopy(MI: &MI); |
652 | |
653 | break; |
654 | } |
655 | case AMDGPU::INSERT_SUBREG: |
656 | case AMDGPU::PHI: |
657 | case AMDGPU::REG_SEQUENCE: { |
658 | if (TRI->isSGPRClass(RC: TII->getOpRegClass(MI, OpNo: 0))) { |
659 | for (MachineOperand &MO : MI.operands()) { |
660 | if (!MO.isReg() || !MO.getReg().isVirtual()) |
661 | continue; |
662 | const TargetRegisterClass *SrcRC = MRI->getRegClass(Reg: MO.getReg()); |
663 | if (TRI->hasVectorRegisters(RC: SrcRC)) { |
664 | const TargetRegisterClass *DestRC = |
665 | TRI->getEquivalentSGPRClass(VRC: SrcRC); |
666 | Register NewDst = MRI->createVirtualRegister(RegClass: DestRC); |
667 | MachineBasicBlock *BlockToInsertCopy = |
668 | MI.isPHI() ? MI.getOperand(i: MO.getOperandNo() + 1).getMBB() |
669 | : MBB; |
670 | MachineBasicBlock::iterator PointToInsertCopy = |
671 | MI.isPHI() ? BlockToInsertCopy->getFirstInstrTerminator() : I; |
672 | |
673 | if (!tryMoveVGPRConstToSGPR(MO, NewDst, BlockToInsertTo: BlockToInsertCopy, |
674 | PointToInsertTo: PointToInsertCopy)) { |
675 | MachineInstr *NewCopy = |
676 | BuildMI(*BlockToInsertCopy, PointToInsertCopy, |
677 | PointToInsertCopy->getDebugLoc(), |
678 | TII->get(AMDGPU::COPY), NewDst) |
679 | .addReg(MO.getReg()); |
680 | MO.setReg(NewDst); |
681 | analyzeVGPRToSGPRCopy(MI: NewCopy); |
682 | } |
683 | } |
684 | } |
685 | } |
686 | |
687 | if (MI.isPHI()) |
688 | PHINodes.push_back(Elt: &MI); |
689 | else if (MI.isRegSequence()) |
690 | RegSequences.push_back(Elt: &MI); |
691 | |
692 | break; |
693 | } |
694 | case AMDGPU::V_WRITELANE_B32: { |
695 | // Some architectures allow more than one constant bus access without |
696 | // SGPR restriction |
697 | if (ST.getConstantBusLimit(Opcode: MI.getOpcode()) != 1) |
698 | break; |
699 | |
700 | // Writelane is special in that it can use SGPR and M0 (which would |
701 | // normally count as using the constant bus twice - but in this case it |
702 | // is allowed since the lane selector doesn't count as a use of the |
703 | // constant bus). However, it is still required to abide by the 1 SGPR |
704 | // rule. Apply a fix here as we might have multiple SGPRs after |
705 | // legalizing VGPRs to SGPRs |
706 | int Src0Idx = |
707 | AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0); |
708 | int Src1Idx = |
709 | AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src1); |
710 | MachineOperand &Src0 = MI.getOperand(i: Src0Idx); |
711 | MachineOperand &Src1 = MI.getOperand(i: Src1Idx); |
712 | |
713 | // Check to see if the instruction violates the 1 SGPR rule |
714 | if ((Src0.isReg() && TRI->isSGPRReg(MRI: *MRI, Reg: Src0.getReg()) && |
715 | Src0.getReg() != AMDGPU::M0) && |
716 | (Src1.isReg() && TRI->isSGPRReg(MRI: *MRI, Reg: Src1.getReg()) && |
717 | Src1.getReg() != AMDGPU::M0)) { |
718 | |
719 | // Check for trivially easy constant prop into one of the operands |
720 | // If this is the case then perform the operation now to resolve SGPR |
721 | // issue. If we don't do that here we will always insert a mov to m0 |
722 | // that can't be resolved in later operand folding pass |
723 | bool Resolved = false; |
724 | for (MachineOperand *MO : {&Src0, &Src1}) { |
725 | if (MO->getReg().isVirtual()) { |
726 | MachineInstr *DefMI = MRI->getVRegDef(Reg: MO->getReg()); |
727 | if (DefMI && TII->isFoldableCopy(MI: *DefMI)) { |
728 | const MachineOperand &Def = DefMI->getOperand(i: 0); |
729 | if (Def.isReg() && |
730 | MO->getReg() == Def.getReg() && |
731 | MO->getSubReg() == Def.getSubReg()) { |
732 | const MachineOperand &Copied = DefMI->getOperand(i: 1); |
733 | if (Copied.isImm() && |
734 | TII->isInlineConstant(Imm: APInt(64, Copied.getImm(), true))) { |
735 | MO->ChangeToImmediate(ImmVal: Copied.getImm()); |
736 | Resolved = true; |
737 | break; |
738 | } |
739 | } |
740 | } |
741 | } |
742 | } |
743 | |
744 | if (!Resolved) { |
745 | // Haven't managed to resolve by replacing an SGPR with an immediate |
746 | // Move src1 to be in M0 |
747 | BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), |
748 | TII->get(AMDGPU::COPY), AMDGPU::M0) |
749 | .add(Src1); |
750 | Src1.ChangeToRegister(AMDGPU::Reg: M0, isDef: false); |
751 | } |
752 | } |
753 | break; |
754 | } |
755 | } |
756 | } |
757 | } |
758 | |
759 | lowerVGPR2SGPRCopies(MF); |
760 | // Postprocessing |
761 | fixSCCCopies(MF); |
762 | for (auto MI : S2VCopies) { |
763 | // Check if it is still valid |
764 | if (MI->isCopy()) { |
765 | const TargetRegisterClass *SrcRC, *DstRC; |
766 | std::tie(args&: SrcRC, args&: DstRC) = getCopyRegClasses(Copy: *MI, TRI: *TRI, MRI: *MRI); |
767 | if (isSGPRToVGPRCopy(SrcRC, DstRC, TRI: *TRI)) |
768 | tryChangeVGPRtoSGPRinCopy(MI&: *MI, TRI, TII); |
769 | } |
770 | } |
771 | for (auto MI : RegSequences) { |
772 | // Check if it is still valid |
773 | if (MI->isRegSequence()) |
774 | foldVGPRCopyIntoRegSequence(MI&: *MI, TRI, TII, MRI&: *MRI); |
775 | } |
776 | for (auto MI : PHINodes) { |
777 | processPHINode(MI&: *MI); |
778 | } |
779 | if (MF.getTarget().getOptLevel() > CodeGenOptLevel::None && EnableM0Merge) |
780 | hoistAndMergeSGPRInits(AMDGPU::M0, *MRI, TRI, *MDT, TII); |
781 | |
782 | SiblingPenalty.clear(); |
783 | V2SCopies.clear(); |
784 | SCCCopies.clear(); |
785 | RegSequences.clear(); |
786 | PHINodes.clear(); |
787 | S2VCopies.clear(); |
788 | |
789 | return true; |
790 | } |
791 | |
792 | void SIFixSGPRCopies::processPHINode(MachineInstr &MI) { |
793 | bool AllAGPRUses = true; |
794 | SetVector<const MachineInstr *> worklist; |
795 | SmallSet<const MachineInstr *, 4> Visited; |
796 | SetVector<MachineInstr *> PHIOperands; |
797 | worklist.insert(X: &MI); |
798 | Visited.insert(Ptr: &MI); |
799 | // HACK to make MIR tests with no uses happy |
800 | bool HasUses = false; |
801 | while (!worklist.empty()) { |
802 | const MachineInstr *Instr = worklist.pop_back_val(); |
803 | Register Reg = Instr->getOperand(i: 0).getReg(); |
804 | for (const auto &Use : MRI->use_operands(Reg)) { |
805 | HasUses = true; |
806 | const MachineInstr *UseMI = Use.getParent(); |
807 | AllAGPRUses &= (UseMI->isCopy() && |
808 | TRI->isAGPR(MRI: *MRI, Reg: UseMI->getOperand(i: 0).getReg())) || |
809 | TRI->isAGPR(MRI: *MRI, Reg: Use.getReg()); |
810 | if (UseMI->isCopy() || UseMI->isRegSequence()) { |
811 | if (Visited.insert(Ptr: UseMI).second) |
812 | worklist.insert(X: UseMI); |
813 | |
814 | continue; |
815 | } |
816 | } |
817 | } |
818 | |
819 | Register PHIRes = MI.getOperand(i: 0).getReg(); |
820 | const TargetRegisterClass *RC0 = MRI->getRegClass(Reg: PHIRes); |
821 | if (HasUses && AllAGPRUses && !TRI->isAGPRClass(RC: RC0)) { |
822 | LLVM_DEBUG(dbgs() << "Moving PHI to AGPR: " << MI); |
823 | MRI->setRegClass(Reg: PHIRes, RC: TRI->getEquivalentAGPRClass(SRC: RC0)); |
824 | for (unsigned I = 1, N = MI.getNumOperands(); I != N; I += 2) { |
825 | MachineInstr *DefMI = MRI->getVRegDef(Reg: MI.getOperand(i: I).getReg()); |
826 | if (DefMI && DefMI->isPHI()) |
827 | PHIOperands.insert(X: DefMI); |
828 | } |
829 | } |
830 | |
831 | if (TRI->isVectorRegister(MRI: *MRI, Reg: PHIRes) || |
832 | RC0 == &AMDGPU::VReg_1RegClass) { |
833 | LLVM_DEBUG(dbgs() << "Legalizing PHI: " << MI); |
834 | TII->legalizeOperands(MI, MDT); |
835 | } |
836 | |
837 | // Propagate register class back to PHI operands which are PHI themselves. |
838 | while (!PHIOperands.empty()) { |
839 | processPHINode(MI&: *PHIOperands.pop_back_val()); |
840 | } |
841 | } |
842 | |
843 | bool SIFixSGPRCopies::tryMoveVGPRConstToSGPR( |
844 | MachineOperand &MaybeVGPRConstMO, Register DstReg, |
845 | MachineBasicBlock *BlockToInsertTo, |
846 | MachineBasicBlock::iterator PointToInsertTo) { |
847 | |
848 | MachineInstr *DefMI = MRI->getVRegDef(Reg: MaybeVGPRConstMO.getReg()); |
849 | if (!DefMI || !DefMI->isMoveImmediate()) |
850 | return false; |
851 | |
852 | MachineOperand *SrcConst = TII->getNamedOperand(*DefMI, AMDGPU::OpName::src0); |
853 | if (SrcConst->isReg()) |
854 | return false; |
855 | |
856 | const TargetRegisterClass *SrcRC = |
857 | MRI->getRegClass(Reg: MaybeVGPRConstMO.getReg()); |
858 | unsigned MoveSize = TRI->getRegSizeInBits(*SrcRC); |
859 | unsigned MoveOp = MoveSize == 64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32; |
860 | BuildMI(*BlockToInsertTo, PointToInsertTo, PointToInsertTo->getDebugLoc(), |
861 | TII->get(MoveOp), DstReg) |
862 | .add(*SrcConst); |
863 | if (MRI->hasOneUse(RegNo: MaybeVGPRConstMO.getReg())) |
864 | DefMI->eraseFromParent(); |
865 | MaybeVGPRConstMO.setReg(DstReg); |
866 | return true; |
867 | } |
868 | |
869 | bool SIFixSGPRCopies::lowerSpecialCase(MachineInstr &MI, |
870 | MachineBasicBlock::iterator &I) { |
871 | Register DstReg = MI.getOperand(i: 0).getReg(); |
872 | Register SrcReg = MI.getOperand(i: 1).getReg(); |
873 | if (!DstReg.isVirtual()) { |
874 | // If the destination register is a physical register there isn't |
875 | // really much we can do to fix this. |
876 | // Some special instructions use M0 as an input. Some even only use |
877 | // the first lane. Insert a readfirstlane and hope for the best. |
878 | if (DstReg == AMDGPU::M0 && |
879 | TRI->hasVectorRegisters(MRI->getRegClass(SrcReg))) { |
880 | Register TmpReg = |
881 | MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); |
882 | BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), |
883 | TII->get(AMDGPU::V_READFIRSTLANE_B32), TmpReg) |
884 | .add(MI.getOperand(1)); |
885 | MI.getOperand(i: 1).setReg(TmpReg); |
886 | } else if (tryMoveVGPRConstToSGPR(MaybeVGPRConstMO&: MI.getOperand(i: 1), DstReg, BlockToInsertTo: MI.getParent(), |
887 | PointToInsertTo: MI)) { |
888 | I = std::next(x: I); |
889 | MI.eraseFromParent(); |
890 | } |
891 | return true; |
892 | } |
893 | if (!SrcReg.isVirtual() || TRI->isAGPR(MRI: *MRI, Reg: SrcReg)) { |
894 | SIInstrWorklist worklist; |
895 | worklist.insert(MI: &MI); |
896 | TII->moveToVALU(Worklist&: worklist, MDT); |
897 | return true; |
898 | } |
899 | |
900 | unsigned SMovOp; |
901 | int64_t Imm; |
902 | // If we are just copying an immediate, we can replace the copy with |
903 | // s_mov_b32. |
904 | if (isSafeToFoldImmIntoCopy(Copy: &MI, MoveImm: MRI->getVRegDef(Reg: SrcReg), TII, SMovOp, Imm)) { |
905 | MI.getOperand(i: 1).ChangeToImmediate(ImmVal: Imm); |
906 | MI.addImplicitDefUseOperands(MF&: *MI.getParent()->getParent()); |
907 | MI.setDesc(TII->get(SMovOp)); |
908 | return true; |
909 | } |
910 | return false; |
911 | } |
912 | |
913 | void SIFixSGPRCopies::analyzeVGPRToSGPRCopy(MachineInstr* MI) { |
914 | Register DstReg = MI->getOperand(i: 0).getReg(); |
915 | const TargetRegisterClass *DstRC = MRI->getRegClass(Reg: DstReg); |
916 | |
917 | V2SCopyInfo Info(getNextVGPRToSGPRCopyId(), MI, |
918 | TRI->getRegSizeInBits(*DstRC)); |
919 | SmallVector<MachineInstr *, 8> AnalysisWorklist; |
920 | // Needed because the SSA is not a tree but a graph and may have |
921 | // forks and joins. We should not then go same way twice. |
922 | DenseSet<MachineInstr *> Visited; |
923 | AnalysisWorklist.push_back(Elt: Info.Copy); |
924 | while (!AnalysisWorklist.empty()) { |
925 | |
926 | MachineInstr *Inst = AnalysisWorklist.pop_back_val(); |
927 | |
928 | if (!Visited.insert(V: Inst).second) |
929 | continue; |
930 | |
931 | // Copies and REG_SEQUENCE do not contribute to the final assembly |
932 | // So, skip them but take care of the SGPR to VGPR copies bookkeeping. |
933 | if (Inst->isCopy() || Inst->isRegSequence()) { |
934 | if (TRI->isVGPR(MRI: *MRI, Reg: Inst->getOperand(i: 0).getReg())) { |
935 | if (!Inst->isCopy() || |
936 | !tryChangeVGPRtoSGPRinCopy(MI&: *Inst, TRI, TII)) { |
937 | Info.NumSVCopies++; |
938 | continue; |
939 | } |
940 | } |
941 | } |
942 | |
943 | SiblingPenalty[Inst].insert(X: Info.ID); |
944 | |
945 | SmallVector<MachineInstr *, 4> Users; |
946 | if ((TII->isSALU(*Inst) && Inst->isCompare()) || |
947 | (Inst->isCopy() && Inst->getOperand(0).getReg() == AMDGPU::SCC)) { |
948 | auto I = Inst->getIterator(); |
949 | auto E = Inst->getParent()->end(); |
950 | while (++I != E && |
951 | !I->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr)) { |
952 | if (I->readsRegister(AMDGPU::SCC, /*TRI=*/nullptr)) |
953 | Users.push_back(Elt: &*I); |
954 | } |
955 | } else if (Inst->getNumExplicitDefs() != 0) { |
956 | Register Reg = Inst->getOperand(i: 0).getReg(); |
957 | if (TRI->isSGPRReg(MRI: *MRI, Reg) && !TII->isVALU(MI: *Inst)) |
958 | for (auto &U : MRI->use_instructions(Reg)) |
959 | Users.push_back(Elt: &U); |
960 | } |
961 | for (auto U : Users) { |
962 | if (TII->isSALU(MI: *U)) |
963 | Info.SChain.insert(X: U); |
964 | AnalysisWorklist.push_back(Elt: U); |
965 | } |
966 | } |
967 | V2SCopies[Info.ID] = Info; |
968 | } |
969 | |
970 | // The main function that computes the VGPR to SGPR copy score |
971 | // and determines copy further lowering way: v_readfirstlane_b32 or moveToVALU |
972 | bool SIFixSGPRCopies::needToBeConvertedToVALU(V2SCopyInfo *Info) { |
973 | if (Info->SChain.empty()) { |
974 | Info->Score = 0; |
975 | return true; |
976 | } |
977 | Info->Siblings = SiblingPenalty[*llvm::max_element( |
978 | Range&: Info->SChain, C: [&](MachineInstr *A, MachineInstr *B) -> bool { |
979 | return SiblingPenalty[A].size() < SiblingPenalty[B].size(); |
980 | })]; |
981 | Info->Siblings.remove_if(P: [&](unsigned ID) { return ID == Info->ID; }); |
982 | // The loop below computes the number of another VGPR to SGPR V2SCopies |
983 | // which contribute to the current copy SALU chain. We assume that all the |
984 | // V2SCopies with the same source virtual register will be squashed to one |
985 | // by regalloc. Also we take care of the V2SCopies of the differnt subregs |
986 | // of the same register. |
987 | SmallSet<std::pair<Register, unsigned>, 4> SrcRegs; |
988 | for (auto J : Info->Siblings) { |
989 | auto InfoIt = V2SCopies.find(Key: J); |
990 | if (InfoIt != V2SCopies.end()) { |
991 | MachineInstr *SiblingCopy = InfoIt->second.Copy; |
992 | if (SiblingCopy->isImplicitDef()) |
993 | // the COPY has already been MoveToVALUed |
994 | continue; |
995 | |
996 | SrcRegs.insert(V: std::pair(SiblingCopy->getOperand(i: 1).getReg(), |
997 | SiblingCopy->getOperand(i: 1).getSubReg())); |
998 | } |
999 | } |
1000 | Info->SiblingPenalty = SrcRegs.size(); |
1001 | |
1002 | unsigned Penalty = |
1003 | Info->NumSVCopies + Info->SiblingPenalty + Info->NumReadfirstlanes; |
1004 | unsigned Profit = Info->SChain.size(); |
1005 | Info->Score = Penalty > Profit ? 0 : Profit - Penalty; |
1006 | Info->NeedToBeConvertedToVALU = Info->Score < 3; |
1007 | return Info->NeedToBeConvertedToVALU; |
1008 | } |
1009 | |
1010 | void SIFixSGPRCopies::lowerVGPR2SGPRCopies(MachineFunction &MF) { |
1011 | |
1012 | SmallVector<unsigned, 8> LoweringWorklist; |
1013 | for (auto &C : V2SCopies) { |
1014 | if (needToBeConvertedToVALU(Info: &C.second)) |
1015 | LoweringWorklist.push_back(Elt: C.second.ID); |
1016 | } |
1017 | |
1018 | // Store all the V2S copy instructions that need to be moved to VALU |
1019 | // in the Copies worklist. |
1020 | SIInstrWorklist Copies; |
1021 | |
1022 | while (!LoweringWorklist.empty()) { |
1023 | unsigned CurID = LoweringWorklist.pop_back_val(); |
1024 | auto CurInfoIt = V2SCopies.find(Key: CurID); |
1025 | if (CurInfoIt != V2SCopies.end()) { |
1026 | V2SCopyInfo C = CurInfoIt->second; |
1027 | LLVM_DEBUG(dbgs() << "Processing ...\n" ; C.dump()); |
1028 | for (auto S : C.Siblings) { |
1029 | auto SibInfoIt = V2SCopies.find(Key: S); |
1030 | if (SibInfoIt != V2SCopies.end()) { |
1031 | V2SCopyInfo &SI = SibInfoIt->second; |
1032 | LLVM_DEBUG(dbgs() << "Sibling:\n" ; SI.dump()); |
1033 | if (!SI.NeedToBeConvertedToVALU) { |
1034 | SI.SChain.set_subtract(C.SChain); |
1035 | if (needToBeConvertedToVALU(Info: &SI)) |
1036 | LoweringWorklist.push_back(Elt: SI.ID); |
1037 | } |
1038 | SI.Siblings.remove_if(P: [&](unsigned ID) { return ID == C.ID; }); |
1039 | } |
1040 | } |
1041 | LLVM_DEBUG(dbgs() << "V2S copy " << *C.Copy |
1042 | << " is being turned to VALU\n" ); |
1043 | // TODO: MapVector::erase is inefficient. Do bulk removal with remove_if |
1044 | // instead. |
1045 | V2SCopies.erase(Key: C.ID); |
1046 | Copies.insert(MI: C.Copy); |
1047 | } |
1048 | } |
1049 | |
1050 | TII->moveToVALU(Worklist&: Copies, MDT); |
1051 | Copies.clear(); |
1052 | |
1053 | // Now do actual lowering |
1054 | for (auto C : V2SCopies) { |
1055 | MachineInstr *MI = C.second.Copy; |
1056 | MachineBasicBlock *MBB = MI->getParent(); |
1057 | // We decide to turn V2S copy to v_readfirstlane_b32 |
1058 | // remove it from the V2SCopies and remove it from all its siblings |
1059 | LLVM_DEBUG(dbgs() << "V2S copy " << *MI |
1060 | << " is being turned to v_readfirstlane_b32" |
1061 | << " Score: " << C.second.Score << "\n" ); |
1062 | Register DstReg = MI->getOperand(i: 0).getReg(); |
1063 | Register SrcReg = MI->getOperand(i: 1).getReg(); |
1064 | unsigned SubReg = MI->getOperand(i: 1).getSubReg(); |
1065 | const TargetRegisterClass *SrcRC = |
1066 | TRI->getRegClassForOperandReg(MRI: *MRI, MO: MI->getOperand(i: 1)); |
1067 | size_t SrcSize = TRI->getRegSizeInBits(*SrcRC); |
1068 | if (SrcSize == 16) { |
1069 | // HACK to handle possible 16bit VGPR source |
1070 | auto MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), |
1071 | TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg); |
1072 | MIB.addReg(SrcReg, 0, AMDGPU::NoSubRegister); |
1073 | } else if (SrcSize == 32) { |
1074 | auto MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), |
1075 | TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg); |
1076 | MIB.addReg(SrcReg, 0, SubReg); |
1077 | } else { |
1078 | auto Result = BuildMI(*MBB, MI, MI->getDebugLoc(), |
1079 | TII->get(AMDGPU::REG_SEQUENCE), DstReg); |
1080 | int N = TRI->getRegSizeInBits(*SrcRC) / 32; |
1081 | for (int i = 0; i < N; i++) { |
1082 | Register PartialSrc = TII->buildExtractSubReg( |
1083 | Result, *MRI, MI->getOperand(1), SrcRC, |
1084 | TRI->getSubRegFromChannel(i), &AMDGPU::VGPR_32RegClass); |
1085 | Register PartialDst = |
1086 | MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); |
1087 | BuildMI(*MBB, *Result, Result->getDebugLoc(), |
1088 | TII->get(AMDGPU::V_READFIRSTLANE_B32), PartialDst) |
1089 | .addReg(PartialSrc); |
1090 | Result.addReg(PartialDst).addImm(TRI->getSubRegFromChannel(Channel: i)); |
1091 | } |
1092 | } |
1093 | MI->eraseFromParent(); |
1094 | } |
1095 | } |
1096 | |
1097 | void SIFixSGPRCopies::fixSCCCopies(MachineFunction &MF) { |
1098 | bool IsWave32 = MF.getSubtarget<GCNSubtarget>().isWave32(); |
1099 | for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE; |
1100 | ++BI) { |
1101 | MachineBasicBlock *MBB = &*BI; |
1102 | for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E; |
1103 | ++I) { |
1104 | MachineInstr &MI = *I; |
1105 | // May already have been lowered. |
1106 | if (!MI.isCopy()) |
1107 | continue; |
1108 | Register SrcReg = MI.getOperand(i: 1).getReg(); |
1109 | Register DstReg = MI.getOperand(i: 0).getReg(); |
1110 | if (SrcReg == AMDGPU::SCC) { |
1111 | Register SCCCopy = MRI->createVirtualRegister( |
1112 | TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID)); |
1113 | I = BuildMI(*MI.getParent(), std::next(MachineBasicBlock::iterator(MI)), |
1114 | MI.getDebugLoc(), |
1115 | TII->get(IsWave32 ? AMDGPU::S_CSELECT_B32 |
1116 | : AMDGPU::S_CSELECT_B64), |
1117 | SCCCopy) |
1118 | .addImm(-1) |
1119 | .addImm(0); |
1120 | I = BuildMI(*MI.getParent(), std::next(I), I->getDebugLoc(), |
1121 | TII->get(AMDGPU::COPY), DstReg) |
1122 | .addReg(SCCCopy); |
1123 | MI.eraseFromParent(); |
1124 | continue; |
1125 | } |
1126 | if (DstReg == AMDGPU::SCC) { |
1127 | unsigned Opcode = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64; |
1128 | Register Exec = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC; |
1129 | Register Tmp = MRI->createVirtualRegister(RegClass: TRI->getBoolRC()); |
1130 | I = BuildMI(*MI.getParent(), std::next(x: MachineBasicBlock::iterator(MI)), |
1131 | MI.getDebugLoc(), TII->get(Opcode)) |
1132 | .addReg(Tmp, getDefRegState(B: true)) |
1133 | .addReg(SrcReg) |
1134 | .addReg(Exec); |
1135 | MI.eraseFromParent(); |
1136 | } |
1137 | } |
1138 | } |
1139 | } |
1140 | |