//===- AMDGPUResourceUsageAnalysis.cpp ---- analysis of resources --------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Analyzes how many registers and other resources are used by
/// functions.
///
/// The results of this analysis are used to fill the register usage, flat
/// usage, etc. into hardware registers.
///
/// The analysis takes callees into account. E.g. if a function A that needs 10
/// VGPRs calls a function B that needs 20 VGPRs, querying the VGPR usage of A
/// will return 20.
/// It is assumed that an indirect call can go into any function except
/// hardware-entrypoints. Therefore the register usage of functions with
/// indirect calls is estimated as the maximum of all non-entrypoint functions
/// in the module.
///
//===----------------------------------------------------------------------===//

#include "AMDGPUResourceUsageAnalysis.h"
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/Target/TargetMachine.h"

using namespace llvm;
using namespace llvm::AMDGPU;

#define DEBUG_TYPE "amdgpu-resource-usage"

char llvm::AMDGPUResourceUsageAnalysis::ID = 0;
char &llvm::AMDGPUResourceUsageAnalysisID = AMDGPUResourceUsageAnalysis::ID;

// In code object v4 and older, we need to tell the runtime some amount ahead
// of time if we don't know the true stack size. Assume a smaller number if
// this is only due to dynamic / non-entry block allocas.
static cl::opt<uint32_t> clAssumedStackSizeForExternalCall(
    "amdgpu-assume-external-call-stack-size",
    cl::desc("Assumed stack use of any external call (in bytes)"), cl::Hidden,
    cl::init(16384));

static cl::opt<uint32_t> clAssumedStackSizeForDynamicSizeObjects(
    "amdgpu-assume-dynamic-stack-object-size",
    cl::desc("Assumed extra stack use if there are any "
             "variable sized objects (in bytes)"),
    cl::Hidden, cl::init(4096));

INITIALIZE_PASS(AMDGPUResourceUsageAnalysis, DEBUG_TYPE,
                "Function register usage analysis", true, true)

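// Return the callee of a call pseudo. An immediate callee operand (always 0,
// per the assert) means the target is unknown, i.e. an indirect call; a
// GlobalAlias is looked through to the aliased function.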
static const Function *getCalleeFunction(const MachineOperand &Op) {
  if (Op.isImm()) {
    assert(Op.getImm() == 0);
    return nullptr;
  }
  if (auto *GA = dyn_cast<GlobalAlias>(Op.getGlobal()))
    return cast<Function>(GA->getOperand(0));
  return cast<Function>(Op.getGlobal());
}

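// Return true if Reg is used in any way other than as an implicit operand of
// a FLAT instruction; only such uses make the flat scratch setup necessary.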
static bool hasAnyNonFlatUseOfReg(const MachineRegisterInfo &MRI,
                                  const SIInstrInfo &TII, unsigned Reg) {
  for (const MachineOperand &UseOp : MRI.reg_operands(Reg)) {
    if (!UseOp.isImplicit() || !TII.isFLAT(*UseOp.getParent()))
      return true;
  }

  return false;
}

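// The total SGPR count is the explicitly used SGPRs plus the extra SGPRs
// reserved for VCC, flat scratch, and XNACK (when XNACK is on or unknown).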
int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumSGPRs(
    const GCNSubtarget &ST) const {
  return NumExplicitSGPR +
         IsaInfo::getNumExtraSGPRs(&ST, UsesVCC, UsesFlatScratch,
                                   ST.getTargetID().isXnackOnOrAny());
}

int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumVGPRs(
    const GCNSubtarget &ST, int32_t ArgNumAGPR, int32_t ArgNumVGPR) const {
  return AMDGPU::getTotalNumVGPRs(ST.hasGFX90AInsts(), ArgNumAGPR, ArgNumVGPR);
}

int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumVGPRs(
    const GCNSubtarget &ST) const {
  return getTotalNumVGPRs(ST, NumAGPR, NumVGPR);
}

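// Visit the call graph in post order so callees are analyzed before their
// callers; each caller can then fold the cumulative usage of its callees into
// its own resource counts.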
bool AMDGPUResourceUsageAnalysis::runOnModule(Module &M) {
  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    return false;

  MachineModuleInfo &MMI = getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
  const TargetMachine &TM = TPC->getTM<TargetMachine>();
  const MCSubtargetInfo &STI = *TM.getMCSubtargetInfo();
  bool HasIndirectCall = false;

  CallGraph CG = CallGraph(M);
  auto End = po_end(&CG);

  // By default, for code object v5 and later, track only the minimum scratch
  // size.
  uint32_t AssumedStackSizeForDynamicSizeObjects =
      clAssumedStackSizeForDynamicSizeObjects;
  uint32_t AssumedStackSizeForExternalCall = clAssumedStackSizeForExternalCall;
  if (AMDGPU::getAMDHSACodeObjectVersion(M) >= AMDGPU::AMDHSA_COV5 ||
      STI.getTargetTriple().getOS() == Triple::AMDPAL) {
    if (clAssumedStackSizeForDynamicSizeObjects.getNumOccurrences() == 0)
      AssumedStackSizeForDynamicSizeObjects = 0;
    if (clAssumedStackSizeForExternalCall.getNumOccurrences() == 0)
      AssumedStackSizeForExternalCall = 0;
  }

  for (auto IT = po_begin(&CG); IT != End; ++IT) {
    Function *F = IT->getFunction();
    if (!F || F->isDeclaration())
      continue;

    MachineFunction *MF = MMI.getMachineFunction(*F);
    assert(MF && "function must have been generated already");

    auto CI =
        CallGraphResourceInfo.insert(std::pair(F, SIFunctionResourceInfo()));
    SIFunctionResourceInfo &Info = CI.first->second;
    assert(CI.second && "should only be called once per function");
    Info = analyzeResourceUsage(*MF, TM, AssumedStackSizeForDynamicSizeObjects,
                                AssumedStackSizeForExternalCall);
    HasIndirectCall |= Info.HasIndirectCall;
  }

  // It's possible we have unreachable functions in the module which weren't
  // visited by the PO traversal. Make sure we have some resource counts to
  // report.
  for (const auto &IT : CG) {
    const Function *F = IT.first;
    if (!F || F->isDeclaration())
      continue;

    auto CI =
        CallGraphResourceInfo.insert(std::pair(F, SIFunctionResourceInfo()));
    if (!CI.second) // Skip already visited functions
      continue;

    SIFunctionResourceInfo &Info = CI.first->second;
    MachineFunction *MF = MMI.getMachineFunction(*F);
    assert(MF && "function must have been generated already");
    Info = analyzeResourceUsage(*MF, TM, AssumedStackSizeForDynamicSizeObjects,
                                AssumedStackSizeForExternalCall);
    HasIndirectCall |= Info.HasIndirectCall;
  }

  if (HasIndirectCall)
    propagateIndirectCallRegisterUsage();

  return false;
}

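// Compute the register, stack, and flat scratch usage of a single machine
// function. If the function makes no calls, the register counts come straight
// from MachineRegisterInfo; otherwise every operand of every instruction is
// scanned and the usage of known callees is merged in.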
AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo
AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
    const MachineFunction &MF, const TargetMachine &TM,
    uint32_t AssumedStackSizeForDynamicSizeObjects,
    uint32_t AssumedStackSizeForExternalCall) const {
  SIFunctionResourceInfo Info;

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();

  Info.UsesFlatScratch = MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_LO) ||
                         MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_HI) ||
                         MRI.isLiveIn(MFI->getPreloadedReg(
                             AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT));

  // Even if FLAT_SCRATCH is implicitly used, it has no effect if flat
  // instructions aren't used to access the scratch buffer. Inline assembly may
  // need it though.
  //
  // If we only have implicit uses of flat_scr on flat instructions, it is not
  // really needed.
  if (Info.UsesFlatScratch && !MFI->getUserSGPRInfo().hasFlatScratchInit() &&
      (!hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR) &&
       !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_LO) &&
       !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_HI))) {
    Info.UsesFlatScratch = false;
  }

  Info.PrivateSegmentSize = FrameInfo.getStackSize();

  // Assume a big number if there are any unknown sized objects.
  Info.HasDynamicallySizedStack = FrameInfo.hasVarSizedObjects();
  if (Info.HasDynamicallySizedStack)
    Info.PrivateSegmentSize += AssumedStackSizeForDynamicSizeObjects;

  if (MFI->isStackRealigned())
    Info.PrivateSegmentSize += FrameInfo.getMaxAlign().value();

  Info.UsesVCC =
      MRI.isPhysRegUsed(AMDGPU::VCC_LO) || MRI.isPhysRegUsed(AMDGPU::VCC_HI);

  // If there are no calls, MachineRegisterInfo can tell us the used register
  // count easily.
  // A tail call isn't considered a call for MachineFrameInfo's purposes.
  if (!FrameInfo.hasCalls() && !FrameInfo.hasTailCall()) {
    MCPhysReg HighestVGPRReg = AMDGPU::NoRegister;
    for (MCPhysReg Reg : reverse(AMDGPU::VGPR_32RegClass.getRegisters())) {
      if (MRI.isPhysRegUsed(Reg)) {
        HighestVGPRReg = Reg;
        break;
      }
    }

    if (ST.hasMAIInsts()) {
      MCPhysReg HighestAGPRReg = AMDGPU::NoRegister;
      for (MCPhysReg Reg : reverse(AMDGPU::AGPR_32RegClass.getRegisters())) {
        if (MRI.isPhysRegUsed(Reg)) {
          HighestAGPRReg = Reg;
          break;
        }
      }
      Info.NumAGPR = HighestAGPRReg == AMDGPU::NoRegister
                         ? 0
                         : TRI.getHWRegIndex(HighestAGPRReg) + 1;
    }

    MCPhysReg HighestSGPRReg = AMDGPU::NoRegister;
    for (MCPhysReg Reg : reverse(AMDGPU::SGPR_32RegClass.getRegisters())) {
      if (MRI.isPhysRegUsed(Reg)) {
        HighestSGPRReg = Reg;
        break;
      }
    }

    // We found the maximum register index. They start at 0, so add one to get
    // the number of registers.
    Info.NumVGPR = HighestVGPRReg == AMDGPU::NoRegister
                       ? 0
                       : TRI.getHWRegIndex(HighestVGPRReg) + 1;
    Info.NumExplicitSGPR = HighestSGPRReg == AMDGPU::NoRegister
                               ? 0
                               : TRI.getHWRegIndex(HighestSGPRReg) + 1;

    return Info;
  }

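  // Start at -1 so a function that uses no registers of a given kind reports
  // a count of 0 after the +1 conversion at the end.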
  int32_t MaxVGPR = -1;
  int32_t MaxAGPR = -1;
  int32_t MaxSGPR = -1;
  uint64_t CalleeFrameSize = 0;

  for (const MachineBasicBlock &MBB : MF) {
    for (const MachineInstr &MI : MBB) {
      // TODO: Check regmasks? Do they occur anywhere except calls?
      for (const MachineOperand &MO : MI.operands()) {
        unsigned Width = 0;
        bool IsSGPR = false;
        bool IsAGPR = false;

        if (!MO.isReg())
          continue;

        Register Reg = MO.getReg();
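        // Filter out special registers first: these either don't occupy
        // addressable SGPR/VGPR slots (EXEC, M0, SRC_* and friends) or are
        // accounted for separately (VCC, FLAT_SCR).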
        switch (Reg) {
        case AMDGPU::EXEC:
        case AMDGPU::EXEC_LO:
        case AMDGPU::EXEC_HI:
        case AMDGPU::SCC:
        case AMDGPU::M0:
        case AMDGPU::M0_LO16:
        case AMDGPU::M0_HI16:
        case AMDGPU::SRC_SHARED_BASE_LO:
        case AMDGPU::SRC_SHARED_BASE:
        case AMDGPU::SRC_SHARED_LIMIT_LO:
        case AMDGPU::SRC_SHARED_LIMIT:
        case AMDGPU::SRC_PRIVATE_BASE_LO:
        case AMDGPU::SRC_PRIVATE_BASE:
        case AMDGPU::SRC_PRIVATE_LIMIT_LO:
        case AMDGPU::SRC_PRIVATE_LIMIT:
        case AMDGPU::SGPR_NULL:
        case AMDGPU::SGPR_NULL64:
        case AMDGPU::MODE:
          continue;

        case AMDGPU::SRC_POPS_EXITING_WAVE_ID:
          llvm_unreachable("src_pops_exiting_wave_id should not be used");

        case AMDGPU::NoRegister:
          assert(MI.isDebugInstr() &&
                 "Instruction uses invalid noreg register");
          continue;

        case AMDGPU::VCC:
        case AMDGPU::VCC_LO:
        case AMDGPU::VCC_HI:
        case AMDGPU::VCC_LO_LO16:
        case AMDGPU::VCC_LO_HI16:
        case AMDGPU::VCC_HI_LO16:
        case AMDGPU::VCC_HI_HI16:
          Info.UsesVCC = true;
          continue;

        case AMDGPU::FLAT_SCR:
        case AMDGPU::FLAT_SCR_LO:
        case AMDGPU::FLAT_SCR_HI:
          continue;

        case AMDGPU::XNACK_MASK:
        case AMDGPU::XNACK_MASK_LO:
        case AMDGPU::XNACK_MASK_HI:
          llvm_unreachable("xnack_mask registers should not be used");

        case AMDGPU::LDS_DIRECT:
          llvm_unreachable("lds_direct register should not be used");

        case AMDGPU::TBA:
        case AMDGPU::TBA_LO:
        case AMDGPU::TBA_HI:
        case AMDGPU::TMA:
        case AMDGPU::TMA_LO:
        case AMDGPU::TMA_HI:
          llvm_unreachable("trap handler registers should not be used");

        case AMDGPU::SRC_VCCZ:
          llvm_unreachable("src_vccz register should not be used");

        case AMDGPU::SRC_EXECZ:
          llvm_unreachable("src_execz register should not be used");

        case AMDGPU::SRC_SCC:
          llvm_unreachable("src_scc register should not be used");

        default:
          break;
        }

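        // Classify the register: is it an SGPR, VGPR, or AGPR, and how many
        // consecutive 32-bit registers does it span?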
        if (AMDGPU::SGPR_32RegClass.contains(Reg) ||
            AMDGPU::SGPR_LO16RegClass.contains(Reg) ||
            AMDGPU::SGPR_HI16RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 1;
        } else if (AMDGPU::VGPR_32RegClass.contains(Reg) ||
                   AMDGPU::VGPR_16RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 1;
        } else if (AMDGPU::AGPR_32RegClass.contains(Reg) ||
                   AMDGPU::AGPR_LO16RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 1;
        } else if (AMDGPU::SGPR_64RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 2;
        } else if (AMDGPU::VReg_64RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 2;
        } else if (AMDGPU::AReg_64RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 2;
        } else if (AMDGPU::VReg_96RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 3;
        } else if (AMDGPU::SReg_96RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 3;
        } else if (AMDGPU::AReg_96RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 3;
        } else if (AMDGPU::SGPR_128RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 4;
        } else if (AMDGPU::VReg_128RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 4;
        } else if (AMDGPU::AReg_128RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 4;
        } else if (AMDGPU::VReg_160RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 5;
        } else if (AMDGPU::SReg_160RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 5;
        } else if (AMDGPU::AReg_160RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 5;
        } else if (AMDGPU::VReg_192RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 6;
        } else if (AMDGPU::SReg_192RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 6;
        } else if (AMDGPU::AReg_192RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 6;
        } else if (AMDGPU::VReg_224RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 7;
        } else if (AMDGPU::SReg_224RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 7;
        } else if (AMDGPU::AReg_224RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 7;
        } else if (AMDGPU::SReg_256RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 8;
        } else if (AMDGPU::VReg_256RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 8;
        } else if (AMDGPU::AReg_256RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 8;
        } else if (AMDGPU::VReg_288RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 9;
        } else if (AMDGPU::SReg_288RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 9;
        } else if (AMDGPU::AReg_288RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 9;
        } else if (AMDGPU::VReg_320RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 10;
        } else if (AMDGPU::SReg_320RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 10;
        } else if (AMDGPU::AReg_320RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 10;
        } else if (AMDGPU::VReg_352RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 11;
        } else if (AMDGPU::SReg_352RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 11;
        } else if (AMDGPU::AReg_352RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 11;
        } else if (AMDGPU::VReg_384RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 12;
        } else if (AMDGPU::SReg_384RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 12;
        } else if (AMDGPU::AReg_384RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 12;
        } else if (AMDGPU::SReg_512RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 16;
        } else if (AMDGPU::VReg_512RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 16;
        } else if (AMDGPU::AReg_512RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 16;
        } else if (AMDGPU::SReg_1024RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 32;
        } else if (AMDGPU::VReg_1024RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 32;
        } else if (AMDGPU::AReg_1024RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 32;
        } else {
          // We only expect TTMP registers or registers that do not belong to
          // any RC.
          assert((AMDGPU::TTMP_32RegClass.contains(Reg) ||
                  AMDGPU::TTMP_64RegClass.contains(Reg) ||
                  AMDGPU::TTMP_128RegClass.contains(Reg) ||
                  AMDGPU::TTMP_256RegClass.contains(Reg) ||
                  AMDGPU::TTMP_512RegClass.contains(Reg) ||
                  !TRI.getPhysRegBaseClass(Reg)) &&
                 "Unknown register class");
        }
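        // Wide registers occupy consecutive hardware slots, so the highest
        // index touched is the base hardware index plus the width minus one.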
        unsigned HWReg = TRI.getHWRegIndex(Reg);
        int MaxUsed = HWReg + Width - 1;
        if (IsSGPR) {
          MaxSGPR = MaxUsed > MaxSGPR ? MaxUsed : MaxSGPR;
        } else if (IsAGPR) {
          MaxAGPR = MaxUsed > MaxAGPR ? MaxUsed : MaxAGPR;
        } else {
          MaxVGPR = MaxUsed > MaxVGPR ? MaxUsed : MaxVGPR;
        }
      }

      if (MI.isCall()) {
        // Pseudo used just to encode the underlying global. Is there a better
        // way to track this?

        const MachineOperand *CalleeOp =
            TII->getNamedOperand(MI, AMDGPU::OpName::callee);

        const Function *Callee = getCalleeFunction(*CalleeOp);
        DenseMap<const Function *, SIFunctionResourceInfo>::const_iterator I =
            CallGraphResourceInfo.end();

        // Avoid crashing on undefined behavior with an illegal call to a
        // kernel. If a callsite's calling convention doesn't match the
        // function's, it's undefined behavior. If the callsite calling
        // convention does match, that would have errored earlier.
        if (Callee && AMDGPU::isEntryFunctionCC(Callee->getCallingConv()))
          report_fatal_error("invalid call to entry function");

        bool IsIndirect = !Callee || Callee->isDeclaration();
        if (!IsIndirect)
          I = CallGraphResourceInfo.find(Callee);

        // FIXME: Call site could have norecurse on it
        if (!Callee || !Callee->doesNotRecurse()) {
          Info.HasRecursion = true;

          // TODO: If we happen to know there is no stack usage in the
          // callgraph, we don't need to assume an infinitely growing stack.
          if (!MI.isReturn()) {
            // We don't need to assume an unknown stack size for tail calls.

            // FIXME: This only benefits in the case where the kernel does not
            // directly call the tail called function. If a kernel directly
            // calls a tail recursive function, we'll assume maximum stack size
            // based on the regular call instruction.
            CalleeFrameSize = std::max(
                CalleeFrameSize,
                static_cast<uint64_t>(AssumedStackSizeForExternalCall));
          }
        }

        if (IsIndirect || I == CallGraphResourceInfo.end()) {
          CalleeFrameSize =
              std::max(CalleeFrameSize,
                       static_cast<uint64_t>(AssumedStackSizeForExternalCall));

          // Register usage of indirect calls gets handled later
          Info.UsesVCC = true;
          Info.UsesFlatScratch = ST.hasFlatAddressSpace();
          Info.HasDynamicallySizedStack = true;
          Info.HasIndirectCall = true;
        } else {
          // We force CodeGen to run in SCC order, so the callee's register
          // usage etc. should be the cumulative usage of all callees.
          MaxSGPR = std::max(I->second.NumExplicitSGPR - 1, MaxSGPR);
          MaxVGPR = std::max(I->second.NumVGPR - 1, MaxVGPR);
          MaxAGPR = std::max(I->second.NumAGPR - 1, MaxAGPR);
          CalleeFrameSize =
              std::max(I->second.PrivateSegmentSize, CalleeFrameSize);
          Info.UsesVCC |= I->second.UsesVCC;
          Info.UsesFlatScratch |= I->second.UsesFlatScratch;
          Info.HasDynamicallySizedStack |= I->second.HasDynamicallySizedStack;
          Info.HasRecursion |= I->second.HasRecursion;
          Info.HasIndirectCall |= I->second.HasIndirectCall;
        }
      }
    }
  }

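  // Indices are zero-based, so the counts are the highest used index plus
  // one. The largest callee frame sits on top of our own stack usage.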
  Info.NumExplicitSGPR = MaxSGPR + 1;
  Info.NumVGPR = MaxVGPR + 1;
  Info.NumAGPR = MaxAGPR + 1;
  Info.PrivateSegmentSize += CalleeFrameSize;

  return Info;
}

void AMDGPUResourceUsageAnalysis::propagateIndirectCallRegisterUsage() {
  // Collect the maximum number of registers from non-hardware-entrypoints.
  // All these functions are potential targets for indirect calls.
  int32_t NonKernelMaxSGPRs = 0;
  int32_t NonKernelMaxVGPRs = 0;
  int32_t NonKernelMaxAGPRs = 0;

  for (const auto &I : CallGraphResourceInfo) {
    if (!AMDGPU::isEntryFunctionCC(I.getFirst()->getCallingConv())) {
      auto &Info = I.getSecond();
      NonKernelMaxSGPRs = std::max(NonKernelMaxSGPRs, Info.NumExplicitSGPR);
      NonKernelMaxVGPRs = std::max(NonKernelMaxVGPRs, Info.NumVGPR);
      NonKernelMaxAGPRs = std::max(NonKernelMaxAGPRs, Info.NumAGPR);
    }
  }

  // Add register usage for functions with indirect calls.
  // For calls to unknown functions, we assume the maximum register usage of
  // all non-hardware-entrypoints in the current module.
  for (auto &I : CallGraphResourceInfo) {
    auto &Info = I.getSecond();
    if (Info.HasIndirectCall) {
      Info.NumExplicitSGPR = std::max(Info.NumExplicitSGPR, NonKernelMaxSGPRs);
      Info.NumVGPR = std::max(Info.NumVGPR, NonKernelMaxVGPRs);
      Info.NumAGPR = std::max(Info.NumAGPR, NonKernelMaxAGPRs);
    }
  }
}