//===- SIMachineFunctionInfo.cpp - SI Machine Function Info ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "SIMachineFunctionInfo.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIRegisterInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MIRParser/MIParser.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Function.h"
#include <cassert>
#include <optional>
#include <vector>

#define MAX_LANES 64

using namespace llvm;

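// Recover the GCNTargetMachine from the subtarget by way of its target
// lowering object.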
const GCNTargetMachine &getTM(const GCNSubtarget *STI) {
  const SITargetLowering *TLI = STI->getTargetLowering();
  return static_cast<const GCNTargetMachine &>(TLI->getTargetMachine());
}

SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
                                             const GCNSubtarget *STI)
    : AMDGPUMachineFunction(F, *STI), Mode(F, *STI), GWSResourcePSV(getTM(STI)),
      UserSGPRInfo(F, *STI), WorkGroupIDX(false), WorkGroupIDY(false),
      WorkGroupIDZ(false), WorkGroupInfo(false), LDSKernelId(false),
      PrivateSegmentWaveByteOffset(false), WorkItemIDX(false),
      WorkItemIDY(false), WorkItemIDZ(false), ImplicitArgPtr(false),
      GITPtrHigh(0xffffffff), HighBitsOf32BitAddress(0) {
  const GCNSubtarget &ST = *STI;
  FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(F);
  WavesPerEU = ST.getWavesPerEU(F);
  MaxNumWorkGroups = ST.getMaxNumWorkGroups(F);
  assert(MaxNumWorkGroups.size() == 3);

  Occupancy = ST.computeOccupancy(F, getLDSSize());
  CallingConv::ID CC = F.getCallingConv();

  VRegFlags.reserve(1024);

  const bool IsKernel = CC == CallingConv::AMDGPU_KERNEL ||
                        CC == CallingConv::SPIR_KERNEL;

  if (IsKernel) {
    WorkGroupIDX = true;
    WorkItemIDX = true;
  } else if (CC == CallingConv::AMDGPU_PS) {
    PSInputAddr = AMDGPU::getInitialPSInputAddr(F);
  }

  MayNeedAGPRs = ST.hasMAIInsts();

  if (AMDGPU::isChainCC(CC)) {
    // Chain functions don't receive an SP from their caller, but are free to
    // set one up. For now, we can use s32 to match what amdgpu_gfx functions
    // would use if called, but this can be revisited.
    // FIXME: Only reserve this if we actually need it.
    StackPtrOffsetReg = AMDGPU::SGPR32;

    ScratchRSrcReg = AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51;

    ArgInfo.PrivateSegmentBuffer =
        ArgDescriptor::createRegister(ScratchRSrcReg);

    ImplicitArgPtr = false;
  } else if (!isEntryFunction()) {
    if (CC != CallingConv::AMDGPU_Gfx)
      ArgInfo = AMDGPUArgumentUsageInfo::FixedABIFunctionInfo;

    // TODO: Pick a high register, and shift down, similar to a kernel.
    FrameOffsetReg = AMDGPU::SGPR33;
    StackPtrOffsetReg = AMDGPU::SGPR32;

    if (!ST.enableFlatScratch()) {
      // Non-entry functions have no special inputs for now; the only
      // registers set up here are those required for scratch access.
      ScratchRSrcReg = AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3;

      ArgInfo.PrivateSegmentBuffer =
          ArgDescriptor::createRegister(ScratchRSrcReg);
    }

    if (!F.hasFnAttribute("amdgpu-no-implicitarg-ptr"))
      ImplicitArgPtr = true;
  } else {
    ImplicitArgPtr = false;
    MaxKernArgAlign =
        std::max(ST.getAlignmentForImplicitArgPtr(), MaxKernArgAlign);

    if (ST.hasGFX90AInsts() &&
        ST.getMaxNumVGPRs(F) <= AMDGPU::VGPR_32RegClass.getNumRegs() &&
        !mayUseAGPRs(F))
      MayNeedAGPRs = false; // We will select all MAI with VGPR operands.
  }

  if (!AMDGPU::isGraphics(CC) ||
      (CC == CallingConv::AMDGPU_CS && ST.hasArchitectedSGPRs())) {
    if (IsKernel || !F.hasFnAttribute("amdgpu-no-workgroup-id-x"))
      WorkGroupIDX = true;

    if (!F.hasFnAttribute("amdgpu-no-workgroup-id-y"))
      WorkGroupIDY = true;

    if (!F.hasFnAttribute("amdgpu-no-workgroup-id-z"))
      WorkGroupIDZ = true;
  }

  if (!AMDGPU::isGraphics(CC)) {
    if (IsKernel || !F.hasFnAttribute("amdgpu-no-workitem-id-x"))
      WorkItemIDX = true;

    if (!F.hasFnAttribute("amdgpu-no-workitem-id-y") &&
        ST.getMaxWorkitemID(F, 1) != 0)
      WorkItemIDY = true;

    if (!F.hasFnAttribute("amdgpu-no-workitem-id-z") &&
        ST.getMaxWorkitemID(F, 2) != 0)
      WorkItemIDZ = true;

    if (!IsKernel && !F.hasFnAttribute("amdgpu-no-lds-kernel-id"))
      LDSKernelId = true;
  }

  if (isEntryFunction()) {
    // X, XY, and XYZ are the only supported combinations, so make sure Y is
    // enabled if Z is.
    if (WorkItemIDZ)
      WorkItemIDY = true;

    if (!ST.flatScratchIsArchitected()) {
      PrivateSegmentWaveByteOffset = true;

      // HS and GS always have the scratch wave offset in SGPR5 on GFX9.
      if (ST.getGeneration() >= AMDGPUSubtarget::GFX9 &&
          (CC == CallingConv::AMDGPU_HS || CC == CallingConv::AMDGPU_GS))
        ArgInfo.PrivateSegmentWaveByteOffset =
            ArgDescriptor::createRegister(AMDGPU::SGPR5);
    }
  }

  Attribute A = F.getFnAttribute("amdgpu-git-ptr-high");
  StringRef S = A.getValueAsString();
  if (!S.empty())
    S.consumeInteger(0, GITPtrHigh);

  A = F.getFnAttribute("amdgpu-32bit-address-high-bits");
  S = A.getValueAsString();
  if (!S.empty())
    S.consumeInteger(0, HighBitsOf32BitAddress);

  // On GFX908, in order to guarantee copying between AGPRs, we need a scratch
  // VGPR available at all times. For now, reserve the highest available VGPR.
  // After RA, shift it to the lowest available unused VGPR if one exists.
  if (ST.hasMAIInsts() && !ST.hasGFX90AInsts()) {
    VGPRForAGPRCopy =
        AMDGPU::VGPR_32RegClass.getRegister(ST.getMaxNumVGPRs(F) - 1);
  }
}

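// Clone this machine function info into DestMF's allocator.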
MachineFunctionInfo *SIMachineFunctionInfo::clone(
    BumpPtrAllocator &Allocator, MachineFunction &DestMF,
    const DenseMap<MachineBasicBlock *, MachineBasicBlock *> &Src2DstMBB)
    const {
  return DestMF.cloneInfo<SIMachineFunctionInfo>(*this);
}

void SIMachineFunctionInfo::limitOccupancy(const MachineFunction &MF) {
  limitOccupancy(getMaxWavesPerEU());
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  limitOccupancy(ST.getOccupancyWithLocalMemSize(getLDSSize(),
                                                 MF.getFunction()));
}

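// Each of the add* helpers below claims the next user SGPR(s) for one ABI
// input, records the assignment in ArgInfo, and advances NumUserSGPRs by the
// argument's width in dwords.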
Register SIMachineFunctionInfo::addPrivateSegmentBuffer(
    const SIRegisterInfo &TRI) {
  ArgInfo.PrivateSegmentBuffer =
      ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
          getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SGPR_128RegClass));
  NumUserSGPRs += 4;
  return ArgInfo.PrivateSegmentBuffer.getRegister();
}

Register SIMachineFunctionInfo::addDispatchPtr(const SIRegisterInfo &TRI) {
  ArgInfo.DispatchPtr = ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
      getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass));
  NumUserSGPRs += 2;
  return ArgInfo.DispatchPtr.getRegister();
}

Register SIMachineFunctionInfo::addQueuePtr(const SIRegisterInfo &TRI) {
  ArgInfo.QueuePtr = ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
      getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass));
  NumUserSGPRs += 2;
  return ArgInfo.QueuePtr.getRegister();
}

Register
SIMachineFunctionInfo::addKernargSegmentPtr(const SIRegisterInfo &TRI) {
  ArgInfo.KernargSegmentPtr =
      ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
          getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass));
  NumUserSGPRs += 2;
  return ArgInfo.KernargSegmentPtr.getRegister();
}

Register SIMachineFunctionInfo::addDispatchID(const SIRegisterInfo &TRI) {
  ArgInfo.DispatchID = ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
      getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass));
  NumUserSGPRs += 2;
  return ArgInfo.DispatchID.getRegister();
}

Register SIMachineFunctionInfo::addFlatScratchInit(const SIRegisterInfo &TRI) {
  ArgInfo.FlatScratchInit =
      ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
          getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass));
  NumUserSGPRs += 2;
  return ArgInfo.FlatScratchInit.getRegister();
}

Register
SIMachineFunctionInfo::addImplicitBufferPtr(const SIRegisterInfo &TRI) {
  ArgInfo.ImplicitBufferPtr =
      ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
          getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass));
  NumUserSGPRs += 2;
  return ArgInfo.ImplicitBufferPtr.getRegister();
}

Register SIMachineFunctionInfo::addLDSKernelId() {
  ArgInfo.LDSKernelId = ArgDescriptor::createRegister(getNextUserSGPR());
  NumUserSGPRs += 1;
  return ArgInfo.LDSKernelId.getRegister();
}

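// Allocate user SGPRs for a kernel argument that the hardware preloads,
// optionally skipping PaddingSGPRs first to satisfy the argument's alignment.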
SmallVectorImpl<MCRegister> *SIMachineFunctionInfo::addPreloadedKernArg(
    const SIRegisterInfo &TRI, const TargetRegisterClass *RC,
    unsigned AllocSizeDWord, int KernArgIdx, int PaddingSGPRs) {
  assert(!ArgInfo.PreloadKernArgs.count(KernArgIdx) &&
         "Preload kernel argument allocated twice.");
  NumUserSGPRs += PaddingSGPRs;
  // If the available register tuples are aligned with the kernarg to be
  // preloaded, use that register; otherwise we need to use a set of SGPRs
  // and merge them.
  Register PreloadReg =
      TRI.getMatchingSuperReg(getNextUserSGPR(), AMDGPU::sub0, RC);
  if (PreloadReg &&
      (RC == &AMDGPU::SReg_32RegClass || RC == &AMDGPU::SReg_64RegClass)) {
    ArgInfo.PreloadKernArgs[KernArgIdx].Regs.push_back(PreloadReg);
    NumUserSGPRs += AllocSizeDWord;
  } else {
    for (unsigned I = 0; I < AllocSizeDWord; ++I) {
      ArgInfo.PreloadKernArgs[KernArgIdx].Regs.push_back(getNextUserSGPR());
      NumUserSGPRs++;
    }
  }

  // Track the actual number of SGPRs that HW will preload to.
  UserSGPRInfo.allocKernargPreloadSGPRs(AllocSizeDWord + PaddingSGPRs);
  return &ArgInfo.PreloadKernArgs[KernArgIdx].Regs;
}

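// Create a stack object for spilling the given WWM-register VGPR so that its
// inactive lanes can be preserved.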
void SIMachineFunctionInfo::allocateWWMSpill(MachineFunction &MF, Register VGPR,
                                             uint64_t Size, Align Alignment) {
  // Skip if it is an entry function or the register is already added.
  if (isEntryFunction() || WWMSpills.count(VGPR))
    return;

  // Skip if this is a function with the amdgpu_cs_chain or
  // amdgpu_cs_chain_preserve calling convention and this is a scratch register.
  // We never need to allocate a spill for these because we don't even need to
  // restore the inactive lanes for them (they're scratchier than the usual
  // scratch registers).
  if (isChainFunction() && SIRegisterInfo::isChainScratchRegister(VGPR))
    return;

  WWMSpills.insert(std::make_pair(
      VGPR, MF.getFrameInfo().CreateSpillStackObject(Size, Alignment)));
}

// Separate out the callee-saved and scratch registers.
void SIMachineFunctionInfo::splitWWMSpillRegisters(
    MachineFunction &MF,
    SmallVectorImpl<std::pair<Register, int>> &CalleeSavedRegs,
    SmallVectorImpl<std::pair<Register, int>> &ScratchRegs) const {
  const MCPhysReg *CSRegs = MF.getRegInfo().getCalleeSavedRegs();
  for (auto &Reg : WWMSpills) {
    if (isCalleeSavedReg(CSRegs, Reg.first))
      CalleeSavedRegs.push_back(Reg);
    else
      ScratchRegs.push_back(Reg);
  }
}

bool SIMachineFunctionInfo::isCalleeSavedReg(const MCPhysReg *CSRegs,
                                             MCPhysReg Reg) const {
  for (unsigned I = 0; CSRegs[I]; ++I) {
    if (CSRegs[I] == Reg)
      return true;
  }

  return false;
}

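// After register allocation, remap each physical VGPR reserved for SGPR
// spills down to the lowest unused VGPR, keeping the spill tables and block
// live-ins in sync.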
void SIMachineFunctionInfo::shiftSpillPhysVGPRsToLowestRange(
    MachineFunction &MF) {
  const SIRegisterInfo *TRI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  for (unsigned I = 0, E = SpillPhysVGPRs.size(); I < E; ++I) {
    Register Reg = SpillPhysVGPRs[I];
    Register NewReg =
        TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF);
    if (!NewReg || NewReg >= Reg)
      break;

    MRI.replaceRegWith(Reg, NewReg);

    // Update various tables with the new VGPR.
    SpillPhysVGPRs[I] = NewReg;
    WWMReservedRegs.remove(Reg);
    WWMReservedRegs.insert(NewReg);
    WWMSpills.insert(std::make_pair(NewReg, WWMSpills[Reg]));
    WWMSpills.erase(Reg);

    for (MachineBasicBlock &MBB : MF) {
      MBB.removeLiveIn(Reg);
      MBB.sortUniqueLiveIns();
    }
  }
}

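// Map one lane of a virtual VGPR to the SGPR spill at frame index FI. A fresh
// virtual register is created only when lane 0 is requested; subsequent lanes
// reuse the most recently created one.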
bool SIMachineFunctionInfo::allocateVirtualVGPRForSGPRSpills(
    MachineFunction &MF, int FI, unsigned LaneIndex) {
  MachineRegisterInfo &MRI = MF.getRegInfo();
  Register LaneVGPR;
  if (!LaneIndex) {
    LaneVGPR = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    SpillVGPRs.push_back(LaneVGPR);
  } else {
    LaneVGPR = SpillVGPRs.back();
  }

  SGPRSpillsToVirtualVGPRLanes[FI].push_back(
      SIRegisterInfo::SpilledReg(LaneVGPR, LaneIndex));
  return true;
}

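// As above, but spill to a lane of a reserved physical VGPR, typically for
// prolog/epilog (CSR) SGPR spills.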
bool SIMachineFunctionInfo::allocatePhysicalVGPRForSGPRSpills(
    MachineFunction &MF, int FI, unsigned LaneIndex, bool IsPrologEpilog) {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  Register LaneVGPR;
  if (!LaneIndex) {
    // Find the highest available register if called before RA to ensure the
    // lowest registers are available for allocation. The LaneVGPR, in that
    // case, will be shifted back to the lowest range after VGPR allocation.
    LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF,
                                       !IsPrologEpilog);
    if (LaneVGPR == AMDGPU::NoRegister) {
      // We have no VGPRs left for spilling SGPRs. Reset because we will not
      // partially spill the SGPR to VGPRs.
      SGPRSpillsToPhysicalVGPRLanes.erase(FI);
      return false;
    }

    allocateWWMSpill(MF, LaneVGPR);
    reserveWWMRegister(LaneVGPR);
    for (MachineBasicBlock &MBB : MF) {
      MBB.addLiveIn(LaneVGPR);
      MBB.sortUniqueLiveIns();
    }
    SpillPhysVGPRs.push_back(LaneVGPR);
  } else {
    LaneVGPR = SpillPhysVGPRs.back();
  }

  SGPRSpillsToPhysicalVGPRLanes[FI].push_back(
      SIRegisterInfo::SpilledReg(LaneVGPR, LaneIndex));
  return true;
}

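// Reserve one VGPR lane per 4-byte word of the SGPR spill at FI, drawing from
// physical or virtual VGPR lanes as requested.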
bool SIMachineFunctionInfo::allocateSGPRSpillToVGPRLane(
    MachineFunction &MF, int FI, bool SpillToPhysVGPRLane,
    bool IsPrologEpilog) {
  std::vector<SIRegisterInfo::SpilledReg> &SpillLanes =
      SpillToPhysVGPRLane ? SGPRSpillsToPhysicalVGPRLanes[FI]
                          : SGPRSpillsToVirtualVGPRLanes[FI];

  // This has already been allocated.
  if (!SpillLanes.empty())
    return true;

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  unsigned WaveSize = ST.getWavefrontSize();

  unsigned Size = FrameInfo.getObjectSize(FI);
  unsigned NumLanes = Size / 4;

  if (NumLanes > WaveSize)
    return false;

  assert(Size >= 4 && "invalid sgpr spill size");
  assert(ST.getRegisterInfo()->spillSGPRToVGPR() &&
         "not spilling SGPRs to VGPRs");

  unsigned &NumSpillLanes = SpillToPhysVGPRLane ? NumPhysicalVGPRSpillLanes
                                                : NumVirtualVGPRSpillLanes;

  for (unsigned I = 0; I < NumLanes; ++I, ++NumSpillLanes) {
    unsigned LaneIndex = (NumSpillLanes % WaveSize);

    bool Allocated = SpillToPhysVGPRLane
                         ? allocatePhysicalVGPRForSGPRSpills(MF, FI, LaneIndex,
                                                             IsPrologEpilog)
                         : allocateVirtualVGPRForSGPRSpills(MF, FI, LaneIndex);
    if (!Allocated) {
      NumSpillLanes -= I;
      return false;
    }
  }

  return true;
}

/// Reserve AGPRs or VGPRs to support spilling for FrameIndex \p FI.
/// Either AGPRs are spilled to VGPRs or vice versa.
/// Returns true if \p FI can be eliminated completely.
bool SIMachineFunctionInfo::allocateVGPRSpillToAGPR(MachineFunction &MF,
                                                    int FI,
                                                    bool isAGPRtoVGPR) {
  MachineRegisterInfo &MRI = MF.getRegInfo();
  MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();

  assert(ST.hasMAIInsts() && FrameInfo.isSpillSlotObjectIndex(FI));

  auto &Spill = VGPRToAGPRSpills[FI];

  // This has already been allocated.
  if (!Spill.Lanes.empty())
    return Spill.FullyAllocated;

  unsigned Size = FrameInfo.getObjectSize(FI);
  unsigned NumLanes = Size / 4;
  Spill.Lanes.resize(NumLanes, AMDGPU::NoRegister);

  const TargetRegisterClass &RC =
      isAGPRtoVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::AGPR_32RegClass;
  auto Regs = RC.getRegisters();

  auto &SpillRegs = isAGPRtoVGPR ? SpillAGPR : SpillVGPR;
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  Spill.FullyAllocated = true;

  // FIXME: Move allocation logic out of MachineFunctionInfo and initialize
  // once.
  BitVector OtherUsedRegs;
  OtherUsedRegs.resize(TRI->getNumRegs());

  const uint32_t *CSRMask =
      TRI->getCallPreservedMask(MF, MF.getFunction().getCallingConv());
  if (CSRMask)
    OtherUsedRegs.setBitsInMask(CSRMask);

  // TODO: Should include register tuples, but doesn't matter with current
  // usage.
  for (MCPhysReg Reg : SpillAGPR)
    OtherUsedRegs.set(Reg);
  for (MCPhysReg Reg : SpillVGPR)
    OtherUsedRegs.set(Reg);

  SmallVectorImpl<MCPhysReg>::const_iterator NextSpillReg = Regs.begin();
  for (int I = NumLanes - 1; I >= 0; --I) {
    NextSpillReg = std::find_if(
        NextSpillReg, Regs.end(), [&MRI, &OtherUsedRegs](MCPhysReg Reg) {
          return MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg) &&
                 !OtherUsedRegs[Reg];
        });

    if (NextSpillReg == Regs.end()) { // Registers exhausted
      Spill.FullyAllocated = false;
      break;
    }

    OtherUsedRegs.set(*NextSpillReg);
    SpillRegs.push_back(*NextSpillReg);
    MRI.reserveReg(*NextSpillReg, TRI);
    Spill.Lanes[I] = *NextSpillReg++;
  }

  return Spill.FullyAllocated;
}

bool SIMachineFunctionInfo::removeDeadFrameIndices(
    MachineFrameInfo &MFI, bool ResetSGPRSpillStackIDs) {
  // Remove dead frame indices from the function frame, but keep FP and BP
  // since spills for them haven't been inserted yet. Also remove the frame
  // indices from `SGPRSpillsToVirtualVGPRLanes`; otherwise a later pass such
  // as stack slot coloring could re-map the freed indices and introduce bugs.
  for (auto &R : make_early_inc_range(SGPRSpillsToVirtualVGPRLanes)) {
    MFI.RemoveStackObject(R.first);
    SGPRSpillsToVirtualVGPRLanes.erase(R.first);
  }

  // Remove the dead frame indices of CSR SGPRs which are spilled to physical
  // VGPR lanes during the SILowerSGPRSpills pass.
  if (!ResetSGPRSpillStackIDs) {
    for (auto &R : make_early_inc_range(SGPRSpillsToPhysicalVGPRLanes)) {
      MFI.RemoveStackObject(R.first);
      SGPRSpillsToPhysicalVGPRLanes.erase(R.first);
    }
  }
  bool HaveSGPRToMemory = false;

  if (ResetSGPRSpillStackIDs) {
    // All other SGPRs must be allocated on the default stack, so reset the
    // stack ID.
    for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd(); I != E;
         ++I) {
      if (!checkIndexInPrologEpilogSGPRSpills(I)) {
        if (MFI.getStackID(I) == TargetStackID::SGPRSpill) {
          MFI.setStackID(I, TargetStackID::Default);
          HaveSGPRToMemory = true;
        }
      }
    }
  }

  for (auto &R : VGPRToAGPRSpills) {
    if (R.second.IsDead)
      MFI.RemoveStackObject(R.first);
  }

  return HaveSGPRToMemory;
}

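// Lazily create, and cache, an emergency stack slot for the register
// scavenger, sized and aligned for a single SGPR.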
int SIMachineFunctionInfo::getScavengeFI(MachineFrameInfo &MFI,
                                         const SIRegisterInfo &TRI) {
  if (ScavengeFI)
    return *ScavengeFI;

  ScavengeFI =
      MFI.CreateStackObject(TRI.getSpillSize(AMDGPU::SGPR_32RegClass),
                            TRI.getSpillAlign(AMDGPU::SGPR_32RegClass), false);
  return *ScavengeFI;
}

MCPhysReg SIMachineFunctionInfo::getNextUserSGPR() const {
  assert(NumSystemSGPRs == 0 && "System SGPRs must be added after user SGPRs");
  return AMDGPU::SGPR0 + NumUserSGPRs;
}

MCPhysReg SIMachineFunctionInfo::getNextSystemSGPR() const {
  return AMDGPU::SGPR0 + NumUserSGPRs + NumSystemSGPRs;
}

void SIMachineFunctionInfo::MRI_NoteNewVirtualRegister(Register Reg) {
  VRegFlags.grow(Reg);
}

void SIMachineFunctionInfo::MRI_NoteCloneVirtualRegister(Register NewReg,
                                                         Register SrcReg) {
  VRegFlags.grow(NewReg);
  VRegFlags[NewReg] = VRegFlags[SrcReg];
}

Register
SIMachineFunctionInfo::getGITPtrLoReg(const MachineFunction &MF) const {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  if (!ST.isAmdPalOS())
    return Register();
  Register GitPtrLo = AMDGPU::SGPR0; // Low GIT address passed in
  if (ST.hasMergedShaders()) {
    switch (MF.getFunction().getCallingConv()) {
    case CallingConv::AMDGPU_HS:
    case CallingConv::AMDGPU_GS:
      // Low GIT address is passed in s8 rather than s0 for an LS+HS or
      // ES+GS merged shader on gfx9+.
      GitPtrLo = AMDGPU::SGPR8;
      return GitPtrLo;
    default:
      return GitPtrLo;
    }
  }
  return GitPtrLo;
}

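// Render a register in the string form used by MIR serialization.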
static yaml::StringValue regToString(Register Reg,
                                     const TargetRegisterInfo &TRI) {
  yaml::StringValue Dest;
  {
    raw_string_ostream OS(Dest.Value);
    OS << printReg(Reg, &TRI);
  }
  return Dest;
}

static std::optional<yaml::SIArgumentInfo>
convertArgumentInfo(const AMDGPUFunctionArgInfo &ArgInfo,
                    const TargetRegisterInfo &TRI) {
  yaml::SIArgumentInfo AI;

  auto convertArg = [&](std::optional<yaml::SIArgument> &A,
                        const ArgDescriptor &Arg) {
    if (!Arg)
      return false;

    // Create a register or stack argument.
    yaml::SIArgument SA = yaml::SIArgument::createArgument(Arg.isRegister());
    if (Arg.isRegister()) {
      raw_string_ostream OS(SA.RegisterName.Value);
      OS << printReg(Arg.getRegister(), &TRI);
    } else
      SA.StackOffset = Arg.getStackOffset();
    // Check and update the optional mask.
    if (Arg.isMasked())
      SA.Mask = Arg.getMask();

    A = SA;
    return true;
  };

  // TODO: Need to serialize kernarg preloads.
  bool Any = false;
  Any |= convertArg(AI.PrivateSegmentBuffer, ArgInfo.PrivateSegmentBuffer);
  Any |= convertArg(AI.DispatchPtr, ArgInfo.DispatchPtr);
  Any |= convertArg(AI.QueuePtr, ArgInfo.QueuePtr);
  Any |= convertArg(AI.KernargSegmentPtr, ArgInfo.KernargSegmentPtr);
  Any |= convertArg(AI.DispatchID, ArgInfo.DispatchID);
  Any |= convertArg(AI.FlatScratchInit, ArgInfo.FlatScratchInit);
  Any |= convertArg(AI.LDSKernelId, ArgInfo.LDSKernelId);
  Any |= convertArg(AI.PrivateSegmentSize, ArgInfo.PrivateSegmentSize);
  Any |= convertArg(AI.WorkGroupIDX, ArgInfo.WorkGroupIDX);
  Any |= convertArg(AI.WorkGroupIDY, ArgInfo.WorkGroupIDY);
  Any |= convertArg(AI.WorkGroupIDZ, ArgInfo.WorkGroupIDZ);
  Any |= convertArg(AI.WorkGroupInfo, ArgInfo.WorkGroupInfo);
  Any |= convertArg(AI.PrivateSegmentWaveByteOffset,
                    ArgInfo.PrivateSegmentWaveByteOffset);
  Any |= convertArg(AI.ImplicitArgPtr, ArgInfo.ImplicitArgPtr);
  Any |= convertArg(AI.ImplicitBufferPtr, ArgInfo.ImplicitBufferPtr);
  Any |= convertArg(AI.WorkItemIDX, ArgInfo.WorkItemIDX);
  Any |= convertArg(AI.WorkItemIDY, ArgInfo.WorkItemIDY);
  Any |= convertArg(AI.WorkItemIDZ, ArgInfo.WorkItemIDZ);

  if (Any)
    return AI;

  return std::nullopt;
}

yaml::SIMachineFunctionInfo::SIMachineFunctionInfo(
    const llvm::SIMachineFunctionInfo &MFI, const TargetRegisterInfo &TRI,
    const llvm::MachineFunction &MF)
    : ExplicitKernArgSize(MFI.getExplicitKernArgSize()),
      MaxKernArgAlign(MFI.getMaxKernArgAlign()), LDSSize(MFI.getLDSSize()),
      GDSSize(MFI.getGDSSize()), DynLDSAlign(MFI.getDynLDSAlign()),
      IsEntryFunction(MFI.isEntryFunction()),
      NoSignedZerosFPMath(MFI.hasNoSignedZerosFPMath()),
      MemoryBound(MFI.isMemoryBound()), WaveLimiter(MFI.needsWaveLimiter()),
      HasSpilledSGPRs(MFI.hasSpilledSGPRs()),
      HasSpilledVGPRs(MFI.hasSpilledVGPRs()),
      HighBitsOf32BitAddress(MFI.get32BitAddressHighBits()),
      Occupancy(MFI.getOccupancy()),
      ScratchRSrcReg(regToString(MFI.getScratchRSrcReg(), TRI)),
      FrameOffsetReg(regToString(MFI.getFrameOffsetReg(), TRI)),
      StackPtrOffsetReg(regToString(MFI.getStackPtrOffsetReg(), TRI)),
      BytesInStackArgArea(MFI.getBytesInStackArgArea()),
      ReturnsVoid(MFI.returnsVoid()),
      ArgInfo(convertArgumentInfo(MFI.getArgInfo(), TRI)),
      PSInputAddr(MFI.getPSInputAddr()), PSInputEnable(MFI.getPSInputEnable()),
      Mode(MFI.getMode()) {
  for (Register Reg : MFI.getWWMReservedRegs())
    WWMReservedRegs.push_back(regToString(Reg, TRI));

  if (MFI.getLongBranchReservedReg())
    LongBranchReservedReg = regToString(MFI.getLongBranchReservedReg(), TRI);
  if (MFI.getVGPRForAGPRCopy())
    VGPRForAGPRCopy = regToString(MFI.getVGPRForAGPRCopy(), TRI);

  if (MFI.getSGPRForEXECCopy())
    SGPRForEXECCopy = regToString(MFI.getSGPRForEXECCopy(), TRI);

  auto SFI = MFI.getOptionalScavengeFI();
  if (SFI)
    ScavengeFI = yaml::FrameIndex(*SFI, MF.getFrameInfo());
}

void yaml::SIMachineFunctionInfo::mappingImpl(yaml::IO &YamlIO) {
  MappingTraits<SIMachineFunctionInfo>::mapping(YamlIO, *this);
}

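// Initialize this function info from parsed MIR YAML, emitting a diagnostic
// with source location when the serialized scavenge frame index is invalid.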
bool SIMachineFunctionInfo::initializeBaseYamlFields(
    const yaml::SIMachineFunctionInfo &YamlMFI, const MachineFunction &MF,
    PerFunctionMIParsingState &PFS, SMDiagnostic &Error, SMRange &SourceRange) {
  ExplicitKernArgSize = YamlMFI.ExplicitKernArgSize;
  MaxKernArgAlign = YamlMFI.MaxKernArgAlign;
  LDSSize = YamlMFI.LDSSize;
  GDSSize = YamlMFI.GDSSize;
  DynLDSAlign = YamlMFI.DynLDSAlign;
  PSInputAddr = YamlMFI.PSInputAddr;
  PSInputEnable = YamlMFI.PSInputEnable;
  HighBitsOf32BitAddress = YamlMFI.HighBitsOf32BitAddress;
  Occupancy = YamlMFI.Occupancy;
  IsEntryFunction = YamlMFI.IsEntryFunction;
  NoSignedZerosFPMath = YamlMFI.NoSignedZerosFPMath;
  MemoryBound = YamlMFI.MemoryBound;
  WaveLimiter = YamlMFI.WaveLimiter;
  HasSpilledSGPRs = YamlMFI.HasSpilledSGPRs;
  HasSpilledVGPRs = YamlMFI.HasSpilledVGPRs;
  BytesInStackArgArea = YamlMFI.BytesInStackArgArea;
  ReturnsVoid = YamlMFI.ReturnsVoid;

  if (YamlMFI.ScavengeFI) {
    auto FIOrErr = YamlMFI.ScavengeFI->getFI(MF.getFrameInfo());
    if (!FIOrErr) {
      // Create a diagnostic for the frame index.
      const MemoryBuffer &Buffer =
          *PFS.SM->getMemoryBuffer(PFS.SM->getMainFileID());

      Error = SMDiagnostic(*PFS.SM, SMLoc(), Buffer.getBufferIdentifier(), 1, 1,
                           SourceMgr::DK_Error, toString(FIOrErr.takeError()),
                           "", std::nullopt, std::nullopt);
      SourceRange = YamlMFI.ScavengeFI->SourceRange;
      return true;
    }
    ScavengeFI = *FIOrErr;
  } else {
    ScavengeFI = std::nullopt;
  }
  return false;
}

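// AGPRs may be used unless the function carries the "amdgpu-no-agpr"
// attribute.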
bool SIMachineFunctionInfo::mayUseAGPRs(const Function &F) const {
  return !F.hasFnAttribute("amdgpu-no-agpr");
}

bool SIMachineFunctionInfo::usesAGPRs(const MachineFunction &MF) const {
  if (UsesAGPRs)
    return *UsesAGPRs;

  if (!mayNeedAGPRs()) {
    UsesAGPRs = false;
    return false;
  }

  if (!AMDGPU::isEntryFunctionCC(MF.getFunction().getCallingConv()) ||
      MF.getFrameInfo().hasCalls()) {
    UsesAGPRs = true;
    return true;
  }

  const MachineRegisterInfo &MRI = MF.getRegInfo();

  for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
    const Register Reg = Register::index2VirtReg(I);
    const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg);
    if (RC && SIRegisterInfo::isAGPRClass(RC)) {
      UsesAGPRs = true;
      return true;
    }
    if (!RC && !MRI.use_empty(Reg) && MRI.getType(Reg).isValid()) {
      // Defer caching UsesAGPRs; the function might not have been regbank
      // selected yet.
      return true;
    }
  }

  for (MCRegister Reg : AMDGPU::AGPR_32RegClass) {
    if (MRI.isPhysRegUsed(Reg)) {
      UsesAGPRs = true;
      return true;
    }
  }

  UsesAGPRs = false;
  return false;
}