1 | //===-- AMDGPUAsmPrinter.cpp - AMDGPU assembly printer --------------------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | /// \file |
10 | /// |
11 | /// The AMDGPUAsmPrinter is used to print both assembly string and also binary |
12 | /// code. When passed an MCAsmStreamer it prints assembly and when passed |
13 | /// an MCObjectStreamer it outputs binary code. |
14 | // |
15 | //===----------------------------------------------------------------------===// |
16 | // |
17 | |
18 | #include "AMDGPUAsmPrinter.h" |
19 | #include "AMDGPU.h" |
20 | #include "AMDGPUHSAMetadataStreamer.h" |
21 | #include "AMDGPUResourceUsageAnalysis.h" |
22 | #include "AMDKernelCodeT.h" |
23 | #include "GCNSubtarget.h" |
24 | #include "MCTargetDesc/AMDGPUInstPrinter.h" |
25 | #include "MCTargetDesc/AMDGPUMCKernelDescriptor.h" |
26 | #include "MCTargetDesc/AMDGPUTargetStreamer.h" |
27 | #include "R600AsmPrinter.h" |
28 | #include "SIMachineFunctionInfo.h" |
29 | #include "TargetInfo/AMDGPUTargetInfo.h" |
30 | #include "Utils/AMDGPUBaseInfo.h" |
31 | #include "llvm/Analysis/OptimizationRemarkEmitter.h" |
32 | #include "llvm/BinaryFormat/ELF.h" |
33 | #include "llvm/CodeGen/MachineFrameInfo.h" |
34 | #include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h" |
35 | #include "llvm/IR/DiagnosticInfo.h" |
36 | #include "llvm/MC/MCAssembler.h" |
37 | #include "llvm/MC/MCContext.h" |
38 | #include "llvm/MC/MCSectionELF.h" |
39 | #include "llvm/MC/MCStreamer.h" |
40 | #include "llvm/MC/TargetRegistry.h" |
41 | #include "llvm/Support/AMDHSAKernelDescriptor.h" |
42 | #include "llvm/Target/TargetLoweringObjectFile.h" |
43 | #include "llvm/Target/TargetMachine.h" |
44 | #include "llvm/TargetParser/TargetParser.h" |
45 | |
46 | using namespace llvm; |
47 | using namespace llvm::AMDGPU; |
48 | |
49 | // This should get the default rounding mode from the kernel. We just set the |
50 | // default here, but this could change if the OpenCL rounding mode pragmas are |
51 | // used. |
52 | // |
53 | // The denormal mode here should match what is reported by the OpenCL runtime |
54 | // for the CL_FP_DENORM bit from CL_DEVICE_{HALF|SINGLE|DOUBLE}_FP_CONFIG, but |
// can also be overridden to flush with the -cl-denorms-are-zero compiler flag.
56 | // |
57 | // AMD OpenCL only sets flush none and reports CL_FP_DENORM for double |
58 | // precision, and leaves single precision to flush all and does not report |
59 | // CL_FP_DENORM for CL_DEVICE_SINGLE_FP_CONFIG. Mesa's OpenCL currently reports |
60 | // CL_FP_DENORM for both. |
61 | // |
62 | // FIXME: It seems some instructions do not support single precision denormals |
63 | // regardless of the mode (exp_*_f32, rcp_*_f32, rsq_*_f32, rsq_*f32, sqrt_f32, |
64 | // and sin_f32, cos_f32 on most parts). |
65 | |
66 | // We want to use these instructions, and using fp32 denormals also causes |
67 | // instructions to run at the double precision rate for the device so it's |
68 | // probably best to just report no single precision denormals. |
69 | static uint32_t getFPMode(SIModeRegisterDefaults Mode) { |
70 | return FP_ROUND_MODE_SP(FP_ROUND_ROUND_TO_NEAREST) | |
71 | FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_NEAREST) | |
72 | FP_DENORM_MODE_SP(Mode.fpDenormModeSPValue()) | |
73 | FP_DENORM_MODE_DP(Mode.fpDenormModeDPValue()); |
74 | } |
75 | |
76 | static AsmPrinter * |
77 | createAMDGPUAsmPrinterPass(TargetMachine &tm, |
78 | std::unique_ptr<MCStreamer> &&Streamer) { |
79 | return new AMDGPUAsmPrinter(tm, std::move(Streamer)); |
80 | } |
81 | |
82 | extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUAsmPrinter() { |
83 | TargetRegistry::RegisterAsmPrinter(T&: getTheR600Target(), |
84 | Fn: llvm::createR600AsmPrinterPass); |
85 | TargetRegistry::RegisterAsmPrinter(T&: getTheGCNTarget(), |
86 | Fn: createAMDGPUAsmPrinterPass); |
87 | } |
88 | |
89 | AMDGPUAsmPrinter::AMDGPUAsmPrinter(TargetMachine &TM, |
90 | std::unique_ptr<MCStreamer> Streamer) |
91 | : AsmPrinter(TM, std::move(Streamer)) { |
92 | assert(OutStreamer && "AsmPrinter constructed without streamer" ); |
93 | } |
94 | |
95 | StringRef AMDGPUAsmPrinter::getPassName() const { |
96 | return "AMDGPU Assembly Printer" ; |
97 | } |
98 | |
99 | const MCSubtargetInfo *AMDGPUAsmPrinter::getGlobalSTI() const { |
100 | return TM.getMCSubtargetInfo(); |
101 | } |
102 | |
103 | AMDGPUTargetStreamer* AMDGPUAsmPrinter::getTargetStreamer() const { |
104 | if (!OutStreamer) |
105 | return nullptr; |
106 | return static_cast<AMDGPUTargetStreamer*>(OutStreamer->getTargetStreamer()); |
107 | } |
108 | |
109 | void AMDGPUAsmPrinter::emitStartOfAsmFile(Module &M) { |
110 | IsTargetStreamerInitialized = false; |
111 | } |
112 | |
113 | void AMDGPUAsmPrinter::initTargetStreamer(Module &M) { |
114 | IsTargetStreamerInitialized = true; |
115 | |
116 | // TODO: Which one is called first, emitStartOfAsmFile or |
117 | // emitFunctionBodyStart? |
118 | if (getTargetStreamer() && !getTargetStreamer()->getTargetID()) |
119 | initializeTargetID(M); |
120 | |
121 | if (TM.getTargetTriple().getOS() != Triple::AMDHSA && |
122 | TM.getTargetTriple().getOS() != Triple::AMDPAL) |
123 | return; |
124 | |
125 | getTargetStreamer()->EmitDirectiveAMDGCNTarget(); |
126 | |
127 | if (TM.getTargetTriple().getOS() == Triple::AMDHSA) { |
128 | getTargetStreamer()->EmitDirectiveAMDHSACodeObjectVersion( |
129 | COV: CodeObjectVersion); |
130 | HSAMetadataStream->begin(Mod: M, TargetID: *getTargetStreamer()->getTargetID()); |
131 | } |
132 | |
133 | if (TM.getTargetTriple().getOS() == Triple::AMDPAL) |
134 | getTargetStreamer()->getPALMetadata()->readFromIR(M); |
135 | } |
136 | |
137 | void AMDGPUAsmPrinter::emitEndOfAsmFile(Module &M) { |
138 | // Init target streamer if it has not yet happened |
139 | if (!IsTargetStreamerInitialized) |
140 | initTargetStreamer(M); |
141 | |
142 | if (TM.getTargetTriple().getOS() != Triple::AMDHSA) |
143 | getTargetStreamer()->EmitISAVersion(); |
144 | |
145 | // Emit HSA Metadata (NT_AMD_AMDGPU_HSA_METADATA). |
146 | // Emit HSA Metadata (NT_AMD_HSA_METADATA). |
147 | if (TM.getTargetTriple().getOS() == Triple::AMDHSA) { |
148 | HSAMetadataStream->end(); |
149 | bool Success = HSAMetadataStream->emitTo(TargetStreamer&: *getTargetStreamer()); |
150 | (void)Success; |
151 | assert(Success && "Malformed HSA Metadata" ); |
152 | } |
153 | } |
154 | |
155 | void AMDGPUAsmPrinter::emitFunctionBodyStart() { |
156 | const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>(); |
157 | const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>(); |
158 | const Function &F = MF->getFunction(); |
159 | |
160 | // TODO: We're checking this late, would be nice to check it earlier. |
161 | if (STM.requiresCodeObjectV6() && CodeObjectVersion < AMDGPU::AMDHSA_COV6) { |
162 | report_fatal_error( |
163 | STM.getCPU() + " is only available on code object version 6 or better" , |
164 | /*gen_crash_diag*/ false); |
165 | } |
166 | |
167 | // TODO: Which one is called first, emitStartOfAsmFile or |
168 | // emitFunctionBodyStart? |
169 | if (!getTargetStreamer()->getTargetID()) |
170 | initializeTargetID(M: *F.getParent()); |
171 | |
172 | const auto &FunctionTargetID = STM.getTargetID(); |
173 | // Make sure function's xnack settings are compatible with module's |
174 | // xnack settings. |
175 | if (FunctionTargetID.isXnackSupported() && |
176 | FunctionTargetID.getXnackSetting() != IsaInfo::TargetIDSetting::Any && |
177 | FunctionTargetID.getXnackSetting() != getTargetStreamer()->getTargetID()->getXnackSetting()) { |
178 | OutContext.reportError(L: {}, Msg: "xnack setting of '" + Twine(MF->getName()) + |
179 | "' function does not match module xnack setting" ); |
180 | return; |
181 | } |
182 | // Make sure function's sramecc settings are compatible with module's |
183 | // sramecc settings. |
184 | if (FunctionTargetID.isSramEccSupported() && |
185 | FunctionTargetID.getSramEccSetting() != IsaInfo::TargetIDSetting::Any && |
186 | FunctionTargetID.getSramEccSetting() != getTargetStreamer()->getTargetID()->getSramEccSetting()) { |
187 | OutContext.reportError(L: {}, Msg: "sramecc setting of '" + Twine(MF->getName()) + |
188 | "' function does not match module sramecc setting" ); |
189 | return; |
190 | } |
191 | |
192 | if (!MFI.isEntryFunction()) |
193 | return; |
194 | |
195 | if (STM.isMesaKernel(F) && |
196 | (F.getCallingConv() == CallingConv::AMDGPU_KERNEL || |
197 | F.getCallingConv() == CallingConv::SPIR_KERNEL)) { |
198 | amd_kernel_code_t KernelCode; |
199 | getAmdKernelCode(Out&: KernelCode, KernelInfo: CurrentProgramInfo, MF: *MF); |
200 | getTargetStreamer()->EmitAMDKernelCodeT(Header: KernelCode); |
201 | } |
202 | |
203 | if (STM.isAmdHsaOS()) |
204 | HSAMetadataStream->emitKernel(MF: *MF, ProgramInfo: CurrentProgramInfo); |
205 | |
206 | if (MFI.getNumKernargPreloadedSGPRs() > 0) { |
207 | assert(AMDGPU::hasKernargPreload(STM)); |
208 | getTargetStreamer()->EmitKernargPreloadHeader(STI: *getGlobalSTI(), |
209 | TrapEnabled: STM.isAmdHsaOS()); |
210 | } |
211 | } |
212 | |
213 | void AMDGPUAsmPrinter::emitFunctionBodyEnd() { |
214 | const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>(); |
215 | if (!MFI.isEntryFunction()) |
216 | return; |
217 | |
218 | if (TM.getTargetTriple().getOS() != Triple::AMDHSA) |
219 | return; |
220 | |
221 | auto &Streamer = getTargetStreamer()->getStreamer(); |
222 | auto &Context = Streamer.getContext(); |
223 | auto &ObjectFileInfo = *Context.getObjectFileInfo(); |
224 | auto &ReadOnlySection = *ObjectFileInfo.getReadOnlySection(); |
225 | |
226 | Streamer.pushSection(); |
227 | Streamer.switchSection(Section: &ReadOnlySection); |
228 | |
229 | // CP microcode requires the kernel descriptor to be allocated on 64 byte |
230 | // alignment. |
231 | Streamer.emitValueToAlignment(Alignment: Align(64), Value: 0, ValueSize: 1, MaxBytesToEmit: 0); |
232 | ReadOnlySection.ensureMinAlignment(MinAlignment: Align(64)); |
233 | |
234 | const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>(); |
235 | |
236 | SmallString<128> KernelName; |
237 | getNameWithPrefix(Name&: KernelName, GV: &MF->getFunction()); |
238 | getTargetStreamer()->EmitAmdhsaKernelDescriptor( |
239 | STM, KernelName, getAmdhsaKernelDescriptor(MF: *MF, PI: CurrentProgramInfo), |
240 | CurrentProgramInfo.NumVGPRsForWavesPerEU, |
241 | CurrentProgramInfo.NumSGPRsForWavesPerEU - |
242 | IsaInfo::getNumExtraSGPRs( |
243 | &STM, CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed, |
244 | getTargetStreamer()->getTargetID()->isXnackOnOrAny()), |
245 | CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed); |
246 | |
247 | Streamer.popSection(); |
248 | } |
249 | |
250 | void AMDGPUAsmPrinter::emitImplicitDef(const MachineInstr *MI) const { |
251 | Register RegNo = MI->getOperand(i: 0).getReg(); |
252 | |
253 | SmallString<128> Str; |
254 | raw_svector_ostream OS(Str); |
255 | OS << "implicit-def: " |
256 | << printReg(Reg: RegNo, TRI: MF->getSubtarget().getRegisterInfo()); |
257 | |
258 | if (MI->getAsmPrinterFlags() & AMDGPU::SGPR_SPILL) |
259 | OS << " : SGPR spill to VGPR lane" ; |
260 | |
261 | OutStreamer->AddComment(T: OS.str()); |
262 | OutStreamer->addBlankLine(); |
263 | } |
264 | |
265 | void AMDGPUAsmPrinter::emitFunctionEntryLabel() { |
266 | if (TM.getTargetTriple().getOS() == Triple::AMDHSA) { |
267 | AsmPrinter::emitFunctionEntryLabel(); |
268 | return; |
269 | } |
270 | |
271 | const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); |
272 | const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>(); |
273 | if (MFI->isEntryFunction() && STM.isAmdHsaOrMesa(MF->getFunction())) { |
274 | SmallString<128> SymbolName; |
275 | getNameWithPrefix(Name&: SymbolName, GV: &MF->getFunction()), |
276 | getTargetStreamer()->EmitAMDGPUSymbolType( |
277 | SymbolName, Type: ELF::STT_AMDGPU_HSA_KERNEL); |
278 | } |
279 | if (DumpCodeInstEmitter) { |
280 | // Disassemble function name label to text. |
281 | DisasmLines.push_back(x: MF->getName().str() + ":" ); |
282 | DisasmLineMaxLen = std::max(a: DisasmLineMaxLen, b: DisasmLines.back().size()); |
283 | HexLines.push_back(x: "" ); |
284 | } |
285 | |
286 | AsmPrinter::emitFunctionEntryLabel(); |
287 | } |
288 | |
289 | void AMDGPUAsmPrinter::emitBasicBlockStart(const MachineBasicBlock &MBB) { |
290 | if (DumpCodeInstEmitter && !isBlockOnlyReachableByFallthrough(MBB: &MBB)) { |
291 | // Write a line for the basic block label if it is not only fallthrough. |
292 | DisasmLines.push_back( |
293 | x: (Twine("BB" ) + Twine(getFunctionNumber()) |
294 | + "_" + Twine(MBB.getNumber()) + ":" ).str()); |
295 | DisasmLineMaxLen = std::max(a: DisasmLineMaxLen, b: DisasmLines.back().size()); |
296 | HexLines.push_back(x: "" ); |
297 | } |
298 | AsmPrinter::emitBasicBlockStart(MBB); |
299 | } |
300 | |
301 | void AMDGPUAsmPrinter::emitGlobalVariable(const GlobalVariable *GV) { |
302 | if (GV->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) { |
303 | if (GV->hasInitializer() && !isa<UndefValue>(Val: GV->getInitializer())) { |
304 | OutContext.reportError(L: {}, |
305 | Msg: Twine(GV->getName()) + |
306 | ": unsupported initializer for address space" ); |
307 | return; |
308 | } |
309 | |
310 | // LDS variables aren't emitted in HSA or PAL yet. |
311 | const Triple::OSType OS = TM.getTargetTriple().getOS(); |
312 | if (OS == Triple::AMDHSA || OS == Triple::AMDPAL) |
313 | return; |
314 | |
315 | MCSymbol *GVSym = getSymbol(GV); |
316 | |
317 | GVSym->redefineIfPossible(); |
318 | if (GVSym->isDefined() || GVSym->isVariable()) |
319 | report_fatal_error(reason: "symbol '" + Twine(GVSym->getName()) + |
320 | "' is already defined" ); |
321 | |
322 | const DataLayout &DL = GV->getParent()->getDataLayout(); |
323 | uint64_t Size = DL.getTypeAllocSize(Ty: GV->getValueType()); |
324 | Align Alignment = GV->getAlign().value_or(u: Align(4)); |
325 | |
326 | emitVisibility(Sym: GVSym, Visibility: GV->getVisibility(), IsDefinition: !GV->isDeclaration()); |
327 | emitLinkage(GV, GVSym); |
328 | auto TS = getTargetStreamer(); |
329 | TS->emitAMDGPULDS(Symbol: GVSym, Size, Alignment); |
330 | return; |
331 | } |
332 | |
333 | AsmPrinter::emitGlobalVariable(GV); |
334 | } |
335 | |
336 | bool AMDGPUAsmPrinter::doInitialization(Module &M) { |
337 | CodeObjectVersion = AMDGPU::getAMDHSACodeObjectVersion(M); |
338 | |
339 | if (TM.getTargetTriple().getOS() == Triple::AMDHSA) { |
340 | switch (CodeObjectVersion) { |
341 | case AMDGPU::AMDHSA_COV4: |
342 | HSAMetadataStream.reset(p: new HSAMD::MetadataStreamerMsgPackV4()); |
343 | break; |
344 | case AMDGPU::AMDHSA_COV5: |
345 | HSAMetadataStream.reset(p: new HSAMD::MetadataStreamerMsgPackV5()); |
346 | break; |
347 | case AMDGPU::AMDHSA_COV6: |
348 | HSAMetadataStream.reset(p: new HSAMD::MetadataStreamerMsgPackV6()); |
349 | break; |
350 | default: |
351 | report_fatal_error(reason: "Unexpected code object version" ); |
352 | } |
353 | } |
354 | return AsmPrinter::doInitialization(M); |
355 | } |
356 | |
357 | bool AMDGPUAsmPrinter::doFinalization(Module &M) { |
358 | // Pad with s_code_end to help tools and guard against instruction prefetch |
359 | // causing stale data in caches. Arguably this should be done by the linker, |
360 | // which is why this isn't done for Mesa. |
361 | const MCSubtargetInfo &STI = *getGlobalSTI(); |
362 | if ((AMDGPU::isGFX10Plus(STI) || AMDGPU::isGFX90A(STI)) && |
363 | (STI.getTargetTriple().getOS() == Triple::AMDHSA || |
364 | STI.getTargetTriple().getOS() == Triple::AMDPAL)) { |
365 | OutStreamer->switchSection(Section: getObjFileLowering().getTextSection()); |
366 | getTargetStreamer()->EmitCodeEnd(STI); |
367 | } |
368 | |
369 | return AsmPrinter::doFinalization(M); |
370 | } |
371 | |
372 | // Print comments that apply to both callable functions and entry points. |
373 | void AMDGPUAsmPrinter::( |
374 | uint32_t NumVGPR, std::optional<uint32_t> NumAGPR, uint32_t TotalNumVGPR, |
375 | uint32_t NumSGPR, uint64_t ScratchSize, uint64_t CodeSize, |
376 | const AMDGPUMachineFunction *MFI) { |
377 | OutStreamer->emitRawComment(T: " codeLenInByte = " + Twine(CodeSize), TabPrefix: false); |
378 | OutStreamer->emitRawComment(T: " NumSgprs: " + Twine(NumSGPR), TabPrefix: false); |
379 | OutStreamer->emitRawComment(T: " NumVgprs: " + Twine(NumVGPR), TabPrefix: false); |
380 | if (NumAGPR) { |
381 | OutStreamer->emitRawComment(T: " NumAgprs: " + Twine(*NumAGPR), TabPrefix: false); |
382 | OutStreamer->emitRawComment(T: " TotalNumVgprs: " + Twine(TotalNumVGPR), |
383 | TabPrefix: false); |
384 | } |
385 | OutStreamer->emitRawComment(T: " ScratchSize: " + Twine(ScratchSize), TabPrefix: false); |
386 | OutStreamer->emitRawComment(T: " MemoryBound: " + Twine(MFI->isMemoryBound()), |
387 | TabPrefix: false); |
388 | } |
389 | |
390 | uint16_t AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties( |
391 | const MachineFunction &MF) const { |
392 | const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>(); |
393 | uint16_t KernelCodeProperties = 0; |
394 | const GCNUserSGPRUsageInfo &UserSGPRInfo = MFI.getUserSGPRInfo(); |
395 | |
396 | if (UserSGPRInfo.hasPrivateSegmentBuffer()) { |
397 | KernelCodeProperties |= |
398 | amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER; |
399 | } |
400 | if (UserSGPRInfo.hasDispatchPtr()) { |
401 | KernelCodeProperties |= |
402 | amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR; |
403 | } |
404 | if (UserSGPRInfo.hasQueuePtr() && CodeObjectVersion < AMDGPU::AMDHSA_COV5) { |
405 | KernelCodeProperties |= |
406 | amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR; |
407 | } |
408 | if (UserSGPRInfo.hasKernargSegmentPtr()) { |
409 | KernelCodeProperties |= |
410 | amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR; |
411 | } |
412 | if (UserSGPRInfo.hasDispatchID()) { |
413 | KernelCodeProperties |= |
414 | amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID; |
415 | } |
416 | if (UserSGPRInfo.hasFlatScratchInit()) { |
417 | KernelCodeProperties |= |
418 | amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT; |
419 | } |
420 | if (MF.getSubtarget<GCNSubtarget>().isWave32()) { |
421 | KernelCodeProperties |= |
422 | amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32; |
423 | } |
424 | |
425 | if (CurrentProgramInfo.DynamicCallStack && |
426 | CodeObjectVersion >= AMDGPU::AMDHSA_COV5) |
427 | KernelCodeProperties |= amdhsa::KERNEL_CODE_PROPERTY_USES_DYNAMIC_STACK; |
428 | |
429 | return KernelCodeProperties; |
430 | } |
431 | |
432 | MCKernelDescriptor |
433 | AMDGPUAsmPrinter::getAmdhsaKernelDescriptor(const MachineFunction &MF, |
434 | const SIProgramInfo &PI) const { |
435 | const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>(); |
436 | const Function &F = MF.getFunction(); |
437 | const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); |
438 | MCContext &Ctx = MF.getContext(); |
439 | |
440 | MCKernelDescriptor KernelDescriptor; |
441 | |
442 | assert(isUInt<32>(PI.ScratchSize)); |
443 | assert(isUInt<32>(PI.getComputePGMRSrc1(STM))); |
444 | assert(isUInt<32>(PI.getComputePGMRSrc2())); |
445 | |
446 | KernelDescriptor.group_segment_fixed_size = |
447 | MCConstantExpr::create(Value: PI.LDSSize, Ctx); |
448 | KernelDescriptor.private_segment_fixed_size = |
449 | MCConstantExpr::create(Value: PI.ScratchSize, Ctx); |
450 | |
451 | Align MaxKernArgAlign; |
452 | KernelDescriptor.kernarg_size = MCConstantExpr::create( |
453 | Value: STM.getKernArgSegmentSize(F, MaxKernArgAlign), Ctx); |
454 | |
455 | KernelDescriptor.compute_pgm_rsrc1 = |
456 | MCConstantExpr::create(Value: PI.getComputePGMRSrc1(ST: STM), Ctx); |
457 | KernelDescriptor.compute_pgm_rsrc2 = |
458 | MCConstantExpr::create(Value: PI.getComputePGMRSrc2(), Ctx); |
459 | KernelDescriptor.kernel_code_properties = |
460 | MCConstantExpr::create(Value: getAmdhsaKernelCodeProperties(MF), Ctx); |
461 | |
462 | assert(STM.hasGFX90AInsts() || CurrentProgramInfo.ComputePGMRSrc3GFX90A == 0); |
463 | KernelDescriptor.compute_pgm_rsrc3 = MCConstantExpr::create( |
464 | Value: STM.hasGFX90AInsts() ? CurrentProgramInfo.ComputePGMRSrc3GFX90A : 0, Ctx); |
465 | |
466 | KernelDescriptor.kernarg_preload = MCConstantExpr::create( |
467 | Value: AMDGPU::hasKernargPreload(STM) ? Info->getNumKernargPreloadedSGPRs() : 0, |
468 | Ctx); |
469 | |
470 | return KernelDescriptor; |
471 | } |
472 | |
473 | bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { |
474 | // Init target streamer lazily on the first function so that previous passes |
475 | // can set metadata. |
476 | if (!IsTargetStreamerInitialized) |
477 | initTargetStreamer(M&: *MF.getFunction().getParent()); |
478 | |
479 | ResourceUsage = &getAnalysis<AMDGPUResourceUsageAnalysis>(); |
480 | CurrentProgramInfo = SIProgramInfo(); |
481 | |
482 | const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>(); |
483 | |
484 | // The starting address of all shader programs must be 256 bytes aligned. |
485 | // Regular functions just need the basic required instruction alignment. |
486 | MF.setAlignment(MFI->isEntryFunction() ? Align(256) : Align(4)); |
487 | |
488 | SetupMachineFunction(MF); |
489 | |
490 | const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>(); |
491 | MCContext &Context = getObjFileLowering().getContext(); |
492 | // FIXME: This should be an explicit check for Mesa. |
493 | if (!STM.isAmdHsaOS() && !STM.isAmdPalOS()) { |
494 | MCSectionELF *ConfigSection = |
495 | Context.getELFSection(Section: ".AMDGPU.config" , Type: ELF::SHT_PROGBITS, Flags: 0); |
496 | OutStreamer->switchSection(Section: ConfigSection); |
497 | } |
498 | |
499 | if (MFI->isModuleEntryFunction()) { |
500 | getSIProgramInfo(Out&: CurrentProgramInfo, MF); |
501 | } |
502 | |
503 | if (STM.isAmdPalOS()) { |
504 | if (MFI->isEntryFunction()) |
505 | EmitPALMetadata(MF, KernelInfo: CurrentProgramInfo); |
506 | else if (MFI->isModuleEntryFunction()) |
507 | emitPALFunctionMetadata(MF); |
508 | } else if (!STM.isAmdHsaOS()) { |
509 | EmitProgramInfoSI(MF, KernelInfo: CurrentProgramInfo); |
510 | } |
511 | |
512 | DumpCodeInstEmitter = nullptr; |
513 | if (STM.dumpCode()) { |
514 | // For -dumpcode, get the assembler out of the streamer, even if it does |
515 | // not really want to let us have it. This only works with -filetype=obj. |
516 | bool SaveFlag = OutStreamer->getUseAssemblerInfoForParsing(); |
517 | OutStreamer->setUseAssemblerInfoForParsing(true); |
518 | MCAssembler *Assembler = OutStreamer->getAssemblerPtr(); |
519 | OutStreamer->setUseAssemblerInfoForParsing(SaveFlag); |
520 | if (Assembler) |
521 | DumpCodeInstEmitter = Assembler->getEmitterPtr(); |
522 | } |
523 | |
524 | DisasmLines.clear(); |
525 | HexLines.clear(); |
526 | DisasmLineMaxLen = 0; |
527 | |
528 | emitFunctionBody(); |
529 | |
530 | emitResourceUsageRemarks(MF, CurrentProgramInfo, isModuleEntryFunction: MFI->isModuleEntryFunction(), |
531 | hasMAIInsts: STM.hasMAIInsts()); |
532 | |
533 | if (isVerbose()) { |
534 | MCSectionELF * = |
535 | Context.getELFSection(Section: ".AMDGPU.csdata" , Type: ELF::SHT_PROGBITS, Flags: 0); |
536 | OutStreamer->switchSection(Section: CommentSection); |
537 | |
538 | if (!MFI->isEntryFunction()) { |
539 | OutStreamer->emitRawComment(T: " Function info:" , TabPrefix: false); |
540 | const AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo &Info = |
541 | ResourceUsage->getResourceInfo(F: &MF.getFunction()); |
542 | emitCommonFunctionComments( |
543 | NumVGPR: Info.NumVGPR, |
544 | NumAGPR: STM.hasMAIInsts() ? Info.NumAGPR : std::optional<uint32_t>(), |
545 | TotalNumVGPR: Info.getTotalNumVGPRs(ST: STM), |
546 | NumSGPR: Info.getTotalNumSGPRs(ST: MF.getSubtarget<GCNSubtarget>()), |
547 | ScratchSize: Info.PrivateSegmentSize, CodeSize: getFunctionCodeSize(MF), MFI); |
548 | return false; |
549 | } |
550 | |
551 | OutStreamer->emitRawComment(T: " Kernel info:" , TabPrefix: false); |
552 | emitCommonFunctionComments( |
553 | NumVGPR: CurrentProgramInfo.NumArchVGPR, |
554 | NumAGPR: STM.hasMAIInsts() ? CurrentProgramInfo.NumAccVGPR |
555 | : std::optional<uint32_t>(), |
556 | TotalNumVGPR: CurrentProgramInfo.NumVGPR, NumSGPR: CurrentProgramInfo.NumSGPR, |
557 | ScratchSize: CurrentProgramInfo.ScratchSize, CodeSize: getFunctionCodeSize(MF), MFI); |
558 | |
559 | OutStreamer->emitRawComment( |
560 | T: " FloatMode: " + Twine(CurrentProgramInfo.FloatMode), TabPrefix: false); |
561 | OutStreamer->emitRawComment( |
562 | T: " IeeeMode: " + Twine(CurrentProgramInfo.IEEEMode), TabPrefix: false); |
563 | OutStreamer->emitRawComment( |
564 | T: " LDSByteSize: " + Twine(CurrentProgramInfo.LDSSize) + |
565 | " bytes/workgroup (compile time only)" , TabPrefix: false); |
566 | |
567 | OutStreamer->emitRawComment( |
568 | T: " SGPRBlocks: " + Twine(CurrentProgramInfo.SGPRBlocks), TabPrefix: false); |
569 | OutStreamer->emitRawComment( |
570 | T: " VGPRBlocks: " + Twine(CurrentProgramInfo.VGPRBlocks), TabPrefix: false); |
571 | |
572 | OutStreamer->emitRawComment( |
573 | T: " NumSGPRsForWavesPerEU: " + |
574 | Twine(CurrentProgramInfo.NumSGPRsForWavesPerEU), TabPrefix: false); |
575 | OutStreamer->emitRawComment( |
576 | T: " NumVGPRsForWavesPerEU: " + |
577 | Twine(CurrentProgramInfo.NumVGPRsForWavesPerEU), TabPrefix: false); |
578 | |
579 | if (STM.hasGFX90AInsts()) |
580 | OutStreamer->emitRawComment( |
581 | T: " AccumOffset: " + |
582 | Twine((CurrentProgramInfo.AccumOffset + 1) * 4), TabPrefix: false); |
583 | |
584 | OutStreamer->emitRawComment( |
585 | T: " Occupancy: " + |
586 | Twine(CurrentProgramInfo.Occupancy), TabPrefix: false); |
587 | |
588 | OutStreamer->emitRawComment( |
589 | T: " WaveLimiterHint : " + Twine(MFI->needsWaveLimiter()), TabPrefix: false); |
590 | |
591 | OutStreamer->emitRawComment(T: " COMPUTE_PGM_RSRC2:SCRATCH_EN: " + |
592 | Twine(CurrentProgramInfo.ScratchEnable), |
593 | TabPrefix: false); |
594 | OutStreamer->emitRawComment(T: " COMPUTE_PGM_RSRC2:USER_SGPR: " + |
595 | Twine(CurrentProgramInfo.UserSGPR), |
596 | TabPrefix: false); |
597 | OutStreamer->emitRawComment(T: " COMPUTE_PGM_RSRC2:TRAP_HANDLER: " + |
598 | Twine(CurrentProgramInfo.TrapHandlerEnable), |
599 | TabPrefix: false); |
600 | OutStreamer->emitRawComment(T: " COMPUTE_PGM_RSRC2:TGID_X_EN: " + |
601 | Twine(CurrentProgramInfo.TGIdXEnable), |
602 | TabPrefix: false); |
603 | OutStreamer->emitRawComment(T: " COMPUTE_PGM_RSRC2:TGID_Y_EN: " + |
604 | Twine(CurrentProgramInfo.TGIdYEnable), |
605 | TabPrefix: false); |
606 | OutStreamer->emitRawComment(T: " COMPUTE_PGM_RSRC2:TGID_Z_EN: " + |
607 | Twine(CurrentProgramInfo.TGIdZEnable), |
608 | TabPrefix: false); |
609 | OutStreamer->emitRawComment(T: " COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: " + |
610 | Twine(CurrentProgramInfo.TIdIGCompCount), |
611 | TabPrefix: false); |
612 | |
613 | assert(STM.hasGFX90AInsts() || |
614 | CurrentProgramInfo.ComputePGMRSrc3GFX90A == 0); |
615 | if (STM.hasGFX90AInsts()) { |
616 | OutStreamer->emitRawComment( |
617 | T: " COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: " + |
618 | Twine((AMDHSA_BITS_GET(CurrentProgramInfo.ComputePGMRSrc3GFX90A, |
619 | amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET))), |
620 | TabPrefix: false); |
621 | OutStreamer->emitRawComment( |
622 | T: " COMPUTE_PGM_RSRC3_GFX90A:TG_SPLIT: " + |
623 | Twine((AMDHSA_BITS_GET(CurrentProgramInfo.ComputePGMRSrc3GFX90A, |
624 | amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT))), |
625 | TabPrefix: false); |
626 | } |
627 | } |
628 | |
629 | if (DumpCodeInstEmitter) { |
630 | |
631 | OutStreamer->switchSection( |
632 | Section: Context.getELFSection(Section: ".AMDGPU.disasm" , Type: ELF::SHT_PROGBITS, Flags: 0)); |
633 | |
634 | for (size_t i = 0; i < DisasmLines.size(); ++i) { |
635 | std::string = "\n" ; |
636 | if (!HexLines[i].empty()) { |
637 | Comment = std::string(DisasmLineMaxLen - DisasmLines[i].size(), ' '); |
638 | Comment += " ; " + HexLines[i] + "\n" ; |
639 | } |
640 | |
641 | OutStreamer->emitBytes(Data: StringRef(DisasmLines[i])); |
642 | OutStreamer->emitBytes(Data: StringRef(Comment)); |
643 | } |
644 | } |
645 | |
646 | return false; |
647 | } |
648 | |
649 | // TODO: Fold this into emitFunctionBodyStart. |
650 | void AMDGPUAsmPrinter::initializeTargetID(const Module &M) { |
651 | // In the beginning all features are either 'Any' or 'NotSupported', |
652 | // depending on global target features. This will cover empty modules. |
653 | getTargetStreamer()->initializeTargetID(STI: *getGlobalSTI(), |
654 | FeatureString: getGlobalSTI()->getFeatureString()); |
655 | |
656 | // If module is empty, we are done. |
657 | if (M.empty()) |
658 | return; |
659 | |
660 | // If module is not empty, need to find first 'Off' or 'On' feature |
661 | // setting per feature from functions in module. |
662 | for (auto &F : M) { |
663 | auto &TSTargetID = getTargetStreamer()->getTargetID(); |
664 | if ((!TSTargetID->isXnackSupported() || TSTargetID->isXnackOnOrOff()) && |
665 | (!TSTargetID->isSramEccSupported() || TSTargetID->isSramEccOnOrOff())) |
666 | break; |
667 | |
668 | const GCNSubtarget &STM = TM.getSubtarget<GCNSubtarget>(F); |
669 | const IsaInfo::AMDGPUTargetID &STMTargetID = STM.getTargetID(); |
670 | if (TSTargetID->isXnackSupported()) |
671 | if (TSTargetID->getXnackSetting() == IsaInfo::TargetIDSetting::Any) |
672 | TSTargetID->setXnackSetting(STMTargetID.getXnackSetting()); |
673 | if (TSTargetID->isSramEccSupported()) |
674 | if (TSTargetID->getSramEccSetting() == IsaInfo::TargetIDSetting::Any) |
675 | TSTargetID->setSramEccSetting(STMTargetID.getSramEccSetting()); |
676 | } |
677 | } |
678 | |
679 | uint64_t AMDGPUAsmPrinter::getFunctionCodeSize(const MachineFunction &MF) const { |
680 | const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>(); |
681 | const SIInstrInfo *TII = STM.getInstrInfo(); |
682 | |
683 | uint64_t CodeSize = 0; |
684 | |
685 | for (const MachineBasicBlock &MBB : MF) { |
686 | for (const MachineInstr &MI : MBB) { |
687 | // TODO: CodeSize should account for multiple functions. |
688 | |
689 | // TODO: Should we count size of debug info? |
690 | if (MI.isDebugInstr()) |
691 | continue; |
692 | |
693 | CodeSize += TII->getInstSizeInBytes(MI); |
694 | } |
695 | } |
696 | |
697 | return CodeSize; |
698 | } |
699 | |
700 | void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, |
701 | const MachineFunction &MF) { |
702 | const AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo &Info = |
703 | ResourceUsage->getResourceInfo(F: &MF.getFunction()); |
704 | const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>(); |
705 | |
706 | ProgInfo.NumArchVGPR = Info.NumVGPR; |
707 | ProgInfo.NumAccVGPR = Info.NumAGPR; |
708 | ProgInfo.NumVGPR = Info.getTotalNumVGPRs(ST: STM); |
709 | ProgInfo.AccumOffset = alignTo(Value: std::max(a: 1, b: Info.NumVGPR), Align: 4) / 4 - 1; |
710 | ProgInfo.TgSplit = STM.isTgSplitEnabled(); |
711 | ProgInfo.NumSGPR = Info.NumExplicitSGPR; |
712 | ProgInfo.ScratchSize = Info.PrivateSegmentSize; |
713 | ProgInfo.VCCUsed = Info.UsesVCC; |
714 | ProgInfo.FlatUsed = Info.UsesFlatScratch; |
715 | ProgInfo.DynamicCallStack = Info.HasDynamicallySizedStack || Info.HasRecursion; |
716 | |
717 | const uint64_t MaxScratchPerWorkitem = |
718 | STM.getMaxWaveScratchSize() / STM.getWavefrontSize(); |
719 | if (ProgInfo.ScratchSize > MaxScratchPerWorkitem) { |
720 | DiagnosticInfoStackSize DiagStackSize(MF.getFunction(), |
721 | ProgInfo.ScratchSize, |
722 | MaxScratchPerWorkitem, DS_Error); |
723 | MF.getFunction().getContext().diagnose(DI: DiagStackSize); |
724 | } |
725 | |
726 | const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); |
727 | |
728 | // The calculations related to SGPR/VGPR blocks are |
729 | // duplicated in part in AMDGPUAsmParser::calculateGPRBlocks, and could be |
730 | // unified. |
731 | unsigned = IsaInfo::getNumExtraSGPRs( |
732 | &STM, ProgInfo.VCCUsed, ProgInfo.FlatUsed, |
733 | getTargetStreamer()->getTargetID()->isXnackOnOrAny()); |
734 | |
735 | // Check the addressable register limit before we add ExtraSGPRs. |
736 | if (STM.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS && |
737 | !STM.hasSGPRInitBug()) { |
738 | unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs(); |
739 | if (ProgInfo.NumSGPR > MaxAddressableNumSGPRs) { |
740 | // This can happen due to a compiler bug or when using inline asm. |
741 | LLVMContext &Ctx = MF.getFunction().getContext(); |
742 | DiagnosticInfoResourceLimit Diag( |
743 | MF.getFunction(), "addressable scalar registers" , ProgInfo.NumSGPR, |
744 | MaxAddressableNumSGPRs, DS_Error, DK_ResourceLimit); |
745 | Ctx.diagnose(DI: Diag); |
746 | ProgInfo.NumSGPR = MaxAddressableNumSGPRs - 1; |
747 | } |
748 | } |
749 | |
750 | // Account for extra SGPRs and VGPRs reserved for debugger use. |
751 | ProgInfo.NumSGPR += ExtraSGPRs; |
752 | |
753 | const Function &F = MF.getFunction(); |
754 | |
755 | // Ensure there are enough SGPRs and VGPRs for wave dispatch, where wave |
756 | // dispatch registers are function args. |
757 | unsigned WaveDispatchNumSGPR = 0, WaveDispatchNumVGPR = 0; |
758 | |
759 | if (isShader(CC: F.getCallingConv())) { |
760 | bool IsPixelShader = |
761 | F.getCallingConv() == CallingConv::AMDGPU_PS && !STM.isAmdHsaOS(); |
762 | |
763 | // Calculate the number of VGPR registers based on the SPI input registers |
764 | uint32_t InputEna = 0; |
765 | uint32_t InputAddr = 0; |
766 | unsigned LastEna = 0; |
767 | |
768 | if (IsPixelShader) { |
769 | // Note for IsPixelShader: |
770 | // By this stage, all enabled inputs are tagged in InputAddr as well. |
771 | // We will use InputAddr to determine whether the input counts against the |
772 | // vgpr total and only use the InputEnable to determine the last input |
773 | // that is relevant - if extra arguments are used, then we have to honour |
774 | // the InputAddr for any intermediate non-enabled inputs. |
775 | InputEna = MFI->getPSInputEnable(); |
776 | InputAddr = MFI->getPSInputAddr(); |
777 | |
778 | // We only need to consider input args up to the last used arg. |
779 | assert((InputEna || InputAddr) && |
780 | "PSInputAddr and PSInputEnable should " |
781 | "never both be 0 for AMDGPU_PS shaders" ); |
782 | // There are some rare circumstances where InputAddr is non-zero and |
783 | // InputEna can be set to 0. In this case we default to setting LastEna |
784 | // to 1. |
785 | LastEna = InputEna ? llvm::Log2_32(Value: InputEna) + 1 : 1; |
786 | } |
787 | |
788 | // FIXME: We should be using the number of registers determined during |
789 | // calling convention lowering to legalize the types. |
790 | const DataLayout &DL = F.getParent()->getDataLayout(); |
791 | unsigned PSArgCount = 0; |
792 | unsigned IntermediateVGPR = 0; |
793 | for (auto &Arg : F.args()) { |
794 | unsigned NumRegs = (DL.getTypeSizeInBits(Ty: Arg.getType()) + 31) / 32; |
795 | if (Arg.hasAttribute(Attribute::Kind: InReg)) { |
796 | WaveDispatchNumSGPR += NumRegs; |
797 | } else { |
798 | // If this is a PS shader and we're processing the PS Input args (first |
799 | // 16 VGPR), use the InputEna and InputAddr bits to define how many |
800 | // VGPRs are actually used. |
801 | // Any extra VGPR arguments are handled as normal arguments (and |
802 | // contribute to the VGPR count whether they're used or not). |
803 | if (IsPixelShader && PSArgCount < 16) { |
804 | if ((1 << PSArgCount) & InputAddr) { |
805 | if (PSArgCount < LastEna) |
806 | WaveDispatchNumVGPR += NumRegs; |
807 | else |
808 | IntermediateVGPR += NumRegs; |
809 | } |
810 | PSArgCount++; |
811 | } else { |
812 | // If there are extra arguments we have to include the allocation for |
813 | // the non-used (but enabled with InputAddr) input arguments |
814 | if (IntermediateVGPR) { |
815 | WaveDispatchNumVGPR += IntermediateVGPR; |
816 | IntermediateVGPR = 0; |
817 | } |
818 | WaveDispatchNumVGPR += NumRegs; |
819 | } |
820 | } |
821 | } |
822 | ProgInfo.NumSGPR = std::max(a: ProgInfo.NumSGPR, b: WaveDispatchNumSGPR); |
823 | ProgInfo.NumArchVGPR = std::max(a: ProgInfo.NumVGPR, b: WaveDispatchNumVGPR); |
824 | ProgInfo.NumVGPR = |
825 | Info.getTotalNumVGPRs(ST: STM, NumAGPR: Info.NumAGPR, NumVGPR: ProgInfo.NumArchVGPR); |
826 | } |
827 | |
828 | // Adjust number of registers used to meet default/requested minimum/maximum |
829 | // number of waves per execution unit request. |
830 | ProgInfo.NumSGPRsForWavesPerEU = std::max( |
831 | a: std::max(a: ProgInfo.NumSGPR, b: 1u), b: STM.getMinNumSGPRs(WavesPerEU: MFI->getMaxWavesPerEU())); |
832 | ProgInfo.NumVGPRsForWavesPerEU = std::max( |
833 | a: std::max(a: ProgInfo.NumVGPR, b: 1u), b: STM.getMinNumVGPRs(WavesPerEU: MFI->getMaxWavesPerEU())); |
834 | |
835 | if (STM.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS || |
836 | STM.hasSGPRInitBug()) { |
837 | unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs(); |
838 | if (ProgInfo.NumSGPR > MaxAddressableNumSGPRs) { |
839 | // This can happen due to a compiler bug or when using inline asm to use |
840 | // the registers which are usually reserved for vcc etc. |
841 | LLVMContext &Ctx = MF.getFunction().getContext(); |
842 | DiagnosticInfoResourceLimit Diag(MF.getFunction(), "scalar registers" , |
843 | ProgInfo.NumSGPR, MaxAddressableNumSGPRs, |
844 | DS_Error, DK_ResourceLimit); |
845 | Ctx.diagnose(DI: Diag); |
846 | ProgInfo.NumSGPR = MaxAddressableNumSGPRs; |
847 | ProgInfo.NumSGPRsForWavesPerEU = MaxAddressableNumSGPRs; |
848 | } |
849 | } |
850 | |
851 | if (STM.hasSGPRInitBug()) { |
852 | ProgInfo.NumSGPR = |
853 | AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG; |
854 | ProgInfo.NumSGPRsForWavesPerEU = |
855 | AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG; |
856 | } |
857 | |
858 | if (MFI->getNumUserSGPRs() > STM.getMaxNumUserSGPRs()) { |
859 | LLVMContext &Ctx = MF.getFunction().getContext(); |
860 | DiagnosticInfoResourceLimit Diag(MF.getFunction(), "user SGPRs" , |
861 | MFI->getNumUserSGPRs(), |
862 | STM.getMaxNumUserSGPRs(), DS_Error); |
863 | Ctx.diagnose(DI: Diag); |
864 | } |
865 | |
866 | if (MFI->getLDSSize() > |
867 | static_cast<unsigned>(STM.getAddressableLocalMemorySize())) { |
868 | LLVMContext &Ctx = MF.getFunction().getContext(); |
869 | DiagnosticInfoResourceLimit Diag( |
870 | MF.getFunction(), "local memory" , MFI->getLDSSize(), |
871 | STM.getAddressableLocalMemorySize(), DS_Error); |
872 | Ctx.diagnose(DI: Diag); |
873 | } |
874 | |
875 | ProgInfo.SGPRBlocks = IsaInfo::getNumSGPRBlocks( |
876 | &STM, ProgInfo.NumSGPRsForWavesPerEU); |
877 | ProgInfo.VGPRBlocks = |
878 | IsaInfo::getEncodedNumVGPRBlocks(&STM, ProgInfo.NumVGPRsForWavesPerEU); |
879 | |
880 | const SIModeRegisterDefaults Mode = MFI->getMode(); |
881 | |
882 | // Set the value to initialize FP_ROUND and FP_DENORM parts of the mode |
883 | // register. |
884 | ProgInfo.FloatMode = getFPMode(Mode); |
885 | |
886 | ProgInfo.IEEEMode = Mode.IEEE; |
887 | |
888 | // Make clamp modifier on NaN input returns 0. |
889 | ProgInfo.DX10Clamp = Mode.DX10Clamp; |
890 | |
891 | unsigned LDSAlignShift; |
892 | if (STM.getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) { |
893 | // LDS is allocated in 64 dword blocks. |
894 | LDSAlignShift = 8; |
895 | } else { |
896 | // LDS is allocated in 128 dword blocks. |
897 | LDSAlignShift = 9; |
898 | } |
899 | |
900 | ProgInfo.SGPRSpill = MFI->getNumSpilledSGPRs(); |
901 | ProgInfo.VGPRSpill = MFI->getNumSpilledVGPRs(); |
902 | |
903 | ProgInfo.LDSSize = MFI->getLDSSize(); |
904 | ProgInfo.LDSBlocks = |
905 | alignTo(Value: ProgInfo.LDSSize, Align: 1ULL << LDSAlignShift) >> LDSAlignShift; |
906 | |
907 | // Scratch is allocated in 64-dword or 256-dword blocks. |
908 | unsigned ScratchAlignShift = |
909 | STM.getGeneration() >= AMDGPUSubtarget::GFX11 ? 8 : 10; |
910 | // We need to program the hardware with the amount of scratch memory that |
911 | // is used by the entire wave. ProgInfo.ScratchSize is the amount of |
912 | // scratch memory used per thread. |
913 | ProgInfo.ScratchBlocks = divideCeil( |
914 | ProgInfo.ScratchSize * STM.getWavefrontSize(), 1ULL << ScratchAlignShift); |
915 | |
916 | if (getIsaVersion(GPU: getGlobalSTI()->getCPU()).Major >= 10) { |
917 | ProgInfo.WgpMode = STM.isCuModeEnabled() ? 0 : 1; |
918 | ProgInfo.MemOrdered = 1; |
919 | } |
920 | |
921 | // 0 = X, 1 = XY, 2 = XYZ |
922 | unsigned TIDIGCompCnt = 0; |
923 | if (MFI->hasWorkItemIDZ()) |
924 | TIDIGCompCnt = 2; |
925 | else if (MFI->hasWorkItemIDY()) |
926 | TIDIGCompCnt = 1; |
927 | |
928 | // The private segment wave byte offset is the last of the system SGPRs. We |
929 | // initially assumed it was allocated, and may have used it. It shouldn't harm |
930 | // anything to disable it if we know the stack isn't used here. We may still |
931 | // have emitted code reading it to initialize scratch, but if that's unused |
932 | // reading garbage should be OK. |
933 | ProgInfo.ScratchEnable = |
934 | ProgInfo.ScratchBlocks > 0 || ProgInfo.DynamicCallStack; |
935 | ProgInfo.UserSGPR = MFI->getNumUserSGPRs(); |
936 | // For AMDHSA, TRAP_HANDLER must be zero, as it is populated by the CP. |
937 | ProgInfo.TrapHandlerEnable = |
938 | STM.isAmdHsaOS() ? 0 : STM.isTrapHandlerEnabled(); |
939 | ProgInfo.TGIdXEnable = MFI->hasWorkGroupIDX(); |
940 | ProgInfo.TGIdYEnable = MFI->hasWorkGroupIDY(); |
941 | ProgInfo.TGIdZEnable = MFI->hasWorkGroupIDZ(); |
942 | ProgInfo.TGSizeEnable = MFI->hasWorkGroupInfo(); |
943 | ProgInfo.TIdIGCompCount = TIDIGCompCnt; |
944 | ProgInfo.EXCPEnMSB = 0; |
945 | // For AMDHSA, LDS_SIZE must be zero, as it is populated by the CP. |
946 | ProgInfo.LdsSize = STM.isAmdHsaOS() ? 0 : ProgInfo.LDSBlocks; |
947 | ProgInfo.EXCPEnable = 0; |
948 | |
949 | if (STM.hasGFX90AInsts()) { |
950 | AMDHSA_BITS_SET(ProgInfo.ComputePGMRSrc3GFX90A, |
951 | amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET, |
952 | ProgInfo.AccumOffset); |
953 | AMDHSA_BITS_SET(ProgInfo.ComputePGMRSrc3GFX90A, |
954 | amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT, |
955 | ProgInfo.TgSplit); |
956 | } |
957 | |
958 | ProgInfo.Occupancy = STM.computeOccupancy(F: MF.getFunction(), LDSSize: ProgInfo.LDSSize, |
959 | NumSGPRs: ProgInfo.NumSGPRsForWavesPerEU, |
960 | NumVGPRs: ProgInfo.NumVGPRsForWavesPerEU); |
961 | const auto [MinWEU, MaxWEU] = |
962 | AMDGPU::getIntegerPairAttribute(F, Name: "amdgpu-waves-per-eu" , Default: {0, 0}, OnlyFirstRequired: true); |
963 | if (ProgInfo.Occupancy < MinWEU) { |
964 | DiagnosticInfoOptimizationFailure Diag( |
965 | F, F.getSubprogram(), |
966 | "failed to meet occupancy target given by 'amdgpu-waves-per-eu' in " |
967 | "'" + |
968 | F.getName() + "': desired occupancy was " + Twine(MinWEU) + |
969 | ", final occupancy is " + Twine(ProgInfo.Occupancy)); |
970 | F.getContext().diagnose(DI: Diag); |
971 | } |
972 | } |
973 | |
974 | static unsigned getRsrcReg(CallingConv::ID CallConv) { |
975 | switch (CallConv) { |
976 | default: [[fallthrough]]; |
977 | case CallingConv::AMDGPU_CS: return R_00B848_COMPUTE_PGM_RSRC1; |
978 | case CallingConv::AMDGPU_LS: return R_00B528_SPI_SHADER_PGM_RSRC1_LS; |
979 | case CallingConv::AMDGPU_HS: return R_00B428_SPI_SHADER_PGM_RSRC1_HS; |
980 | case CallingConv::AMDGPU_ES: return R_00B328_SPI_SHADER_PGM_RSRC1_ES; |
981 | case CallingConv::AMDGPU_GS: return R_00B228_SPI_SHADER_PGM_RSRC1_GS; |
982 | case CallingConv::AMDGPU_VS: return R_00B128_SPI_SHADER_PGM_RSRC1_VS; |
983 | case CallingConv::AMDGPU_PS: return R_00B028_SPI_SHADER_PGM_RSRC1_PS; |
984 | } |
985 | } |
986 | |
987 | void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF, |
988 | const SIProgramInfo &CurrentProgramInfo) { |
989 | const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); |
990 | const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>(); |
991 | unsigned RsrcReg = getRsrcReg(CallConv: MF.getFunction().getCallingConv()); |
992 | |
993 | if (AMDGPU::isCompute(CC: MF.getFunction().getCallingConv())) { |
994 | OutStreamer->emitInt32(R_00B848_COMPUTE_PGM_RSRC1); |
995 | |
996 | OutStreamer->emitInt32(Value: CurrentProgramInfo.getComputePGMRSrc1(ST: STM)); |
997 | |
998 | OutStreamer->emitInt32(R_00B84C_COMPUTE_PGM_RSRC2); |
999 | OutStreamer->emitInt32(Value: CurrentProgramInfo.getComputePGMRSrc2()); |
1000 | |
1001 | OutStreamer->emitInt32(R_00B860_COMPUTE_TMPRING_SIZE); |
1002 | OutStreamer->emitInt32( |
1003 | Value: STM.getGeneration() >= AMDGPUSubtarget::GFX12 |
1004 | ? S_00B860_WAVESIZE_GFX12Plus(CurrentProgramInfo.ScratchBlocks) |
1005 | : STM.getGeneration() == AMDGPUSubtarget::GFX11 |
1006 | ? S_00B860_WAVESIZE_GFX11(CurrentProgramInfo.ScratchBlocks) |
1007 | : S_00B860_WAVESIZE_PreGFX11(CurrentProgramInfo.ScratchBlocks)); |
1008 | |
1009 | // TODO: Should probably note flat usage somewhere. SC emits a "FlatPtr32 = |
1010 | // 0" comment but I don't see a corresponding field in the register spec. |
1011 | } else { |
1012 | OutStreamer->emitInt32(Value: RsrcReg); |
1013 | OutStreamer->emitIntValue(S_00B028_VGPRS(CurrentProgramInfo.VGPRBlocks) | |
1014 | S_00B028_SGPRS(CurrentProgramInfo.SGPRBlocks), Size: 4); |
1015 | OutStreamer->emitInt32(R_0286E8_SPI_TMPRING_SIZE); |
1016 | OutStreamer->emitInt32( |
1017 | Value: STM.getGeneration() >= AMDGPUSubtarget::GFX12 |
1018 | ? S_0286E8_WAVESIZE_GFX12Plus(CurrentProgramInfo.ScratchBlocks) |
1019 | : STM.getGeneration() == AMDGPUSubtarget::GFX11 |
1020 | ? S_0286E8_WAVESIZE_GFX11(CurrentProgramInfo.ScratchBlocks) |
1021 | : S_0286E8_WAVESIZE_PreGFX11(CurrentProgramInfo.ScratchBlocks)); |
1022 | } |
1023 | |
1024 | if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) { |
1025 | OutStreamer->emitInt32(R_00B02C_SPI_SHADER_PGM_RSRC2_PS); |
1026 | unsigned = STM.getGeneration() >= AMDGPUSubtarget::GFX11 |
1027 | ? divideCeil(Numerator: CurrentProgramInfo.LDSBlocks, Denominator: 2) |
1028 | : CurrentProgramInfo.LDSBlocks; |
1029 | OutStreamer->emitInt32(S_00B02C_EXTRA_LDS_SIZE(ExtraLDSSize)); |
1030 | OutStreamer->emitInt32(R_0286CC_SPI_PS_INPUT_ENA); |
1031 | OutStreamer->emitInt32(Value: MFI->getPSInputEnable()); |
1032 | OutStreamer->emitInt32(R_0286D0_SPI_PS_INPUT_ADDR); |
1033 | OutStreamer->emitInt32(Value: MFI->getPSInputAddr()); |
1034 | } |
1035 | |
1036 | OutStreamer->emitInt32(R_SPILLED_SGPRS); |
1037 | OutStreamer->emitInt32(Value: MFI->getNumSpilledSGPRs()); |
1038 | OutStreamer->emitInt32(R_SPILLED_VGPRS); |
1039 | OutStreamer->emitInt32(Value: MFI->getNumSpilledVGPRs()); |
1040 | } |
1041 | |
1042 | // Helper function to add common PAL Metadata 3.0+ |
1043 | static void EmitPALMetadataCommon(AMDGPUPALMetadata *MD, |
1044 | const SIProgramInfo &CurrentProgramInfo, |
1045 | CallingConv::ID CC, const GCNSubtarget &ST) { |
1046 | if (ST.hasIEEEMode()) |
1047 | MD->setHwStage(CC, field: ".ieee_mode" , Val: (bool)CurrentProgramInfo.IEEEMode); |
1048 | |
1049 | MD->setHwStage(CC, field: ".wgp_mode" , Val: (bool)CurrentProgramInfo.WgpMode); |
1050 | MD->setHwStage(CC, field: ".mem_ordered" , Val: (bool)CurrentProgramInfo.MemOrdered); |
1051 | |
1052 | if (AMDGPU::isCompute(CC)) { |
1053 | MD->setHwStage(CC, field: ".trap_present" , |
1054 | Val: (bool)CurrentProgramInfo.TrapHandlerEnable); |
1055 | MD->setHwStage(CC, field: ".excp_en" , Val: CurrentProgramInfo.EXCPEnable); |
1056 | |
1057 | MD->setHwStage(CC, field: ".lds_size" , |
1058 | Val: (unsigned)(CurrentProgramInfo.LdsSize * |
1059 | getLdsDwGranularity(ST) * sizeof(uint32_t))); |
1060 | } |
1061 | } |
1062 | |
1063 | // This is the equivalent of EmitProgramInfoSI above, but for when the OS type |
1064 | // is AMDPAL. It stores each compute/SPI register setting and other PAL |
1065 | // metadata items into the PALMD::Metadata, combining with any provided by the |
1066 | // frontend as LLVM metadata. Once all functions are written, the PAL metadata |
1067 | // is then written as a single block in the .note section. |
1068 | void AMDGPUAsmPrinter::EmitPALMetadata(const MachineFunction &MF, |
1069 | const SIProgramInfo &CurrentProgramInfo) { |
1070 | const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); |
1071 | auto CC = MF.getFunction().getCallingConv(); |
1072 | auto MD = getTargetStreamer()->getPALMetadata(); |
1073 | |
1074 | MD->setEntryPoint(CC, Name: MF.getFunction().getName()); |
1075 | MD->setNumUsedVgprs(CC, Val: CurrentProgramInfo.NumVGPRsForWavesPerEU); |
1076 | |
1077 | // Only set AGPRs for supported devices |
1078 | const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>(); |
1079 | if (STM.hasMAIInsts()) { |
1080 | MD->setNumUsedAgprs(CC, Val: CurrentProgramInfo.NumAccVGPR); |
1081 | } |
1082 | |
1083 | MD->setNumUsedSgprs(CC, Val: CurrentProgramInfo.NumSGPRsForWavesPerEU); |
1084 | if (MD->getPALMajorVersion() < 3) { |
1085 | MD->setRsrc1(CC, Val: CurrentProgramInfo.getPGMRSrc1(CC, ST: STM)); |
1086 | if (AMDGPU::isCompute(CC)) { |
1087 | MD->setRsrc2(CC, Val: CurrentProgramInfo.getComputePGMRSrc2()); |
1088 | } else { |
1089 | if (CurrentProgramInfo.ScratchBlocks > 0) |
1090 | MD->setRsrc2(CC, S_00B84C_SCRATCH_EN(1)); |
1091 | } |
1092 | } else { |
1093 | MD->setHwStage(CC, field: ".debug_mode" , Val: (bool)CurrentProgramInfo.DebugMode); |
1094 | MD->setHwStage(CC, field: ".scratch_en" , Val: (bool)CurrentProgramInfo.ScratchEnable); |
1095 | EmitPALMetadataCommon(MD, CurrentProgramInfo, CC, ST: STM); |
1096 | } |
1097 | |
1098 | // ScratchSize is in bytes, 16 aligned. |
1099 | MD->setScratchSize(CC, Val: alignTo(Value: CurrentProgramInfo.ScratchSize, Align: 16)); |
1100 | if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) { |
1101 | unsigned = STM.getGeneration() >= AMDGPUSubtarget::GFX11 |
1102 | ? divideCeil(Numerator: CurrentProgramInfo.LDSBlocks, Denominator: 2) |
1103 | : CurrentProgramInfo.LDSBlocks; |
1104 | if (MD->getPALMajorVersion() < 3) { |
1105 | MD->setRsrc2(CC, S_00B02C_EXTRA_LDS_SIZE(ExtraLDSSize)); |
1106 | MD->setSpiPsInputEna(MFI->getPSInputEnable()); |
1107 | MD->setSpiPsInputAddr(MFI->getPSInputAddr()); |
1108 | } else { |
1109 | // Graphics registers |
1110 | const unsigned = |
1111 | STM.getGeneration() >= AMDGPUSubtarget::GFX11 ? 256 : 128; |
1112 | MD->setGraphicsRegisters( |
1113 | field: ".ps_extra_lds_size" , |
1114 | Val: (unsigned)(ExtraLDSSize * ExtraLdsDwGranularity * sizeof(uint32_t))); |
1115 | |
1116 | // Set PsInputEna and PsInputAddr .spi_ps_input_ena and .spi_ps_input_addr |
1117 | static StringLiteral const PsInputFields[] = { |
1118 | ".persp_sample_ena" , ".persp_center_ena" , |
1119 | ".persp_centroid_ena" , ".persp_pull_model_ena" , |
1120 | ".linear_sample_ena" , ".linear_center_ena" , |
1121 | ".linear_centroid_ena" , ".line_stipple_tex_ena" , |
1122 | ".pos_x_float_ena" , ".pos_y_float_ena" , |
1123 | ".pos_z_float_ena" , ".pos_w_float_ena" , |
1124 | ".front_face_ena" , ".ancillary_ena" , |
1125 | ".sample_coverage_ena" , ".pos_fixed_pt_ena" }; |
1126 | unsigned PSInputEna = MFI->getPSInputEnable(); |
1127 | unsigned PSInputAddr = MFI->getPSInputAddr(); |
1128 | for (auto [Idx, Field] : enumerate(First: PsInputFields)) { |
1129 | MD->setGraphicsRegisters(field1: ".spi_ps_input_ena" , field2: Field, |
1130 | Val: (bool)((PSInputEna >> Idx) & 1)); |
1131 | MD->setGraphicsRegisters(field1: ".spi_ps_input_addr" , field2: Field, |
1132 | Val: (bool)((PSInputAddr >> Idx) & 1)); |
1133 | } |
1134 | } |
1135 | } |
1136 | |
1137 | // For version 3 and above the wave front size is already set in the metadata |
1138 | if (MD->getPALMajorVersion() < 3 && STM.isWave32()) |
1139 | MD->setWave32(MF.getFunction().getCallingConv()); |
1140 | } |
1141 | |
1142 | void AMDGPUAsmPrinter::emitPALFunctionMetadata(const MachineFunction &MF) { |
1143 | auto *MD = getTargetStreamer()->getPALMetadata(); |
1144 | const MachineFrameInfo &MFI = MF.getFrameInfo(); |
1145 | StringRef FnName = MF.getFunction().getName(); |
1146 | MD->setFunctionScratchSize(FnName, Val: MFI.getStackSize()); |
1147 | const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); |
1148 | |
1149 | if (MD->getPALMajorVersion() < 3) { |
1150 | // Set compute registers |
1151 | MD->setRsrc1(CC: CallingConv::AMDGPU_CS, |
1152 | Val: CurrentProgramInfo.getPGMRSrc1(CC: CallingConv::AMDGPU_CS, ST)); |
1153 | MD->setRsrc2(CC: CallingConv::AMDGPU_CS, |
1154 | Val: CurrentProgramInfo.getComputePGMRSrc2()); |
1155 | } else { |
1156 | EmitPALMetadataCommon(MD, CurrentProgramInfo, CC: CallingConv::AMDGPU_CS, ST); |
1157 | } |
1158 | |
1159 | // Set optional info |
1160 | MD->setFunctionLdsSize(FnName, Val: CurrentProgramInfo.LDSSize); |
1161 | MD->setFunctionNumUsedVgprs(FnName, Val: CurrentProgramInfo.NumVGPRsForWavesPerEU); |
1162 | MD->setFunctionNumUsedSgprs(FnName, Val: CurrentProgramInfo.NumSGPRsForWavesPerEU); |
1163 | } |
1164 | |
1165 | // This is supposed to be log2(Size) |
1166 | static amd_element_byte_size_t getElementByteSizeValue(unsigned Size) { |
1167 | switch (Size) { |
1168 | case 4: |
1169 | return AMD_ELEMENT_4_BYTES; |
1170 | case 8: |
1171 | return AMD_ELEMENT_8_BYTES; |
1172 | case 16: |
1173 | return AMD_ELEMENT_16_BYTES; |
1174 | default: |
1175 | llvm_unreachable("invalid private_element_size" ); |
1176 | } |
1177 | } |
1178 | |
1179 | void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out, |
1180 | const SIProgramInfo &CurrentProgramInfo, |
1181 | const MachineFunction &MF) const { |
1182 | const Function &F = MF.getFunction(); |
1183 | assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL || |
1184 | F.getCallingConv() == CallingConv::SPIR_KERNEL); |
1185 | |
1186 | const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); |
1187 | const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>(); |
1188 | |
1189 | AMDGPU::initDefaultAMDKernelCodeT(Out, &STM); |
1190 | |
1191 | Out.compute_pgm_resource_registers = |
1192 | CurrentProgramInfo.getComputePGMRSrc1(ST: STM) | |
1193 | (CurrentProgramInfo.getComputePGMRSrc2() << 32); |
1194 | Out.code_properties |= AMD_CODE_PROPERTY_IS_PTR64; |
1195 | |
1196 | if (CurrentProgramInfo.DynamicCallStack) |
1197 | Out.code_properties |= AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK; |
1198 | |
1199 | AMD_HSA_BITS_SET(Out.code_properties, |
1200 | AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE, |
1201 | getElementByteSizeValue(STM.getMaxPrivateElementSize(true))); |
1202 | |
1203 | const GCNUserSGPRUsageInfo &UserSGPRInfo = MFI->getUserSGPRInfo(); |
1204 | if (UserSGPRInfo.hasPrivateSegmentBuffer()) { |
1205 | Out.code_properties |= |
1206 | AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER; |
1207 | } |
1208 | |
1209 | if (UserSGPRInfo.hasDispatchPtr()) |
1210 | Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR; |
1211 | |
1212 | if (UserSGPRInfo.hasQueuePtr() && CodeObjectVersion < AMDGPU::AMDHSA_COV5) |
1213 | Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR; |
1214 | |
1215 | if (UserSGPRInfo.hasKernargSegmentPtr()) |
1216 | Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR; |
1217 | |
1218 | if (UserSGPRInfo.hasDispatchID()) |
1219 | Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID; |
1220 | |
1221 | if (UserSGPRInfo.hasFlatScratchInit()) |
1222 | Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT; |
1223 | |
1224 | if (UserSGPRInfo.hasDispatchPtr()) |
1225 | Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR; |
1226 | |
1227 | if (STM.isXNACKEnabled()) |
1228 | Out.code_properties |= AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED; |
1229 | |
1230 | Align MaxKernArgAlign; |
1231 | Out.kernarg_segment_byte_size = STM.getKernArgSegmentSize(F, MaxKernArgAlign); |
1232 | Out.wavefront_sgpr_count = CurrentProgramInfo.NumSGPR; |
1233 | Out.workitem_vgpr_count = CurrentProgramInfo.NumVGPR; |
1234 | Out.workitem_private_segment_byte_size = CurrentProgramInfo.ScratchSize; |
1235 | Out.workgroup_group_segment_byte_size = CurrentProgramInfo.LDSSize; |
1236 | |
1237 | // kernarg_segment_alignment is specified as log of the alignment. |
1238 | // The minimum alignment is 16. |
1239 | // FIXME: The metadata treats the minimum as 4? |
1240 | Out.kernarg_segment_alignment = Log2(A: std::max(a: Align(16), b: MaxKernArgAlign)); |
1241 | } |
1242 | |
1243 | bool AMDGPUAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, |
1244 | const char *, raw_ostream &O) { |
1245 | // First try the generic code, which knows about modifiers like 'c' and 'n'. |
1246 | if (!AsmPrinter::PrintAsmOperand(MI, OpNo, ExtraCode, OS&: O)) |
1247 | return false; |
1248 | |
1249 | if (ExtraCode && ExtraCode[0]) { |
1250 | if (ExtraCode[1] != 0) |
1251 | return true; // Unknown modifier. |
1252 | |
1253 | switch (ExtraCode[0]) { |
1254 | case 'r': |
1255 | break; |
1256 | default: |
1257 | return true; |
1258 | } |
1259 | } |
1260 | |
1261 | // TODO: Should be able to support other operand types like globals. |
1262 | const MachineOperand &MO = MI->getOperand(i: OpNo); |
1263 | if (MO.isReg()) { |
1264 | AMDGPUInstPrinter::printRegOperand(RegNo: MO.getReg(), O, |
1265 | MRI: *MF->getSubtarget().getRegisterInfo()); |
1266 | return false; |
1267 | } else if (MO.isImm()) { |
1268 | int64_t Val = MO.getImm(); |
1269 | if (AMDGPU::isInlinableIntLiteral(Literal: Val)) { |
1270 | O << Val; |
1271 | } else if (isUInt<16>(x: Val)) { |
1272 | O << format(Fmt: "0x%" PRIx16, Vals: static_cast<uint16_t>(Val)); |
1273 | } else if (isUInt<32>(x: Val)) { |
1274 | O << format(Fmt: "0x%" PRIx32, Vals: static_cast<uint32_t>(Val)); |
1275 | } else { |
1276 | O << format(Fmt: "0x%" PRIx64, Vals: static_cast<uint64_t>(Val)); |
1277 | } |
1278 | return false; |
1279 | } |
1280 | return true; |
1281 | } |
1282 | |
void AMDGPUAsmPrinter::getAnalysisUsage(AnalysisUsage &AU) const {
  // getSIProgramInfo reads register/stack usage from this analysis, so require
  // it to run first; printing does not invalidate it, so mark it preserved.
  AU.addRequired<AMDGPUResourceUsageAnalysis>();
  AU.addPreserved<AMDGPUResourceUsageAnalysis>();
  AsmPrinter::getAnalysisUsage(AU);
}
1288 | |
1289 | void AMDGPUAsmPrinter::( |
1290 | const MachineFunction &MF, const SIProgramInfo &CurrentProgramInfo, |
1291 | bool isModuleEntryFunction, bool hasMAIInsts) { |
1292 | if (!ORE) |
1293 | return; |
1294 | |
1295 | const char *Name = "kernel-resource-usage" ; |
1296 | const char *Indent = " " ; |
1297 | |
1298 | // If the remark is not specifically enabled, do not output to yaml |
1299 | LLVMContext &Ctx = MF.getFunction().getContext(); |
1300 | if (!Ctx.getDiagHandlerPtr()->isAnalysisRemarkEnabled(PassName: Name)) |
1301 | return; |
1302 | |
1303 | auto = [&](StringRef , |
1304 | StringRef , auto Argument) { |
1305 | // Add an indent for every line besides the line with the kernel name. This |
1306 | // makes it easier to tell which resource usage go with which kernel since |
1307 | // the kernel name will always be displayed first. |
1308 | std::string LabelStr = RemarkLabel.str() + ": " ; |
1309 | if (!RemarkName.equals(RHS: "FunctionName" )) |
1310 | LabelStr = Indent + LabelStr; |
1311 | |
1312 | ORE->emit([&]() { |
1313 | return MachineOptimizationRemarkAnalysis(Name, RemarkName, |
1314 | MF.getFunction().getSubprogram(), |
1315 | &MF.front()) |
1316 | << LabelStr << ore::NV(RemarkName, Argument); |
1317 | }); |
1318 | }; |
1319 | |
1320 | // FIXME: Formatting here is pretty nasty because clang does not accept |
1321 | // newlines from diagnostics. This forces us to emit multiple diagnostic |
1322 | // remarks to simulate newlines. If and when clang does accept newlines, this |
1323 | // formatting should be aggregated into one remark with newlines to avoid |
1324 | // printing multiple diagnostic location and diag opts. |
1325 | EmitResourceUsageRemark("FunctionName" , "Function Name" , |
1326 | MF.getFunction().getName()); |
1327 | EmitResourceUsageRemark("NumSGPR" , "SGPRs" , CurrentProgramInfo.NumSGPR); |
1328 | EmitResourceUsageRemark("NumVGPR" , "VGPRs" , CurrentProgramInfo.NumArchVGPR); |
1329 | if (hasMAIInsts) |
1330 | EmitResourceUsageRemark("NumAGPR" , "AGPRs" , CurrentProgramInfo.NumAccVGPR); |
1331 | EmitResourceUsageRemark("ScratchSize" , "ScratchSize [bytes/lane]" , |
1332 | CurrentProgramInfo.ScratchSize); |
1333 | StringRef DynamicStackStr = |
1334 | CurrentProgramInfo.DynamicCallStack ? "True" : "False" ; |
1335 | EmitResourceUsageRemark("DynamicStack" , "Dynamic Stack" , DynamicStackStr); |
1336 | EmitResourceUsageRemark("Occupancy" , "Occupancy [waves/SIMD]" , |
1337 | CurrentProgramInfo.Occupancy); |
1338 | EmitResourceUsageRemark("SGPRSpill" , "SGPRs Spill" , |
1339 | CurrentProgramInfo.SGPRSpill); |
1340 | EmitResourceUsageRemark("VGPRSpill" , "VGPRs Spill" , |
1341 | CurrentProgramInfo.VGPRSpill); |
1342 | if (isModuleEntryFunction) |
1343 | EmitResourceUsageRemark("BytesLDS" , "LDS Size [bytes/block]" , |
1344 | CurrentProgramInfo.LDSSize); |
1345 | } |
1346 | |