1//===-- AMDGPUAsmPrinter.cpp - AMDGPU assembly printer --------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10///
11/// The AMDGPUAsmPrinter is used to print both assembly string and also binary
12/// code. When passed an MCAsmStreamer it prints assembly and when passed
13/// an MCObjectStreamer it outputs binary code.
14//
15//===----------------------------------------------------------------------===//
16//
17
#include "AMDGPUAsmPrinter.h"
#include "AMDGPU.h"
#include "AMDGPUHSAMetadataStreamer.h"
#include "AMDGPUResourceUsageAnalysis.h"
#include "AMDKernelCodeT.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUInstPrinter.h"
#include "MCTargetDesc/AMDGPUMCKernelDescriptor.h"
#include "MCTargetDesc/AMDGPUTargetStreamer.h"
#include "R600AsmPrinter.h"
#include "SIMachineFunctionInfo.h"
#include "TargetInfo/AMDGPUTargetInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/AMDHSAKernelDescriptor.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/TargetParser/TargetParser.h"
#include <memory>
45
46using namespace llvm;
47using namespace llvm::AMDGPU;
48
49// This should get the default rounding mode from the kernel. We just set the
50// default here, but this could change if the OpenCL rounding mode pragmas are
51// used.
52//
53// The denormal mode here should match what is reported by the OpenCL runtime
54// for the CL_FP_DENORM bit from CL_DEVICE_{HALF|SINGLE|DOUBLE}_FP_CONFIG, but
// can also be overridden to flush with the -cl-denorms-are-zero compiler flag.
56//
57// AMD OpenCL only sets flush none and reports CL_FP_DENORM for double
58// precision, and leaves single precision to flush all and does not report
59// CL_FP_DENORM for CL_DEVICE_SINGLE_FP_CONFIG. Mesa's OpenCL currently reports
60// CL_FP_DENORM for both.
61//
62// FIXME: It seems some instructions do not support single precision denormals
63// regardless of the mode (exp_*_f32, rcp_*_f32, rsq_*_f32, rsq_*f32, sqrt_f32,
64// and sin_f32, cos_f32 on most parts).
65
66// We want to use these instructions, and using fp32 denormals also causes
67// instructions to run at the double precision rate for the device so it's
68// probably best to just report no single precision denormals.
69static uint32_t getFPMode(SIModeRegisterDefaults Mode) {
70 return FP_ROUND_MODE_SP(FP_ROUND_ROUND_TO_NEAREST) |
71 FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_NEAREST) |
72 FP_DENORM_MODE_SP(Mode.fpDenormModeSPValue()) |
73 FP_DENORM_MODE_DP(Mode.fpDenormModeDPValue());
74}
75
76static AsmPrinter *
77createAMDGPUAsmPrinterPass(TargetMachine &tm,
78 std::unique_ptr<MCStreamer> &&Streamer) {
79 return new AMDGPUAsmPrinter(tm, std::move(Streamer));
80}
81
82extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUAsmPrinter() {
83 TargetRegistry::RegisterAsmPrinter(T&: getTheR600Target(),
84 Fn: llvm::createR600AsmPrinterPass);
85 TargetRegistry::RegisterAsmPrinter(T&: getTheGCNTarget(),
86 Fn: createAMDGPUAsmPrinterPass);
87}
88
// The streamer is handed to the AsmPrinter base class, which owns it from
// here on; all emission entry points below go through OutStreamer.
AMDGPUAsmPrinter::AMDGPUAsmPrinter(TargetMachine &TM,
                                   std::unique_ptr<MCStreamer> Streamer)
    : AsmPrinter(TM, std::move(Streamer)) {
  assert(OutStreamer && "AsmPrinter constructed without streamer");
}
94
95StringRef AMDGPUAsmPrinter::getPassName() const {
96 return "AMDGPU Assembly Printer";
97}
98
99const MCSubtargetInfo *AMDGPUAsmPrinter::getGlobalSTI() const {
100 return TM.getMCSubtargetInfo();
101}
102
103AMDGPUTargetStreamer* AMDGPUAsmPrinter::getTargetStreamer() const {
104 if (!OutStreamer)
105 return nullptr;
106 return static_cast<AMDGPUTargetStreamer*>(OutStreamer->getTargetStreamer());
107}
108
void AMDGPUAsmPrinter::emitStartOfAsmFile(Module &M) {
  // Only reset the flag here; actual setup is deferred to initTargetStreamer
  // so that passes running before the first function can still set metadata.
  IsTargetStreamerInitialized = false;
}
112
// Lazily-run module-level setup: resolves the target ID and emits the
// module-scope directives required for AMDHSA / AMDPAL output.
void AMDGPUAsmPrinter::initTargetStreamer(Module &M) {
  IsTargetStreamerInitialized = true;

  // TODO: Which one is called first, emitStartOfAsmFile or
  // emitFunctionBodyStart?
  if (getTargetStreamer() && !getTargetStreamer()->getTargetID())
    initializeTargetID(M);

  // Other OSes (e.g. Mesa) emit no module-level directives here.
  if (TM.getTargetTriple().getOS() != Triple::AMDHSA &&
      TM.getTargetTriple().getOS() != Triple::AMDPAL)
    return;

  getTargetStreamer()->EmitDirectiveAMDGCNTarget();

  if (TM.getTargetTriple().getOS() == Triple::AMDHSA) {
    // HSA additionally records the code object version and opens the
    // metadata stream that per-kernel metadata is appended to later.
    getTargetStreamer()->EmitDirectiveAMDHSACodeObjectVersion(
        COV: CodeObjectVersion);
    HSAMetadataStream->begin(Mod: M, TargetID: *getTargetStreamer()->getTargetID());
  }

  if (TM.getTargetTriple().getOS() == Triple::AMDPAL)
    getTargetStreamer()->getPALMetadata()->readFromIR(M);
}
136
void AMDGPUAsmPrinter::emitEndOfAsmFile(Module &M) {
  // Init target streamer if it has not yet happened
  if (!IsTargetStreamerInitialized)
    initTargetStreamer(M);

  // Non-HSA targets record the ISA version as a plain directive.
  if (TM.getTargetTriple().getOS() != Triple::AMDHSA)
    getTargetStreamer()->EmitISAVersion();

  // Emit HSA Metadata (NT_AMD_AMDGPU_HSA_METADATA).
  if (TM.getTargetTriple().getOS() == Triple::AMDHSA) {
    HSAMetadataStream->end();
    bool Success = HSAMetadataStream->emitTo(TargetStreamer&: *getTargetStreamer());
    (void)Success;
    assert(Success && "Malformed HSA Metadata");
  }
}
154
// Per-function prologue emission: validates the target ID against the module,
// then emits the OS-specific kernel headers for entry functions.
void AMDGPUAsmPrinter::emitFunctionBodyStart() {
  const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
  const Function &F = MF->getFunction();

  // TODO: We're checking this late, would be nice to check it earlier.
  if (STM.requiresCodeObjectV6() && CodeObjectVersion < AMDGPU::AMDHSA_COV6) {
    report_fatal_error(
        STM.getCPU() + " is only available on code object version 6 or better",
        /*gen_crash_diag*/ false);
  }

  // TODO: Which one is called first, emitStartOfAsmFile or
  // emitFunctionBodyStart?
  if (!getTargetStreamer()->getTargetID())
    initializeTargetID(M: *F.getParent());

  const auto &FunctionTargetID = STM.getTargetID();
  // Make sure function's xnack settings are compatible with module's
  // xnack settings.
  if (FunctionTargetID.isXnackSupported() &&
      FunctionTargetID.getXnackSetting() != IsaInfo::TargetIDSetting::Any &&
      FunctionTargetID.getXnackSetting() != getTargetStreamer()->getTargetID()->getXnackSetting()) {
    OutContext.reportError(L: {}, Msg: "xnack setting of '" + Twine(MF->getName()) +
                           "' function does not match module xnack setting");
    return;
  }
  // Make sure function's sramecc settings are compatible with module's
  // sramecc settings.
  if (FunctionTargetID.isSramEccSupported() &&
      FunctionTargetID.getSramEccSetting() != IsaInfo::TargetIDSetting::Any &&
      FunctionTargetID.getSramEccSetting() != getTargetStreamer()->getTargetID()->getSramEccSetting()) {
    OutContext.reportError(L: {}, Msg: "sramecc setting of '" + Twine(MF->getName()) +
                           "' function does not match module sramecc setting");
    return;
  }

  // Everything below applies to kernel entry points only.
  if (!MFI.isEntryFunction())
    return;

  // Mesa kernels get the legacy amd_kernel_code_t header emitted inline.
  if (STM.isMesaKernel(F) &&
      (F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
       F.getCallingConv() == CallingConv::SPIR_KERNEL)) {
    amd_kernel_code_t KernelCode;
    getAmdKernelCode(Out&: KernelCode, KernelInfo: CurrentProgramInfo, MF: *MF);
    getTargetStreamer()->EmitAMDKernelCodeT(Header: KernelCode);
  }

  if (STM.isAmdHsaOS())
    HSAMetadataStream->emitKernel(MF: *MF, ProgramInfo: CurrentProgramInfo);

  // Kernels that preload kernarg data into SGPRs emit an extra header first.
  if (MFI.getNumKernargPreloadedSGPRs() > 0) {
    assert(AMDGPU::hasKernargPreload(STM));
    getTargetStreamer()->EmitKernargPreloadHeader(STI: *getGlobalSTI(),
                                                  TrapEnabled: STM.isAmdHsaOS());
  }
}
212
// After a kernel body has been printed, emit its kernel descriptor into the
// read-only data section (AMDHSA entry functions only).
void AMDGPUAsmPrinter::emitFunctionBodyEnd() {
  const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
  if (!MFI.isEntryFunction())
    return;

  if (TM.getTargetTriple().getOS() != Triple::AMDHSA)
    return;

  auto &Streamer = getTargetStreamer()->getStreamer();
  auto &Context = Streamer.getContext();
  auto &ObjectFileInfo = *Context.getObjectFileInfo();
  auto &ReadOnlySection = *ObjectFileInfo.getReadOnlySection();

  // Save and restore the current section around the descriptor emission.
  Streamer.pushSection();
  Streamer.switchSection(Section: &ReadOnlySection);

  // CP microcode requires the kernel descriptor to be allocated on 64 byte
  // alignment.
  Streamer.emitValueToAlignment(Alignment: Align(64), Value: 0, ValueSize: 1, MaxBytesToEmit: 0);
  ReadOnlySection.ensureMinAlignment(MinAlignment: Align(64));

  const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();

  SmallString<128> KernelName;
  getNameWithPrefix(Name&: KernelName, GV: &MF->getFunction());
  // The SGPR count passed here excludes the extra SGPRs (VCC, flat scratch,
  // XNACK) that getNumExtraSGPRs accounts for.
  getTargetStreamer()->EmitAmdhsaKernelDescriptor(
      STM, KernelName, getAmdhsaKernelDescriptor(MF: *MF, PI: CurrentProgramInfo),
      CurrentProgramInfo.NumVGPRsForWavesPerEU,
      CurrentProgramInfo.NumSGPRsForWavesPerEU -
          IsaInfo::getNumExtraSGPRs(
              &STM, CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed,
              getTargetStreamer()->getTargetID()->isXnackOnOrAny()),
      CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed);

  Streamer.popSection();
}
249
250void AMDGPUAsmPrinter::emitImplicitDef(const MachineInstr *MI) const {
251 Register RegNo = MI->getOperand(i: 0).getReg();
252
253 SmallString<128> Str;
254 raw_svector_ostream OS(Str);
255 OS << "implicit-def: "
256 << printReg(Reg: RegNo, TRI: MF->getSubtarget().getRegisterInfo());
257
258 if (MI->getAsmPrinterFlags() & AMDGPU::SGPR_SPILL)
259 OS << " : SGPR spill to VGPR lane";
260
261 OutStreamer->AddComment(T: OS.str());
262 OutStreamer->addBlankLine();
263}
264
265void AMDGPUAsmPrinter::emitFunctionEntryLabel() {
266 if (TM.getTargetTriple().getOS() == Triple::AMDHSA) {
267 AsmPrinter::emitFunctionEntryLabel();
268 return;
269 }
270
271 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
272 const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
273 if (MFI->isEntryFunction() && STM.isAmdHsaOrMesa(MF->getFunction())) {
274 SmallString<128> SymbolName;
275 getNameWithPrefix(Name&: SymbolName, GV: &MF->getFunction()),
276 getTargetStreamer()->EmitAMDGPUSymbolType(
277 SymbolName, Type: ELF::STT_AMDGPU_HSA_KERNEL);
278 }
279 if (DumpCodeInstEmitter) {
280 // Disassemble function name label to text.
281 DisasmLines.push_back(x: MF->getName().str() + ":");
282 DisasmLineMaxLen = std::max(a: DisasmLineMaxLen, b: DisasmLines.back().size());
283 HexLines.push_back(x: "");
284 }
285
286 AsmPrinter::emitFunctionEntryLabel();
287}
288
289void AMDGPUAsmPrinter::emitBasicBlockStart(const MachineBasicBlock &MBB) {
290 if (DumpCodeInstEmitter && !isBlockOnlyReachableByFallthrough(MBB: &MBB)) {
291 // Write a line for the basic block label if it is not only fallthrough.
292 DisasmLines.push_back(
293 x: (Twine("BB") + Twine(getFunctionNumber())
294 + "_" + Twine(MBB.getNumber()) + ":").str());
295 DisasmLineMaxLen = std::max(a: DisasmLineMaxLen, b: DisasmLines.back().size());
296 HexLines.push_back(x: "");
297 }
298 AsmPrinter::emitBasicBlockStart(MBB);
299}
300
// LDS (local address space) globals are not emitted as data; on non-HSA/PAL
// targets they are declared via a target LDS directive instead. All other
// globals defer to the base implementation.
void AMDGPUAsmPrinter::emitGlobalVariable(const GlobalVariable *GV) {
  if (GV->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
    // LDS cannot carry a compile-time initializer; only undef is accepted.
    if (GV->hasInitializer() && !isa<UndefValue>(Val: GV->getInitializer())) {
      OutContext.reportError(L: {},
                             Msg: Twine(GV->getName()) +
                                 ": unsupported initializer for address space");
      return;
    }

    // LDS variables aren't emitted in HSA or PAL yet.
    const Triple::OSType OS = TM.getTargetTriple().getOS();
    if (OS == Triple::AMDHSA || OS == Triple::AMDPAL)
      return;

    MCSymbol *GVSym = getSymbol(GV);

    GVSym->redefineIfPossible();
    if (GVSym->isDefined() || GVSym->isVariable())
      report_fatal_error(reason: "symbol '" + Twine(GVSym->getName()) +
                         "' is already defined");

    const DataLayout &DL = GV->getParent()->getDataLayout();
    uint64_t Size = DL.getTypeAllocSize(Ty: GV->getValueType());
    // LDS allocations default to 4-byte alignment when none is specified.
    Align Alignment = GV->getAlign().value_or(u: Align(4));

    emitVisibility(Sym: GVSym, Visibility: GV->getVisibility(), IsDefinition: !GV->isDeclaration());
    emitLinkage(GV, GVSym);
    auto TS = getTargetStreamer();
    TS->emitAMDGPULDS(Symbol: GVSym, Size, Alignment);
    return;
  }

  AsmPrinter::emitGlobalVariable(GV);
}
335
336bool AMDGPUAsmPrinter::doInitialization(Module &M) {
337 CodeObjectVersion = AMDGPU::getAMDHSACodeObjectVersion(M);
338
339 if (TM.getTargetTriple().getOS() == Triple::AMDHSA) {
340 switch (CodeObjectVersion) {
341 case AMDGPU::AMDHSA_COV4:
342 HSAMetadataStream.reset(p: new HSAMD::MetadataStreamerMsgPackV4());
343 break;
344 case AMDGPU::AMDHSA_COV5:
345 HSAMetadataStream.reset(p: new HSAMD::MetadataStreamerMsgPackV5());
346 break;
347 case AMDGPU::AMDHSA_COV6:
348 HSAMetadataStream.reset(p: new HSAMD::MetadataStreamerMsgPackV6());
349 break;
350 default:
351 report_fatal_error(reason: "Unexpected code object version");
352 }
353 }
354 return AsmPrinter::doInitialization(M);
355}
356
357bool AMDGPUAsmPrinter::doFinalization(Module &M) {
358 // Pad with s_code_end to help tools and guard against instruction prefetch
359 // causing stale data in caches. Arguably this should be done by the linker,
360 // which is why this isn't done for Mesa.
361 const MCSubtargetInfo &STI = *getGlobalSTI();
362 if ((AMDGPU::isGFX10Plus(STI) || AMDGPU::isGFX90A(STI)) &&
363 (STI.getTargetTriple().getOS() == Triple::AMDHSA ||
364 STI.getTargetTriple().getOS() == Triple::AMDPAL)) {
365 OutStreamer->switchSection(Section: getObjFileLowering().getTextSection());
366 getTargetStreamer()->EmitCodeEnd(STI);
367 }
368
369 return AsmPrinter::doFinalization(M);
370}
371
372// Print comments that apply to both callable functions and entry points.
373void AMDGPUAsmPrinter::emitCommonFunctionComments(
374 uint32_t NumVGPR, std::optional<uint32_t> NumAGPR, uint32_t TotalNumVGPR,
375 uint32_t NumSGPR, uint64_t ScratchSize, uint64_t CodeSize,
376 const AMDGPUMachineFunction *MFI) {
377 OutStreamer->emitRawComment(T: " codeLenInByte = " + Twine(CodeSize), TabPrefix: false);
378 OutStreamer->emitRawComment(T: " NumSgprs: " + Twine(NumSGPR), TabPrefix: false);
379 OutStreamer->emitRawComment(T: " NumVgprs: " + Twine(NumVGPR), TabPrefix: false);
380 if (NumAGPR) {
381 OutStreamer->emitRawComment(T: " NumAgprs: " + Twine(*NumAGPR), TabPrefix: false);
382 OutStreamer->emitRawComment(T: " TotalNumVgprs: " + Twine(TotalNumVGPR),
383 TabPrefix: false);
384 }
385 OutStreamer->emitRawComment(T: " ScratchSize: " + Twine(ScratchSize), TabPrefix: false);
386 OutStreamer->emitRawComment(T: " MemoryBound: " + Twine(MFI->isMemoryBound()),
387 TabPrefix: false);
388}
389
// Computes the kernel_code_properties field of the HSA kernel descriptor:
// one bit per user SGPR the hardware must set up, plus wavefront size and
// dynamic-stack flags.
uint16_t AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties(
    const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  uint16_t KernelCodeProperties = 0;
  const GCNUserSGPRUsageInfo &UserSGPRInfo = MFI.getUserSGPRInfo();

  if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
    KernelCodeProperties |=
        amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER;
  }
  if (UserSGPRInfo.hasDispatchPtr()) {
    KernelCodeProperties |=
        amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
  }
  // The queue-pointer user SGPR is only requested for code object versions
  // before v5.
  if (UserSGPRInfo.hasQueuePtr() && CodeObjectVersion < AMDGPU::AMDHSA_COV5) {
    KernelCodeProperties |=
        amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR;
  }
  if (UserSGPRInfo.hasKernargSegmentPtr()) {
    KernelCodeProperties |=
        amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR;
  }
  if (UserSGPRInfo.hasDispatchID()) {
    KernelCodeProperties |=
        amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID;
  }
  if (UserSGPRInfo.hasFlatScratchInit()) {
    KernelCodeProperties |=
        amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT;
  }
  if (MF.getSubtarget<GCNSubtarget>().isWave32()) {
    KernelCodeProperties |=
        amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32;
  }

  // The dynamic-stack bit only exists from code object v5 onwards.
  if (CurrentProgramInfo.DynamicCallStack &&
      CodeObjectVersion >= AMDGPU::AMDHSA_COV5)
    KernelCodeProperties |= amdhsa::KERNEL_CODE_PROPERTY_USES_DYNAMIC_STACK;

  return KernelCodeProperties;
}
431
// Assembles the MC-level HSA kernel descriptor for a kernel from the
// computed program info (segment sizes, PGM_RSRC registers, code properties,
// and kernarg preload count).
MCKernelDescriptor
AMDGPUAsmPrinter::getAmdhsaKernelDescriptor(const MachineFunction &MF,
                                            const SIProgramInfo &PI) const {
  const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  MCContext &Ctx = MF.getContext();

  MCKernelDescriptor KernelDescriptor;

  // Descriptor fields are 32-bit; make sure nothing was truncated.
  assert(isUInt<32>(PI.ScratchSize));
  assert(isUInt<32>(PI.getComputePGMRSrc1(STM)));
  assert(isUInt<32>(PI.getComputePGMRSrc2()));

  KernelDescriptor.group_segment_fixed_size =
      MCConstantExpr::create(Value: PI.LDSSize, Ctx);
  KernelDescriptor.private_segment_fixed_size =
      MCConstantExpr::create(Value: PI.ScratchSize, Ctx);

  Align MaxKernArgAlign;
  KernelDescriptor.kernarg_size = MCConstantExpr::create(
      Value: STM.getKernArgSegmentSize(F, MaxKernArgAlign), Ctx);

  KernelDescriptor.compute_pgm_rsrc1 =
      MCConstantExpr::create(Value: PI.getComputePGMRSrc1(ST: STM), Ctx);
  KernelDescriptor.compute_pgm_rsrc2 =
      MCConstantExpr::create(Value: PI.getComputePGMRSrc2(), Ctx);
  KernelDescriptor.kernel_code_properties =
      MCConstantExpr::create(Value: getAmdhsaKernelCodeProperties(MF), Ctx);

  // PGM_RSRC3 is only populated on gfx90a; it must be zero elsewhere.
  assert(STM.hasGFX90AInsts() || CurrentProgramInfo.ComputePGMRSrc3GFX90A == 0);
  KernelDescriptor.compute_pgm_rsrc3 = MCConstantExpr::create(
      Value: STM.hasGFX90AInsts() ? CurrentProgramInfo.ComputePGMRSrc3GFX90A : 0, Ctx);

  KernelDescriptor.kernarg_preload = MCConstantExpr::create(
      Value: AMDGPU::hasKernargPreload(STM) ? Info->getNumKernargPreloadedSGPRs() : 0,
      Ctx);

  return KernelDescriptor;
}
472
// Main per-function driver: computes resource usage, emits OS-specific
// program info and the function body, then (verbosely) resource comments and
// the optional -dumpcode disassembly section.
bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
  // Init target streamer lazily on the first function so that previous passes
  // can set metadata.
  if (!IsTargetStreamerInitialized)
    initTargetStreamer(M&: *MF.getFunction().getParent());

  // Register/scratch usage comes from a dedicated analysis; program info is
  // recomputed from scratch for each function.
  ResourceUsage = &getAnalysis<AMDGPUResourceUsageAnalysis>();
  CurrentProgramInfo = SIProgramInfo();

  const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();

  // The starting address of all shader programs must be 256 bytes aligned.
  // Regular functions just need the basic required instruction alignment.
  MF.setAlignment(MFI->isEntryFunction() ? Align(256) : Align(4));

  SetupMachineFunction(MF);

  const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
  MCContext &Context = getObjFileLowering().getContext();
  // FIXME: This should be an explicit check for Mesa.
  if (!STM.isAmdHsaOS() && !STM.isAmdPalOS()) {
    MCSectionELF *ConfigSection =
        Context.getELFSection(Section: ".AMDGPU.config", Type: ELF::SHT_PROGBITS, Flags: 0);
    OutStreamer->switchSection(Section: ConfigSection);
  }

  if (MFI->isModuleEntryFunction()) {
    getSIProgramInfo(Out&: CurrentProgramInfo, MF);
  }

  // Program info goes into an OS-specific container: PAL metadata on AMDPAL,
  // SI config registers otherwise (HSA handles it via the kernel descriptor).
  if (STM.isAmdPalOS()) {
    if (MFI->isEntryFunction())
      EmitPALMetadata(MF, KernelInfo: CurrentProgramInfo);
    else if (MFI->isModuleEntryFunction())
      emitPALFunctionMetadata(MF);
  } else if (!STM.isAmdHsaOS()) {
    EmitProgramInfoSI(MF, KernelInfo: CurrentProgramInfo);
  }

  DumpCodeInstEmitter = nullptr;
  if (STM.dumpCode()) {
    // For -dumpcode, get the assembler out of the streamer, even if it does
    // not really want to let us have it. This only works with -filetype=obj.
    bool SaveFlag = OutStreamer->getUseAssemblerInfoForParsing();
    OutStreamer->setUseAssemblerInfoForParsing(true);
    MCAssembler *Assembler = OutStreamer->getAssemblerPtr();
    OutStreamer->setUseAssemblerInfoForParsing(SaveFlag);
    if (Assembler)
      DumpCodeInstEmitter = Assembler->getEmitterPtr();
  }

  // Fresh disassembly buffers for this function; they are filled while the
  // body is emitted and flushed to .AMDGPU.disasm below.
  DisasmLines.clear();
  HexLines.clear();
  DisasmLineMaxLen = 0;

  emitFunctionBody();

  emitResourceUsageRemarks(MF, CurrentProgramInfo, isModuleEntryFunction: MFI->isModuleEntryFunction(),
                           hasMAIInsts: STM.hasMAIInsts());

  if (isVerbose()) {
    MCSectionELF *CommentSection =
        Context.getELFSection(Section: ".AMDGPU.csdata", Type: ELF::SHT_PROGBITS, Flags: 0);
    OutStreamer->switchSection(Section: CommentSection);

    // Non-entry functions get their comments from the per-function resource
    // analysis and skip the kernel-only sections entirely.
    if (!MFI->isEntryFunction()) {
      OutStreamer->emitRawComment(T: " Function info:", TabPrefix: false);
      const AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo &Info =
          ResourceUsage->getResourceInfo(F: &MF.getFunction());
      emitCommonFunctionComments(
          NumVGPR: Info.NumVGPR,
          NumAGPR: STM.hasMAIInsts() ? Info.NumAGPR : std::optional<uint32_t>(),
          TotalNumVGPR: Info.getTotalNumVGPRs(ST: STM),
          NumSGPR: Info.getTotalNumSGPRs(ST: MF.getSubtarget<GCNSubtarget>()),
          ScratchSize: Info.PrivateSegmentSize, CodeSize: getFunctionCodeSize(MF), MFI);
      return false;
    }

    OutStreamer->emitRawComment(T: " Kernel info:", TabPrefix: false);
    emitCommonFunctionComments(
        NumVGPR: CurrentProgramInfo.NumArchVGPR,
        NumAGPR: STM.hasMAIInsts() ? CurrentProgramInfo.NumAccVGPR
                          : std::optional<uint32_t>(),
        TotalNumVGPR: CurrentProgramInfo.NumVGPR, NumSGPR: CurrentProgramInfo.NumSGPR,
        ScratchSize: CurrentProgramInfo.ScratchSize, CodeSize: getFunctionCodeSize(MF), MFI);

    OutStreamer->emitRawComment(
        T: " FloatMode: " + Twine(CurrentProgramInfo.FloatMode), TabPrefix: false);
    OutStreamer->emitRawComment(
        T: " IeeeMode: " + Twine(CurrentProgramInfo.IEEEMode), TabPrefix: false);
    OutStreamer->emitRawComment(
        T: " LDSByteSize: " + Twine(CurrentProgramInfo.LDSSize) +
        " bytes/workgroup (compile time only)", TabPrefix: false);

    OutStreamer->emitRawComment(
        T: " SGPRBlocks: " + Twine(CurrentProgramInfo.SGPRBlocks), TabPrefix: false);
    OutStreamer->emitRawComment(
        T: " VGPRBlocks: " + Twine(CurrentProgramInfo.VGPRBlocks), TabPrefix: false);

    OutStreamer->emitRawComment(
        T: " NumSGPRsForWavesPerEU: " +
        Twine(CurrentProgramInfo.NumSGPRsForWavesPerEU), TabPrefix: false);
    OutStreamer->emitRawComment(
        T: " NumVGPRsForWavesPerEU: " +
        Twine(CurrentProgramInfo.NumVGPRsForWavesPerEU), TabPrefix: false);

    if (STM.hasGFX90AInsts())
      OutStreamer->emitRawComment(
          T: " AccumOffset: " +
          Twine((CurrentProgramInfo.AccumOffset + 1) * 4), TabPrefix: false);

    OutStreamer->emitRawComment(
        T: " Occupancy: " +
        Twine(CurrentProgramInfo.Occupancy), TabPrefix: false);

    OutStreamer->emitRawComment(
        T: " WaveLimiterHint : " + Twine(MFI->needsWaveLimiter()), TabPrefix: false);

    OutStreamer->emitRawComment(T: " COMPUTE_PGM_RSRC2:SCRATCH_EN: " +
                                Twine(CurrentProgramInfo.ScratchEnable),
                                TabPrefix: false);
    OutStreamer->emitRawComment(T: " COMPUTE_PGM_RSRC2:USER_SGPR: " +
                                Twine(CurrentProgramInfo.UserSGPR),
                                TabPrefix: false);
    OutStreamer->emitRawComment(T: " COMPUTE_PGM_RSRC2:TRAP_HANDLER: " +
                                Twine(CurrentProgramInfo.TrapHandlerEnable),
                                TabPrefix: false);
    OutStreamer->emitRawComment(T: " COMPUTE_PGM_RSRC2:TGID_X_EN: " +
                                Twine(CurrentProgramInfo.TGIdXEnable),
                                TabPrefix: false);
    OutStreamer->emitRawComment(T: " COMPUTE_PGM_RSRC2:TGID_Y_EN: " +
                                Twine(CurrentProgramInfo.TGIdYEnable),
                                TabPrefix: false);
    OutStreamer->emitRawComment(T: " COMPUTE_PGM_RSRC2:TGID_Z_EN: " +
                                Twine(CurrentProgramInfo.TGIdZEnable),
                                TabPrefix: false);
    OutStreamer->emitRawComment(T: " COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: " +
                                Twine(CurrentProgramInfo.TIdIGCompCount),
                                TabPrefix: false);

    // PGM_RSRC3 is only populated on gfx90a; it must be zero elsewhere.
    assert(STM.hasGFX90AInsts() ||
           CurrentProgramInfo.ComputePGMRSrc3GFX90A == 0);
    if (STM.hasGFX90AInsts()) {
      OutStreamer->emitRawComment(
          T: " COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: " +
          Twine((AMDHSA_BITS_GET(CurrentProgramInfo.ComputePGMRSrc3GFX90A,
                 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET))),
          TabPrefix: false);
      OutStreamer->emitRawComment(
          T: " COMPUTE_PGM_RSRC3_GFX90A:TG_SPLIT: " +
          Twine((AMDHSA_BITS_GET(CurrentProgramInfo.ComputePGMRSrc3GFX90A,
                 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT))),
          TabPrefix: false);
    }
  }

  // Flush the collected disassembly text, each line followed by its encoded
  // bytes as a comment, into a dedicated section.
  if (DumpCodeInstEmitter) {

    OutStreamer->switchSection(
        Section: Context.getELFSection(Section: ".AMDGPU.disasm", Type: ELF::SHT_PROGBITS, Flags: 0));

    for (size_t i = 0; i < DisasmLines.size(); ++i) {
      std::string Comment = "\n";
      if (!HexLines[i].empty()) {
        Comment = std::string(DisasmLineMaxLen - DisasmLines[i].size(), ' ');
        Comment += " ; " + HexLines[i] + "\n";
      }

      OutStreamer->emitBytes(Data: StringRef(DisasmLines[i]));
      OutStreamer->emitBytes(Data: StringRef(Comment));
    }
  }

  return false;
}
648
// TODO: Fold this into emitFunctionBodyStart.
// Resolves the module-wide target ID (xnack / sramecc settings) from the
// global feature string, refined by the first function that pins each
// feature to On or Off.
void AMDGPUAsmPrinter::initializeTargetID(const Module &M) {
  // In the beginning all features are either 'Any' or 'NotSupported',
  // depending on global target features. This will cover empty modules.
  getTargetStreamer()->initializeTargetID(STI: *getGlobalSTI(),
                                          FeatureString: getGlobalSTI()->getFeatureString());

  // If module is empty, we are done.
  if (M.empty())
    return;

  // If module is not empty, need to find first 'Off' or 'On' feature
  // setting per feature from functions in module.
  for (auto &F : M) {
    auto &TSTargetID = getTargetStreamer()->getTargetID();
    // Stop early once every supported feature has been pinned to On or Off.
    if ((!TSTargetID->isXnackSupported() || TSTargetID->isXnackOnOrOff()) &&
        (!TSTargetID->isSramEccSupported() || TSTargetID->isSramEccOnOrOff()))
      break;

    const GCNSubtarget &STM = TM.getSubtarget<GCNSubtarget>(F);
    const IsaInfo::AMDGPUTargetID &STMTargetID = STM.getTargetID();
    if (TSTargetID->isXnackSupported())
      if (TSTargetID->getXnackSetting() == IsaInfo::TargetIDSetting::Any)
        TSTargetID->setXnackSetting(STMTargetID.getXnackSetting());
    if (TSTargetID->isSramEccSupported())
      if (TSTargetID->getSramEccSetting() == IsaInfo::TargetIDSetting::Any)
        TSTargetID->setSramEccSetting(STMTargetID.getSramEccSetting());
  }
}
678
679uint64_t AMDGPUAsmPrinter::getFunctionCodeSize(const MachineFunction &MF) const {
680 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
681 const SIInstrInfo *TII = STM.getInstrInfo();
682
683 uint64_t CodeSize = 0;
684
685 for (const MachineBasicBlock &MBB : MF) {
686 for (const MachineInstr &MI : MBB) {
687 // TODO: CodeSize should account for multiple functions.
688
689 // TODO: Should we count size of debug info?
690 if (MI.isDebugInstr())
691 continue;
692
693 CodeSize += TII->getInstSizeInBytes(MI);
694 }
695 }
696
697 return CodeSize;
698}
699
700void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
701 const MachineFunction &MF) {
702 const AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo &Info =
703 ResourceUsage->getResourceInfo(F: &MF.getFunction());
704 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
705
706 ProgInfo.NumArchVGPR = Info.NumVGPR;
707 ProgInfo.NumAccVGPR = Info.NumAGPR;
708 ProgInfo.NumVGPR = Info.getTotalNumVGPRs(ST: STM);
709 ProgInfo.AccumOffset = alignTo(Value: std::max(a: 1, b: Info.NumVGPR), Align: 4) / 4 - 1;
710 ProgInfo.TgSplit = STM.isTgSplitEnabled();
711 ProgInfo.NumSGPR = Info.NumExplicitSGPR;
712 ProgInfo.ScratchSize = Info.PrivateSegmentSize;
713 ProgInfo.VCCUsed = Info.UsesVCC;
714 ProgInfo.FlatUsed = Info.UsesFlatScratch;
715 ProgInfo.DynamicCallStack = Info.HasDynamicallySizedStack || Info.HasRecursion;
716
717 const uint64_t MaxScratchPerWorkitem =
718 STM.getMaxWaveScratchSize() / STM.getWavefrontSize();
719 if (ProgInfo.ScratchSize > MaxScratchPerWorkitem) {
720 DiagnosticInfoStackSize DiagStackSize(MF.getFunction(),
721 ProgInfo.ScratchSize,
722 MaxScratchPerWorkitem, DS_Error);
723 MF.getFunction().getContext().diagnose(DI: DiagStackSize);
724 }
725
726 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
727
728 // The calculations related to SGPR/VGPR blocks are
729 // duplicated in part in AMDGPUAsmParser::calculateGPRBlocks, and could be
730 // unified.
731 unsigned ExtraSGPRs = IsaInfo::getNumExtraSGPRs(
732 &STM, ProgInfo.VCCUsed, ProgInfo.FlatUsed,
733 getTargetStreamer()->getTargetID()->isXnackOnOrAny());
734
735 // Check the addressable register limit before we add ExtraSGPRs.
736 if (STM.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
737 !STM.hasSGPRInitBug()) {
738 unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
739 if (ProgInfo.NumSGPR > MaxAddressableNumSGPRs) {
740 // This can happen due to a compiler bug or when using inline asm.
741 LLVMContext &Ctx = MF.getFunction().getContext();
742 DiagnosticInfoResourceLimit Diag(
743 MF.getFunction(), "addressable scalar registers", ProgInfo.NumSGPR,
744 MaxAddressableNumSGPRs, DS_Error, DK_ResourceLimit);
745 Ctx.diagnose(DI: Diag);
746 ProgInfo.NumSGPR = MaxAddressableNumSGPRs - 1;
747 }
748 }
749
750 // Account for extra SGPRs and VGPRs reserved for debugger use.
751 ProgInfo.NumSGPR += ExtraSGPRs;
752
753 const Function &F = MF.getFunction();
754
755 // Ensure there are enough SGPRs and VGPRs for wave dispatch, where wave
756 // dispatch registers are function args.
757 unsigned WaveDispatchNumSGPR = 0, WaveDispatchNumVGPR = 0;
758
759 if (isShader(CC: F.getCallingConv())) {
760 bool IsPixelShader =
761 F.getCallingConv() == CallingConv::AMDGPU_PS && !STM.isAmdHsaOS();
762
763 // Calculate the number of VGPR registers based on the SPI input registers
764 uint32_t InputEna = 0;
765 uint32_t InputAddr = 0;
766 unsigned LastEna = 0;
767
768 if (IsPixelShader) {
769 // Note for IsPixelShader:
770 // By this stage, all enabled inputs are tagged in InputAddr as well.
771 // We will use InputAddr to determine whether the input counts against the
772 // vgpr total and only use the InputEnable to determine the last input
773 // that is relevant - if extra arguments are used, then we have to honour
774 // the InputAddr for any intermediate non-enabled inputs.
775 InputEna = MFI->getPSInputEnable();
776 InputAddr = MFI->getPSInputAddr();
777
778 // We only need to consider input args up to the last used arg.
779 assert((InputEna || InputAddr) &&
780 "PSInputAddr and PSInputEnable should "
781 "never both be 0 for AMDGPU_PS shaders");
782 // There are some rare circumstances where InputAddr is non-zero and
783 // InputEna can be set to 0. In this case we default to setting LastEna
784 // to 1.
785 LastEna = InputEna ? llvm::Log2_32(Value: InputEna) + 1 : 1;
786 }
787
788 // FIXME: We should be using the number of registers determined during
789 // calling convention lowering to legalize the types.
790 const DataLayout &DL = F.getParent()->getDataLayout();
791 unsigned PSArgCount = 0;
792 unsigned IntermediateVGPR = 0;
793 for (auto &Arg : F.args()) {
794 unsigned NumRegs = (DL.getTypeSizeInBits(Ty: Arg.getType()) + 31) / 32;
795 if (Arg.hasAttribute(Attribute::Kind: InReg)) {
796 WaveDispatchNumSGPR += NumRegs;
797 } else {
798 // If this is a PS shader and we're processing the PS Input args (first
799 // 16 VGPR), use the InputEna and InputAddr bits to define how many
800 // VGPRs are actually used.
801 // Any extra VGPR arguments are handled as normal arguments (and
802 // contribute to the VGPR count whether they're used or not).
803 if (IsPixelShader && PSArgCount < 16) {
804 if ((1 << PSArgCount) & InputAddr) {
805 if (PSArgCount < LastEna)
806 WaveDispatchNumVGPR += NumRegs;
807 else
808 IntermediateVGPR += NumRegs;
809 }
810 PSArgCount++;
811 } else {
812 // If there are extra arguments we have to include the allocation for
813 // the non-used (but enabled with InputAddr) input arguments
814 if (IntermediateVGPR) {
815 WaveDispatchNumVGPR += IntermediateVGPR;
816 IntermediateVGPR = 0;
817 }
818 WaveDispatchNumVGPR += NumRegs;
819 }
820 }
821 }
822 ProgInfo.NumSGPR = std::max(a: ProgInfo.NumSGPR, b: WaveDispatchNumSGPR);
823 ProgInfo.NumArchVGPR = std::max(a: ProgInfo.NumVGPR, b: WaveDispatchNumVGPR);
824 ProgInfo.NumVGPR =
825 Info.getTotalNumVGPRs(ST: STM, NumAGPR: Info.NumAGPR, NumVGPR: ProgInfo.NumArchVGPR);
826 }
827
828 // Adjust number of registers used to meet default/requested minimum/maximum
829 // number of waves per execution unit request.
830 ProgInfo.NumSGPRsForWavesPerEU = std::max(
831 a: std::max(a: ProgInfo.NumSGPR, b: 1u), b: STM.getMinNumSGPRs(WavesPerEU: MFI->getMaxWavesPerEU()));
832 ProgInfo.NumVGPRsForWavesPerEU = std::max(
833 a: std::max(a: ProgInfo.NumVGPR, b: 1u), b: STM.getMinNumVGPRs(WavesPerEU: MFI->getMaxWavesPerEU()));
834
835 if (STM.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS ||
836 STM.hasSGPRInitBug()) {
837 unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
838 if (ProgInfo.NumSGPR > MaxAddressableNumSGPRs) {
839 // This can happen due to a compiler bug or when using inline asm to use
840 // the registers which are usually reserved for vcc etc.
841 LLVMContext &Ctx = MF.getFunction().getContext();
842 DiagnosticInfoResourceLimit Diag(MF.getFunction(), "scalar registers",
843 ProgInfo.NumSGPR, MaxAddressableNumSGPRs,
844 DS_Error, DK_ResourceLimit);
845 Ctx.diagnose(DI: Diag);
846 ProgInfo.NumSGPR = MaxAddressableNumSGPRs;
847 ProgInfo.NumSGPRsForWavesPerEU = MaxAddressableNumSGPRs;
848 }
849 }
850
851 if (STM.hasSGPRInitBug()) {
852 ProgInfo.NumSGPR =
853 AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;
854 ProgInfo.NumSGPRsForWavesPerEU =
855 AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;
856 }
857
858 if (MFI->getNumUserSGPRs() > STM.getMaxNumUserSGPRs()) {
859 LLVMContext &Ctx = MF.getFunction().getContext();
860 DiagnosticInfoResourceLimit Diag(MF.getFunction(), "user SGPRs",
861 MFI->getNumUserSGPRs(),
862 STM.getMaxNumUserSGPRs(), DS_Error);
863 Ctx.diagnose(DI: Diag);
864 }
865
866 if (MFI->getLDSSize() >
867 static_cast<unsigned>(STM.getAddressableLocalMemorySize())) {
868 LLVMContext &Ctx = MF.getFunction().getContext();
869 DiagnosticInfoResourceLimit Diag(
870 MF.getFunction(), "local memory", MFI->getLDSSize(),
871 STM.getAddressableLocalMemorySize(), DS_Error);
872 Ctx.diagnose(DI: Diag);
873 }
874
875 ProgInfo.SGPRBlocks = IsaInfo::getNumSGPRBlocks(
876 &STM, ProgInfo.NumSGPRsForWavesPerEU);
877 ProgInfo.VGPRBlocks =
878 IsaInfo::getEncodedNumVGPRBlocks(&STM, ProgInfo.NumVGPRsForWavesPerEU);
879
880 const SIModeRegisterDefaults Mode = MFI->getMode();
881
882 // Set the value to initialize FP_ROUND and FP_DENORM parts of the mode
883 // register.
884 ProgInfo.FloatMode = getFPMode(Mode);
885
886 ProgInfo.IEEEMode = Mode.IEEE;
887
888 // Make clamp modifier on NaN input returns 0.
889 ProgInfo.DX10Clamp = Mode.DX10Clamp;
890
891 unsigned LDSAlignShift;
892 if (STM.getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) {
893 // LDS is allocated in 64 dword blocks.
894 LDSAlignShift = 8;
895 } else {
896 // LDS is allocated in 128 dword blocks.
897 LDSAlignShift = 9;
898 }
899
900 ProgInfo.SGPRSpill = MFI->getNumSpilledSGPRs();
901 ProgInfo.VGPRSpill = MFI->getNumSpilledVGPRs();
902
903 ProgInfo.LDSSize = MFI->getLDSSize();
904 ProgInfo.LDSBlocks =
905 alignTo(Value: ProgInfo.LDSSize, Align: 1ULL << LDSAlignShift) >> LDSAlignShift;
906
907 // Scratch is allocated in 64-dword or 256-dword blocks.
908 unsigned ScratchAlignShift =
909 STM.getGeneration() >= AMDGPUSubtarget::GFX11 ? 8 : 10;
910 // We need to program the hardware with the amount of scratch memory that
911 // is used by the entire wave. ProgInfo.ScratchSize is the amount of
912 // scratch memory used per thread.
913 ProgInfo.ScratchBlocks = divideCeil(
914 ProgInfo.ScratchSize * STM.getWavefrontSize(), 1ULL << ScratchAlignShift);
915
916 if (getIsaVersion(GPU: getGlobalSTI()->getCPU()).Major >= 10) {
917 ProgInfo.WgpMode = STM.isCuModeEnabled() ? 0 : 1;
918 ProgInfo.MemOrdered = 1;
919 }
920
921 // 0 = X, 1 = XY, 2 = XYZ
922 unsigned TIDIGCompCnt = 0;
923 if (MFI->hasWorkItemIDZ())
924 TIDIGCompCnt = 2;
925 else if (MFI->hasWorkItemIDY())
926 TIDIGCompCnt = 1;
927
928 // The private segment wave byte offset is the last of the system SGPRs. We
929 // initially assumed it was allocated, and may have used it. It shouldn't harm
930 // anything to disable it if we know the stack isn't used here. We may still
931 // have emitted code reading it to initialize scratch, but if that's unused
932 // reading garbage should be OK.
933 ProgInfo.ScratchEnable =
934 ProgInfo.ScratchBlocks > 0 || ProgInfo.DynamicCallStack;
935 ProgInfo.UserSGPR = MFI->getNumUserSGPRs();
936 // For AMDHSA, TRAP_HANDLER must be zero, as it is populated by the CP.
937 ProgInfo.TrapHandlerEnable =
938 STM.isAmdHsaOS() ? 0 : STM.isTrapHandlerEnabled();
939 ProgInfo.TGIdXEnable = MFI->hasWorkGroupIDX();
940 ProgInfo.TGIdYEnable = MFI->hasWorkGroupIDY();
941 ProgInfo.TGIdZEnable = MFI->hasWorkGroupIDZ();
942 ProgInfo.TGSizeEnable = MFI->hasWorkGroupInfo();
943 ProgInfo.TIdIGCompCount = TIDIGCompCnt;
944 ProgInfo.EXCPEnMSB = 0;
945 // For AMDHSA, LDS_SIZE must be zero, as it is populated by the CP.
946 ProgInfo.LdsSize = STM.isAmdHsaOS() ? 0 : ProgInfo.LDSBlocks;
947 ProgInfo.EXCPEnable = 0;
948
949 if (STM.hasGFX90AInsts()) {
950 AMDHSA_BITS_SET(ProgInfo.ComputePGMRSrc3GFX90A,
951 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET,
952 ProgInfo.AccumOffset);
953 AMDHSA_BITS_SET(ProgInfo.ComputePGMRSrc3GFX90A,
954 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT,
955 ProgInfo.TgSplit);
956 }
957
958 ProgInfo.Occupancy = STM.computeOccupancy(F: MF.getFunction(), LDSSize: ProgInfo.LDSSize,
959 NumSGPRs: ProgInfo.NumSGPRsForWavesPerEU,
960 NumVGPRs: ProgInfo.NumVGPRsForWavesPerEU);
961 const auto [MinWEU, MaxWEU] =
962 AMDGPU::getIntegerPairAttribute(F, Name: "amdgpu-waves-per-eu", Default: {0, 0}, OnlyFirstRequired: true);
963 if (ProgInfo.Occupancy < MinWEU) {
964 DiagnosticInfoOptimizationFailure Diag(
965 F, F.getSubprogram(),
966 "failed to meet occupancy target given by 'amdgpu-waves-per-eu' in "
967 "'" +
968 F.getName() + "': desired occupancy was " + Twine(MinWEU) +
969 ", final occupancy is " + Twine(ProgInfo.Occupancy));
970 F.getContext().diagnose(DI: Diag);
971 }
972}
973
974static unsigned getRsrcReg(CallingConv::ID CallConv) {
975 switch (CallConv) {
976 default: [[fallthrough]];
977 case CallingConv::AMDGPU_CS: return R_00B848_COMPUTE_PGM_RSRC1;
978 case CallingConv::AMDGPU_LS: return R_00B528_SPI_SHADER_PGM_RSRC1_LS;
979 case CallingConv::AMDGPU_HS: return R_00B428_SPI_SHADER_PGM_RSRC1_HS;
980 case CallingConv::AMDGPU_ES: return R_00B328_SPI_SHADER_PGM_RSRC1_ES;
981 case CallingConv::AMDGPU_GS: return R_00B228_SPI_SHADER_PGM_RSRC1_GS;
982 case CallingConv::AMDGPU_VS: return R_00B128_SPI_SHADER_PGM_RSRC1_VS;
983 case CallingConv::AMDGPU_PS: return R_00B028_SPI_SHADER_PGM_RSRC1_PS;
984 }
985}
986
// Emit the program info as a sequence of (register address, value) dword
// pairs directly into the output stream, for consumption by non-HSA drivers
// that parse these pairs to program the SPI/COMPUTE registers.
void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
                                         const SIProgramInfo &CurrentProgramInfo) {
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
  // RSRC1 register address for this function's hardware stage.
  unsigned RsrcReg = getRsrcReg(CallConv: MF.getFunction().getCallingConv());

  if (AMDGPU::isCompute(CC: MF.getFunction().getCallingConv())) {
    OutStreamer->emitInt32(R_00B848_COMPUTE_PGM_RSRC1);

    OutStreamer->emitInt32(Value: CurrentProgramInfo.getComputePGMRSrc1(ST: STM));

    OutStreamer->emitInt32(R_00B84C_COMPUTE_PGM_RSRC2);
    OutStreamer->emitInt32(Value: CurrentProgramInfo.getComputePGMRSrc2());

    // The WAVESIZE field encoding differs per generation (pre-GFX11, GFX11,
    // GFX12+), so pick the matching S_* helper.
    OutStreamer->emitInt32(R_00B860_COMPUTE_TMPRING_SIZE);
    OutStreamer->emitInt32(
        Value: STM.getGeneration() >= AMDGPUSubtarget::GFX12
            ? S_00B860_WAVESIZE_GFX12Plus(CurrentProgramInfo.ScratchBlocks)
            : STM.getGeneration() == AMDGPUSubtarget::GFX11
                  ? S_00B860_WAVESIZE_GFX11(CurrentProgramInfo.ScratchBlocks)
                  : S_00B860_WAVESIZE_PreGFX11(CurrentProgramInfo.ScratchBlocks));

    // TODO: Should probably note flat usage somewhere. SC emits a "FlatPtr32 =
    // 0" comment but I don't see a corresponding field in the register spec.
  } else {
    // Graphics shader: emit the stage-specific RSRC1 (VGPR/SGPR block counts)
    // plus the SPI scratch ring size.
    OutStreamer->emitInt32(Value: RsrcReg);
    OutStreamer->emitIntValue(S_00B028_VGPRS(CurrentProgramInfo.VGPRBlocks) |
                              S_00B028_SGPRS(CurrentProgramInfo.SGPRBlocks), Size: 4);
    OutStreamer->emitInt32(R_0286E8_SPI_TMPRING_SIZE);
    OutStreamer->emitInt32(
        Value: STM.getGeneration() >= AMDGPUSubtarget::GFX12
            ? S_0286E8_WAVESIZE_GFX12Plus(CurrentProgramInfo.ScratchBlocks)
            : STM.getGeneration() == AMDGPUSubtarget::GFX11
                  ? S_0286E8_WAVESIZE_GFX11(CurrentProgramInfo.ScratchBlocks)
                  : S_0286E8_WAVESIZE_PreGFX11(CurrentProgramInfo.ScratchBlocks));
  }

  // Pixel shaders additionally report extra LDS usage and the SPI input
  // enable/address masks.
  if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) {
    OutStreamer->emitInt32(R_00B02C_SPI_SHADER_PGM_RSRC2_PS);
    // On GFX11+ EXTRA_LDS_SIZE is counted in pairs of LDS blocks, hence the
    // divideCeil by 2.
    unsigned ExtraLDSSize = STM.getGeneration() >= AMDGPUSubtarget::GFX11
                                ? divideCeil(Numerator: CurrentProgramInfo.LDSBlocks, Denominator: 2)
                                : CurrentProgramInfo.LDSBlocks;
    OutStreamer->emitInt32(S_00B02C_EXTRA_LDS_SIZE(ExtraLDSSize));
    OutStreamer->emitInt32(R_0286CC_SPI_PS_INPUT_ENA);
    OutStreamer->emitInt32(Value: MFI->getPSInputEnable());
    OutStreamer->emitInt32(R_0286D0_SPI_PS_INPUT_ADDR);
    OutStreamer->emitInt32(Value: MFI->getPSInputAddr());
  }

  // Spill counts are reported under pseudo "register" keys.
  OutStreamer->emitInt32(R_SPILLED_SGPRS);
  OutStreamer->emitInt32(Value: MFI->getNumSpilledSGPRs());
  OutStreamer->emitInt32(R_SPILLED_VGPRS);
  OutStreamer->emitInt32(Value: MFI->getNumSpilledVGPRs());
}
1041
1042// Helper function to add common PAL Metadata 3.0+
1043static void EmitPALMetadataCommon(AMDGPUPALMetadata *MD,
1044 const SIProgramInfo &CurrentProgramInfo,
1045 CallingConv::ID CC, const GCNSubtarget &ST) {
1046 if (ST.hasIEEEMode())
1047 MD->setHwStage(CC, field: ".ieee_mode", Val: (bool)CurrentProgramInfo.IEEEMode);
1048
1049 MD->setHwStage(CC, field: ".wgp_mode", Val: (bool)CurrentProgramInfo.WgpMode);
1050 MD->setHwStage(CC, field: ".mem_ordered", Val: (bool)CurrentProgramInfo.MemOrdered);
1051
1052 if (AMDGPU::isCompute(CC)) {
1053 MD->setHwStage(CC, field: ".trap_present",
1054 Val: (bool)CurrentProgramInfo.TrapHandlerEnable);
1055 MD->setHwStage(CC, field: ".excp_en", Val: CurrentProgramInfo.EXCPEnable);
1056
1057 MD->setHwStage(CC, field: ".lds_size",
1058 Val: (unsigned)(CurrentProgramInfo.LdsSize *
1059 getLdsDwGranularity(ST) * sizeof(uint32_t)));
1060 }
1061}
1062
// This is the equivalent of EmitProgramInfoSI above, but for when the OS type
// is AMDPAL. It stores each compute/SPI register setting and other PAL
// metadata items into the PALMD::Metadata, combining with any provided by the
// frontend as LLVM metadata. Once all functions are written, the PAL metadata
// is then written as a single block in the .note section.
void AMDGPUAsmPrinter::EmitPALMetadata(const MachineFunction &MF,
                                       const SIProgramInfo &CurrentProgramInfo) {
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  auto CC = MF.getFunction().getCallingConv();
  auto MD = getTargetStreamer()->getPALMetadata();

  MD->setEntryPoint(CC, Name: MF.getFunction().getName());
  MD->setNumUsedVgprs(CC, Val: CurrentProgramInfo.NumVGPRsForWavesPerEU);

  // Only set AGPRs for supported devices
  const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
  if (STM.hasMAIInsts()) {
    MD->setNumUsedAgprs(CC, Val: CurrentProgramInfo.NumAccVGPR);
  }

  MD->setNumUsedSgprs(CC, Val: CurrentProgramInfo.NumSGPRsForWavesPerEU);
  // PAL metadata before major version 3 stores raw RSRC1/RSRC2 register
  // values; 3.0+ uses named per-hardware-stage fields instead.
  if (MD->getPALMajorVersion() < 3) {
    MD->setRsrc1(CC, Val: CurrentProgramInfo.getPGMRSrc1(CC, ST: STM));
    if (AMDGPU::isCompute(CC)) {
      MD->setRsrc2(CC, Val: CurrentProgramInfo.getComputePGMRSrc2());
    } else {
      // Graphics stages only record the scratch-enable bit in RSRC2.
      if (CurrentProgramInfo.ScratchBlocks > 0)
        MD->setRsrc2(CC, S_00B84C_SCRATCH_EN(1));
    }
  } else {
    MD->setHwStage(CC, field: ".debug_mode", Val: (bool)CurrentProgramInfo.DebugMode);
    MD->setHwStage(CC, field: ".scratch_en", Val: (bool)CurrentProgramInfo.ScratchEnable);
    EmitPALMetadataCommon(MD, CurrentProgramInfo, CC, ST: STM);
  }

  // ScratchSize is in bytes, 16 aligned.
  MD->setScratchSize(CC, Val: alignTo(Value: CurrentProgramInfo.ScratchSize, Align: 16));
  if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) {
    // On GFX11+ the extra LDS size is counted in pairs of LDS blocks, hence
    // the divideCeil by 2.
    unsigned ExtraLDSSize = STM.getGeneration() >= AMDGPUSubtarget::GFX11
                                ? divideCeil(Numerator: CurrentProgramInfo.LDSBlocks, Denominator: 2)
                                : CurrentProgramInfo.LDSBlocks;
    if (MD->getPALMajorVersion() < 3) {
      MD->setRsrc2(CC, S_00B02C_EXTRA_LDS_SIZE(ExtraLDSSize));
      MD->setSpiPsInputEna(MFI->getPSInputEnable());
      MD->setSpiPsInputAddr(MFI->getPSInputAddr());
    } else {
      // Graphics registers
      const unsigned ExtraLdsDwGranularity =
          STM.getGeneration() >= AMDGPUSubtarget::GFX11 ? 256 : 128;
      MD->setGraphicsRegisters(
          field: ".ps_extra_lds_size",
          Val: (unsigned)(ExtraLDSSize * ExtraLdsDwGranularity * sizeof(uint32_t)));

      // Set PsInputEna and PsInputAddr .spi_ps_input_ena and .spi_ps_input_addr
      // One named boolean field per bit of SPI_PS_INPUT_ENA / _ADDR, in bit
      // order (index 0 is the lowest bit).
      static StringLiteral const PsInputFields[] = {
          ".persp_sample_ena", ".persp_center_ena",
          ".persp_centroid_ena", ".persp_pull_model_ena",
          ".linear_sample_ena", ".linear_center_ena",
          ".linear_centroid_ena", ".line_stipple_tex_ena",
          ".pos_x_float_ena", ".pos_y_float_ena",
          ".pos_z_float_ena", ".pos_w_float_ena",
          ".front_face_ena", ".ancillary_ena",
          ".sample_coverage_ena", ".pos_fixed_pt_ena"};
      unsigned PSInputEna = MFI->getPSInputEnable();
      unsigned PSInputAddr = MFI->getPSInputAddr();
      for (auto [Idx, Field] : enumerate(First: PsInputFields)) {
        MD->setGraphicsRegisters(field1: ".spi_ps_input_ena", field2: Field,
                                 Val: (bool)((PSInputEna >> Idx) & 1));
        MD->setGraphicsRegisters(field1: ".spi_ps_input_addr", field2: Field,
                                 Val: (bool)((PSInputAddr >> Idx) & 1));
      }
    }
  }

  // For version 3 and above the wave front size is already set in the metadata
  if (MD->getPALMajorVersion() < 3 && STM.isWave32())
    MD->setWave32(MF.getFunction().getCallingConv());
}
1141
1142void AMDGPUAsmPrinter::emitPALFunctionMetadata(const MachineFunction &MF) {
1143 auto *MD = getTargetStreamer()->getPALMetadata();
1144 const MachineFrameInfo &MFI = MF.getFrameInfo();
1145 StringRef FnName = MF.getFunction().getName();
1146 MD->setFunctionScratchSize(FnName, Val: MFI.getStackSize());
1147 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1148
1149 if (MD->getPALMajorVersion() < 3) {
1150 // Set compute registers
1151 MD->setRsrc1(CC: CallingConv::AMDGPU_CS,
1152 Val: CurrentProgramInfo.getPGMRSrc1(CC: CallingConv::AMDGPU_CS, ST));
1153 MD->setRsrc2(CC: CallingConv::AMDGPU_CS,
1154 Val: CurrentProgramInfo.getComputePGMRSrc2());
1155 } else {
1156 EmitPALMetadataCommon(MD, CurrentProgramInfo, CC: CallingConv::AMDGPU_CS, ST);
1157 }
1158
1159 // Set optional info
1160 MD->setFunctionLdsSize(FnName, Val: CurrentProgramInfo.LDSSize);
1161 MD->setFunctionNumUsedVgprs(FnName, Val: CurrentProgramInfo.NumVGPRsForWavesPerEU);
1162 MD->setFunctionNumUsedSgprs(FnName, Val: CurrentProgramInfo.NumSGPRsForWavesPerEU);
1163}
1164
1165// This is supposed to be log2(Size)
1166static amd_element_byte_size_t getElementByteSizeValue(unsigned Size) {
1167 switch (Size) {
1168 case 4:
1169 return AMD_ELEMENT_4_BYTES;
1170 case 8:
1171 return AMD_ELEMENT_8_BYTES;
1172 case 16:
1173 return AMD_ELEMENT_16_BYTES;
1174 default:
1175 llvm_unreachable("invalid private_element_size");
1176 }
1177}
1178
1179void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out,
1180 const SIProgramInfo &CurrentProgramInfo,
1181 const MachineFunction &MF) const {
1182 const Function &F = MF.getFunction();
1183 assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
1184 F.getCallingConv() == CallingConv::SPIR_KERNEL);
1185
1186 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1187 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
1188
1189 AMDGPU::initDefaultAMDKernelCodeT(Out, &STM);
1190
1191 Out.compute_pgm_resource_registers =
1192 CurrentProgramInfo.getComputePGMRSrc1(ST: STM) |
1193 (CurrentProgramInfo.getComputePGMRSrc2() << 32);
1194 Out.code_properties |= AMD_CODE_PROPERTY_IS_PTR64;
1195
1196 if (CurrentProgramInfo.DynamicCallStack)
1197 Out.code_properties |= AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK;
1198
1199 AMD_HSA_BITS_SET(Out.code_properties,
1200 AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE,
1201 getElementByteSizeValue(STM.getMaxPrivateElementSize(true)));
1202
1203 const GCNUserSGPRUsageInfo &UserSGPRInfo = MFI->getUserSGPRInfo();
1204 if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
1205 Out.code_properties |=
1206 AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER;
1207 }
1208
1209 if (UserSGPRInfo.hasDispatchPtr())
1210 Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
1211
1212 if (UserSGPRInfo.hasQueuePtr() && CodeObjectVersion < AMDGPU::AMDHSA_COV5)
1213 Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR;
1214
1215 if (UserSGPRInfo.hasKernargSegmentPtr())
1216 Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR;
1217
1218 if (UserSGPRInfo.hasDispatchID())
1219 Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID;
1220
1221 if (UserSGPRInfo.hasFlatScratchInit())
1222 Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT;
1223
1224 if (UserSGPRInfo.hasDispatchPtr())
1225 Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
1226
1227 if (STM.isXNACKEnabled())
1228 Out.code_properties |= AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED;
1229
1230 Align MaxKernArgAlign;
1231 Out.kernarg_segment_byte_size = STM.getKernArgSegmentSize(F, MaxKernArgAlign);
1232 Out.wavefront_sgpr_count = CurrentProgramInfo.NumSGPR;
1233 Out.workitem_vgpr_count = CurrentProgramInfo.NumVGPR;
1234 Out.workitem_private_segment_byte_size = CurrentProgramInfo.ScratchSize;
1235 Out.workgroup_group_segment_byte_size = CurrentProgramInfo.LDSSize;
1236
1237 // kernarg_segment_alignment is specified as log of the alignment.
1238 // The minimum alignment is 16.
1239 // FIXME: The metadata treats the minimum as 4?
1240 Out.kernarg_segment_alignment = Log2(A: std::max(a: Align(16), b: MaxKernArgAlign));
1241}
1242
1243bool AMDGPUAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
1244 const char *ExtraCode, raw_ostream &O) {
1245 // First try the generic code, which knows about modifiers like 'c' and 'n'.
1246 if (!AsmPrinter::PrintAsmOperand(MI, OpNo, ExtraCode, OS&: O))
1247 return false;
1248
1249 if (ExtraCode && ExtraCode[0]) {
1250 if (ExtraCode[1] != 0)
1251 return true; // Unknown modifier.
1252
1253 switch (ExtraCode[0]) {
1254 case 'r':
1255 break;
1256 default:
1257 return true;
1258 }
1259 }
1260
1261 // TODO: Should be able to support other operand types like globals.
1262 const MachineOperand &MO = MI->getOperand(i: OpNo);
1263 if (MO.isReg()) {
1264 AMDGPUInstPrinter::printRegOperand(RegNo: MO.getReg(), O,
1265 MRI: *MF->getSubtarget().getRegisterInfo());
1266 return false;
1267 } else if (MO.isImm()) {
1268 int64_t Val = MO.getImm();
1269 if (AMDGPU::isInlinableIntLiteral(Literal: Val)) {
1270 O << Val;
1271 } else if (isUInt<16>(x: Val)) {
1272 O << format(Fmt: "0x%" PRIx16, Vals: static_cast<uint16_t>(Val));
1273 } else if (isUInt<32>(x: Val)) {
1274 O << format(Fmt: "0x%" PRIx32, Vals: static_cast<uint32_t>(Val));
1275 } else {
1276 O << format(Fmt: "0x%" PRIx64, Vals: static_cast<uint64_t>(Val));
1277 }
1278 return false;
1279 }
1280 return true;
1281}
1282
void AMDGPUAsmPrinter::getAnalysisUsage(AnalysisUsage &AU) const {
  // Resource usage (register counts etc.) comes from a separate analysis
  // pass; require it so emission can query its results, and preserve it
  // since printing does not modify the machine function.
  AU.addRequired<AMDGPUResourceUsageAnalysis>();
  AU.addPreserved<AMDGPUResourceUsageAnalysis>();
  AsmPrinter::getAnalysisUsage(AU);
}
1288
// Emit the per-kernel resource usage (registers, scratch, occupancy, spills,
// LDS) as a series of "kernel-resource-usage" optimization remarks, one
// remark per line of output.
void AMDGPUAsmPrinter::emitResourceUsageRemarks(
    const MachineFunction &MF, const SIProgramInfo &CurrentProgramInfo,
    bool isModuleEntryFunction, bool hasMAIInsts) {
  if (!ORE)
    return;

  const char *Name = "kernel-resource-usage";
  const char *Indent = "    ";

  // If the remark is not specifically enabled, do not output to yaml
  LLVMContext &Ctx = MF.getFunction().getContext();
  if (!Ctx.getDiagHandlerPtr()->isAnalysisRemarkEnabled(PassName: Name))
    return;

  // Emits one remark of the form "<label>: <value>", indented for every
  // line except the one carrying the kernel name.
  auto EmitResourceUsageRemark = [&](StringRef RemarkName,
                                     StringRef RemarkLabel, auto Argument) {
    // Add an indent for every line besides the line with the kernel name. This
    // makes it easier to tell which resource usage go with which kernel since
    // the kernel name will always be displayed first.
    std::string LabelStr = RemarkLabel.str() + ": ";
    if (!RemarkName.equals(RHS: "FunctionName"))
      LabelStr = Indent + LabelStr;

    ORE->emit([&]() {
      return MachineOptimizationRemarkAnalysis(Name, RemarkName,
                                               MF.getFunction().getSubprogram(),
                                               &MF.front())
             << LabelStr << ore::NV(RemarkName, Argument);
    });
  };

  // FIXME: Formatting here is pretty nasty because clang does not accept
  // newlines from diagnostics. This forces us to emit multiple diagnostic
  // remarks to simulate newlines. If and when clang does accept newlines, this
  // formatting should be aggregated into one remark with newlines to avoid
  // printing multiple diagnostic location and diag opts.
  EmitResourceUsageRemark("FunctionName", "Function Name",
                          MF.getFunction().getName());
  EmitResourceUsageRemark("NumSGPR", "SGPRs", CurrentProgramInfo.NumSGPR);
  EmitResourceUsageRemark("NumVGPR", "VGPRs", CurrentProgramInfo.NumArchVGPR);
  // AGPRs only exist on subtargets with MAI instructions.
  if (hasMAIInsts)
    EmitResourceUsageRemark("NumAGPR", "AGPRs", CurrentProgramInfo.NumAccVGPR);
  EmitResourceUsageRemark("ScratchSize", "ScratchSize [bytes/lane]",
                          CurrentProgramInfo.ScratchSize);
  StringRef DynamicStackStr =
      CurrentProgramInfo.DynamicCallStack ? "True" : "False";
  EmitResourceUsageRemark("DynamicStack", "Dynamic Stack", DynamicStackStr);
  EmitResourceUsageRemark("Occupancy", "Occupancy [waves/SIMD]",
                          CurrentProgramInfo.Occupancy);
  EmitResourceUsageRemark("SGPRSpill", "SGPRs Spill",
                          CurrentProgramInfo.SGPRSpill);
  EmitResourceUsageRemark("VGPRSpill", "VGPRs Spill",
                          CurrentProgramInfo.VGPRSpill);
  // LDS size is only meaningful for module entry functions.
  if (isModuleEntryFunction)
    EmitResourceUsageRemark("BytesLDS", "LDS Size [bytes/block]",
                            CurrentProgramInfo.LDSSize);
}
1346

source code of llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp