1//===-- AMDGPUAsmPrinter.cpp - AMDGPU assembly printer --------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10///
11/// The AMDGPUAsmPrinter is used to print both assembly string and also binary
12/// code. When passed an MCAsmStreamer it prints assembly and when passed
13/// an MCObjectStreamer it outputs binary code.
14//
15//===----------------------------------------------------------------------===//
16//
17
#include "AMDGPUAsmPrinter.h"
#include "AMDGPU.h"
#include "AMDGPUHSAMetadataStreamer.h"
#include "AMDGPUResourceUsageAnalysis.h"
#include "AMDKernelCodeT.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUInstPrinter.h"
#include "MCTargetDesc/AMDGPUMCKernelDescriptor.h"
#include "MCTargetDesc/AMDGPUTargetStreamer.h"
#include "R600AsmPrinter.h"
#include "SIMachineFunctionInfo.h"
#include "TargetInfo/AMDGPUTargetInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/AMDHSAKernelDescriptor.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/TargetParser/TargetParser.h"
#include <memory>
45
46using namespace llvm;
47using namespace llvm::AMDGPU;
48
49// This should get the default rounding mode from the kernel. We just set the
50// default here, but this could change if the OpenCL rounding mode pragmas are
51// used.
52//
53// The denormal mode here should match what is reported by the OpenCL runtime
54// for the CL_FP_DENORM bit from CL_DEVICE_{HALF|SINGLE|DOUBLE}_FP_CONFIG, but
// can also be overridden to flush with the -cl-denorms-are-zero compiler flag.
56//
57// AMD OpenCL only sets flush none and reports CL_FP_DENORM for double
58// precision, and leaves single precision to flush all and does not report
59// CL_FP_DENORM for CL_DEVICE_SINGLE_FP_CONFIG. Mesa's OpenCL currently reports
60// CL_FP_DENORM for both.
61//
62// FIXME: It seems some instructions do not support single precision denormals
63// regardless of the mode (exp_*_f32, rcp_*_f32, rsq_*_f32, rsq_*f32, sqrt_f32,
64// and sin_f32, cos_f32 on most parts).
65
66// We want to use these instructions, and using fp32 denormals also causes
67// instructions to run at the double precision rate for the device so it's
68// probably best to just report no single precision denormals.
69static uint32_t getFPMode(SIModeRegisterDefaults Mode) {
70 return FP_ROUND_MODE_SP(FP_ROUND_ROUND_TO_NEAREST) |
71 FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_NEAREST) |
72 FP_DENORM_MODE_SP(Mode.fpDenormModeSPValue()) |
73 FP_DENORM_MODE_DP(Mode.fpDenormModeDPValue());
74}
75
76static AsmPrinter *
77createAMDGPUAsmPrinterPass(TargetMachine &tm,
78 std::unique_ptr<MCStreamer> &&Streamer) {
79 return new AMDGPUAsmPrinter(tm, std::move(Streamer));
80}
81
82extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUAsmPrinter() {
83 TargetRegistry::RegisterAsmPrinter(T&: getTheR600Target(),
84 Fn: llvm::createR600AsmPrinterPass);
85 TargetRegistry::RegisterAsmPrinter(T&: getTheGCNTarget(),
86 Fn: createAMDGPUAsmPrinterPass);
87}
88
// The streamer is handed to the AsmPrinter base class, which owns it from
// here on; all emission entry points below go through OutStreamer.
AMDGPUAsmPrinter::AMDGPUAsmPrinter(TargetMachine &TM,
                                   std::unique_ptr<MCStreamer> Streamer)
    : AsmPrinter(TM, std::move(Streamer)) {
  assert(OutStreamer && "AsmPrinter constructed without streamer");
}
94
95StringRef AMDGPUAsmPrinter::getPassName() const {
96 return "AMDGPU Assembly Printer";
97}
98
99const MCSubtargetInfo *AMDGPUAsmPrinter::getGlobalSTI() const {
100 return TM.getMCSubtargetInfo();
101}
102
103AMDGPUTargetStreamer* AMDGPUAsmPrinter::getTargetStreamer() const {
104 if (!OutStreamer)
105 return nullptr;
106 return static_cast<AMDGPUTargetStreamer*>(OutStreamer->getTargetStreamer());
107}
108
void AMDGPUAsmPrinter::emitStartOfAsmFile(Module &M) {
  // Only reset the flag here; actual setup is deferred to initTargetStreamer
  // so that passes running before the first function can still set metadata.
  IsTargetStreamerInitialized = false;
}
112
// Lazily-run module-level setup: resolves the target ID and emits the
// module-scope directives required for AMDHSA / AMDPAL output.
void AMDGPUAsmPrinter::initTargetStreamer(Module &M) {
  IsTargetStreamerInitialized = true;

  // TODO: Which one is called first, emitStartOfAsmFile or
  // emitFunctionBodyStart?
  if (getTargetStreamer() && !getTargetStreamer()->getTargetID())
    initializeTargetID(M);

  // Other OSes (e.g. Mesa) emit no module-level directives here.
  if (TM.getTargetTriple().getOS() != Triple::AMDHSA &&
      TM.getTargetTriple().getOS() != Triple::AMDPAL)
    return;

  getTargetStreamer()->EmitDirectiveAMDGCNTarget();

  if (TM.getTargetTriple().getOS() == Triple::AMDHSA) {
    // HSA additionally records the code object version and opens the
    // metadata stream that per-kernel metadata is appended to later.
    getTargetStreamer()->EmitDirectiveAMDHSACodeObjectVersion(
        COV: CodeObjectVersion);
    HSAMetadataStream->begin(Mod: M, TargetID: *getTargetStreamer()->getTargetID());
  }

  if (TM.getTargetTriple().getOS() == Triple::AMDPAL)
    getTargetStreamer()->getPALMetadata()->readFromIR(M);
}
136
void AMDGPUAsmPrinter::emitEndOfAsmFile(Module &M) {
  // Init target streamer if it has not yet happened
  if (!IsTargetStreamerInitialized)
    initTargetStreamer(M);

  // Non-HSA targets record the ISA version as a plain directive.
  if (TM.getTargetTriple().getOS() != Triple::AMDHSA)
    getTargetStreamer()->EmitISAVersion();

  // Emit HSA Metadata (NT_AMD_AMDGPU_HSA_METADATA).
  if (TM.getTargetTriple().getOS() == Triple::AMDHSA) {
    HSAMetadataStream->end();
    bool Success = HSAMetadataStream->emitTo(TargetStreamer&: *getTargetStreamer());
    (void)Success;
    assert(Success && "Malformed HSA Metadata");
  }
}
154
// Per-function prologue emission: validates the target ID against the module,
// then emits the OS-specific kernel headers for entry functions.
void AMDGPUAsmPrinter::emitFunctionBodyStart() {
  const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
  const Function &F = MF->getFunction();

  // TODO: We're checking this late, would be nice to check it earlier.
  if (STM.requiresCodeObjectV6() && CodeObjectVersion < AMDGPU::AMDHSA_COV6) {
    report_fatal_error(
        STM.getCPU() + " is only available on code object version 6 or better",
        /*gen_crash_diag*/ false);
  }

  // TODO: Which one is called first, emitStartOfAsmFile or
  // emitFunctionBodyStart?
  if (!getTargetStreamer()->getTargetID())
    initializeTargetID(M: *F.getParent());

  const auto &FunctionTargetID = STM.getTargetID();
  // Make sure function's xnack settings are compatible with module's
  // xnack settings.
  if (FunctionTargetID.isXnackSupported() &&
      FunctionTargetID.getXnackSetting() != IsaInfo::TargetIDSetting::Any &&
      FunctionTargetID.getXnackSetting() != getTargetStreamer()->getTargetID()->getXnackSetting()) {
    OutContext.reportError(L: {}, Msg: "xnack setting of '" + Twine(MF->getName()) +
                           "' function does not match module xnack setting");
    return;
  }
  // Make sure function's sramecc settings are compatible with module's
  // sramecc settings.
  if (FunctionTargetID.isSramEccSupported() &&
      FunctionTargetID.getSramEccSetting() != IsaInfo::TargetIDSetting::Any &&
      FunctionTargetID.getSramEccSetting() != getTargetStreamer()->getTargetID()->getSramEccSetting()) {
    OutContext.reportError(L: {}, Msg: "sramecc setting of '" + Twine(MF->getName()) +
                           "' function does not match module sramecc setting");
    return;
  }

  // Everything below applies to kernel entry points only.
  if (!MFI.isEntryFunction())
    return;

  // Mesa kernels get the legacy amd_kernel_code_t header emitted inline.
  if (STM.isMesaKernel(F) &&
      (F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
       F.getCallingConv() == CallingConv::SPIR_KERNEL)) {
    amd_kernel_code_t KernelCode;
    getAmdKernelCode(Out&: KernelCode, KernelInfo: CurrentProgramInfo, MF: *MF);
    getTargetStreamer()->EmitAMDKernelCodeT(Header: KernelCode);
  }

  if (STM.isAmdHsaOS())
    HSAMetadataStream->emitKernel(MF: *MF, ProgramInfo: CurrentProgramInfo);

  // Kernels that preload kernarg data into SGPRs emit an extra header first.
  if (MFI.getNumKernargPreloadedSGPRs() > 0) {
    assert(AMDGPU::hasKernargPreload(STM));
    getTargetStreamer()->EmitKernargPreloadHeader(STI: *getGlobalSTI(),
                                                  TrapEnabled: STM.isAmdHsaOS());
  }
}
212
// After a kernel body has been printed, emit its kernel descriptor into the
// read-only data section (AMDHSA entry functions only).
void AMDGPUAsmPrinter::emitFunctionBodyEnd() {
  const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
  if (!MFI.isEntryFunction())
    return;

  if (TM.getTargetTriple().getOS() != Triple::AMDHSA)
    return;

  auto &Streamer = getTargetStreamer()->getStreamer();
  auto &Context = Streamer.getContext();
  auto &ObjectFileInfo = *Context.getObjectFileInfo();
  auto &ReadOnlySection = *ObjectFileInfo.getReadOnlySection();

  // Save and restore the current section around the descriptor emission.
  Streamer.pushSection();
  Streamer.switchSection(Section: &ReadOnlySection);

  // CP microcode requires the kernel descriptor to be allocated on 64 byte
  // alignment.
  Streamer.emitValueToAlignment(Alignment: Align(64), Value: 0, ValueSize: 1, MaxBytesToEmit: 0);
  ReadOnlySection.ensureMinAlignment(MinAlignment: Align(64));

  const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();

  SmallString<128> KernelName;
  getNameWithPrefix(Name&: KernelName, GV: &MF->getFunction());
  // The SGPR count passed here excludes the extra SGPRs (VCC, flat scratch,
  // XNACK) that getNumExtraSGPRs accounts for.
  getTargetStreamer()->EmitAmdhsaKernelDescriptor(
      STM, KernelName, getAmdhsaKernelDescriptor(MF: *MF, PI: CurrentProgramInfo),
      CurrentProgramInfo.NumVGPRsForWavesPerEU,
      CurrentProgramInfo.NumSGPRsForWavesPerEU -
          IsaInfo::getNumExtraSGPRs(
              &STM, CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed,
              getTargetStreamer()->getTargetID()->isXnackOnOrAny()),
      CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed);

  Streamer.popSection();
}
249
250void AMDGPUAsmPrinter::emitImplicitDef(const MachineInstr *MI) const {
251 Register RegNo = MI->getOperand(i: 0).getReg();
252
253 SmallString<128> Str;
254 raw_svector_ostream OS(Str);
255 OS << "implicit-def: "
256 << printReg(Reg: RegNo, TRI: MF->getSubtarget().getRegisterInfo());
257
258 if (MI->getAsmPrinterFlags() & AMDGPU::SGPR_SPILL)
259 OS << " : SGPR spill to VGPR lane";
260
261 OutStreamer->AddComment(T: OS.str());
262 OutStreamer->addBlankLine();
263}
264
265void AMDGPUAsmPrinter::emitFunctionEntryLabel() {
266 if (TM.getTargetTriple().getOS() == Triple::AMDHSA) {
267 AsmPrinter::emitFunctionEntryLabel();
268 return;
269 }
270
271 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
272 const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
273 if (MFI->isEntryFunction() && STM.isAmdHsaOrMesa(MF->getFunction())) {
274 SmallString<128> SymbolName;
275 getNameWithPrefix(Name&: SymbolName, GV: &MF->getFunction()),
276 getTargetStreamer()->EmitAMDGPUSymbolType(
277 SymbolName, Type: ELF::STT_AMDGPU_HSA_KERNEL);
278 }
279 if (DumpCodeInstEmitter) {
280 // Disassemble function name label to text.
281 DisasmLines.push_back(x: MF->getName().str() + ":");
282 DisasmLineMaxLen = std::max(a: DisasmLineMaxLen, b: DisasmLines.back().size());
283 HexLines.push_back(x: "");
284 }
285
286 AsmPrinter::emitFunctionEntryLabel();
287}
288
289void AMDGPUAsmPrinter::emitBasicBlockStart(const MachineBasicBlock &MBB) {
290 if (DumpCodeInstEmitter && !isBlockOnlyReachableByFallthrough(MBB: &MBB)) {
291 // Write a line for the basic block label if it is not only fallthrough.
292 DisasmLines.push_back(
293 x: (Twine("BB") + Twine(getFunctionNumber())
294 + "_" + Twine(MBB.getNumber()) + ":").str());
295 DisasmLineMaxLen = std::max(a: DisasmLineMaxLen, b: DisasmLines.back().size());
296 HexLines.push_back(x: "");
297 }
298 AsmPrinter::emitBasicBlockStart(MBB);
299}
300
// LDS (local address space) globals are not emitted as data; on non-HSA/PAL
// targets they are declared via a target LDS directive instead. All other
// globals defer to the base implementation.
void AMDGPUAsmPrinter::emitGlobalVariable(const GlobalVariable *GV) {
  if (GV->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
    // LDS cannot carry a compile-time initializer; only undef is accepted.
    if (GV->hasInitializer() && !isa<UndefValue>(Val: GV->getInitializer())) {
      OutContext.reportError(L: {},
                             Msg: Twine(GV->getName()) +
                                 ": unsupported initializer for address space");
      return;
    }

    // LDS variables aren't emitted in HSA or PAL yet.
    const Triple::OSType OS = TM.getTargetTriple().getOS();
    if (OS == Triple::AMDHSA || OS == Triple::AMDPAL)
      return;

    MCSymbol *GVSym = getSymbol(GV);

    GVSym->redefineIfPossible();
    if (GVSym->isDefined() || GVSym->isVariable())
      report_fatal_error(reason: "symbol '" + Twine(GVSym->getName()) +
                         "' is already defined");

    const DataLayout &DL = GV->getParent()->getDataLayout();
    uint64_t Size = DL.getTypeAllocSize(Ty: GV->getValueType());
    // LDS allocations default to 4-byte alignment when none is specified.
    Align Alignment = GV->getAlign().value_or(u: Align(4));

    emitVisibility(Sym: GVSym, Visibility: GV->getVisibility(), IsDefinition: !GV->isDeclaration());
    emitLinkage(GV, GVSym);
    auto TS = getTargetStreamer();
    TS->emitAMDGPULDS(Symbol: GVSym, Size, Alignment);
    return;
  }

  AsmPrinter::emitGlobalVariable(GV);
}
335
336bool AMDGPUAsmPrinter::doInitialization(Module &M) {
337 CodeObjectVersion = AMDGPU::getAMDHSACodeObjectVersion(M);
338
339 if (TM.getTargetTriple().getOS() == Triple::AMDHSA) {
340 switch (CodeObjectVersion) {
341 case AMDGPU::AMDHSA_COV4:
342 HSAMetadataStream.reset(p: new HSAMD::MetadataStreamerMsgPackV4());
343 break;
344 case AMDGPU::AMDHSA_COV5:
345 HSAMetadataStream.reset(p: new HSAMD::MetadataStreamerMsgPackV5());
346 break;
347 case AMDGPU::AMDHSA_COV6:
348 HSAMetadataStream.reset(p: new HSAMD::MetadataStreamerMsgPackV6());
349 break;
350 default:
351 report_fatal_error(reason: "Unexpected code object version");
352 }
353 }
354 return AsmPrinter::doInitialization(M);
355}
356
357bool AMDGPUAsmPrinter::doFinalization(Module &M) {
358 // Pad with s_code_end to help tools and guard against instruction prefetch
359 // causing stale data in caches. Arguably this should be done by the linker,
360 // which is why this isn't done for Mesa.
361 const MCSubtargetInfo &STI = *getGlobalSTI();
362 if ((AMDGPU::isGFX10Plus(STI) || AMDGPU::isGFX90A(STI)) &&
363 (STI.getTargetTriple().getOS() == Triple::AMDHSA ||
364 STI.getTargetTriple().getOS() == Triple::AMDPAL)) {
365 OutStreamer->switchSection(Section: getObjFileLowering().getTextSection());
366 getTargetStreamer()->EmitCodeEnd(STI);
367 }
368
369 return AsmPrinter::doFinalization(M);
370}
371
372// Print comments that apply to both callable functions and entry points.
373void AMDGPUAsmPrinter::emitCommonFunctionComments(
374 uint32_t NumVGPR, std::optional<uint32_t> NumAGPR, uint32_t TotalNumVGPR,
375 uint32_t NumSGPR, uint64_t ScratchSize, uint64_t CodeSize,
376 const AMDGPUMachineFunction *MFI) {
377 OutStreamer->emitRawComment(T: " codeLenInByte = " + Twine(CodeSize), TabPrefix: false);
378 OutStreamer->emitRawComment(T: " NumSgprs: " + Twine(NumSGPR), TabPrefix: false);
379 OutStreamer->emitRawComment(T: " NumVgprs: " + Twine(NumVGPR), TabPrefix: false);
380 if (NumAGPR) {
381 OutStreamer->emitRawComment(T: " NumAgprs: " + Twine(*NumAGPR), TabPrefix: false);
382 OutStreamer->emitRawComment(T: " TotalNumVgprs: " + Twine(TotalNumVGPR),
383 TabPrefix: false);
384 }
385 OutStreamer->emitRawComment(T: " ScratchSize: " + Twine(ScratchSize), TabPrefix: false);
386 OutStreamer->emitRawComment(T: " MemoryBound: " + Twine(MFI->isMemoryBound()),
387 TabPrefix: false);
388}
389
// Computes the kernel_code_properties field of the HSA kernel descriptor:
// one bit per user SGPR the hardware must set up, plus wavefront size and
// dynamic-stack flags.
uint16_t AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties(
    const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  uint16_t KernelCodeProperties = 0;
  const GCNUserSGPRUsageInfo &UserSGPRInfo = MFI.getUserSGPRInfo();

  if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
    KernelCodeProperties |=
        amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER;
  }
  if (UserSGPRInfo.hasDispatchPtr()) {
    KernelCodeProperties |=
        amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
  }
  // The queue-pointer user SGPR is only requested for code object versions
  // before v5.
  if (UserSGPRInfo.hasQueuePtr() && CodeObjectVersion < AMDGPU::AMDHSA_COV5) {
    KernelCodeProperties |=
        amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR;
  }
  if (UserSGPRInfo.hasKernargSegmentPtr()) {
    KernelCodeProperties |=
        amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR;
  }
  if (UserSGPRInfo.hasDispatchID()) {
    KernelCodeProperties |=
        amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID;
  }
  if (UserSGPRInfo.hasFlatScratchInit()) {
    KernelCodeProperties |=
        amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT;
  }
  if (MF.getSubtarget<GCNSubtarget>().isWave32()) {
    KernelCodeProperties |=
        amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32;
  }

  // The dynamic-stack bit only exists from code object v5 onwards.
  if (CurrentProgramInfo.DynamicCallStack &&
      CodeObjectVersion >= AMDGPU::AMDHSA_COV5)
    KernelCodeProperties |= amdhsa::KERNEL_CODE_PROPERTY_USES_DYNAMIC_STACK;

  return KernelCodeProperties;
}
431
// Assembles the MC-level HSA kernel descriptor for a kernel from the
// computed program info (segment sizes, PGM_RSRC registers, code properties,
// and kernarg preload count).
MCKernelDescriptor
AMDGPUAsmPrinter::getAmdhsaKernelDescriptor(const MachineFunction &MF,
                                            const SIProgramInfo &PI) const {
  const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  MCContext &Ctx = MF.getContext();

  MCKernelDescriptor KernelDescriptor;

  // Descriptor fields are 32-bit; make sure nothing was truncated.
  assert(isUInt<32>(PI.ScratchSize));
  assert(isUInt<32>(PI.getComputePGMRSrc1(STM)));
  assert(isUInt<32>(PI.getComputePGMRSrc2()));

  KernelDescriptor.group_segment_fixed_size =
      MCConstantExpr::create(Value: PI.LDSSize, Ctx);
  KernelDescriptor.private_segment_fixed_size =
      MCConstantExpr::create(Value: PI.ScratchSize, Ctx);

  Align MaxKernArgAlign;
  KernelDescriptor.kernarg_size = MCConstantExpr::create(
      Value: STM.getKernArgSegmentSize(F, MaxKernArgAlign), Ctx);

  KernelDescriptor.compute_pgm_rsrc1 =
      MCConstantExpr::create(Value: PI.getComputePGMRSrc1(ST: STM), Ctx);
  KernelDescriptor.compute_pgm_rsrc2 =
      MCConstantExpr::create(Value: PI.getComputePGMRSrc2(), Ctx);
  KernelDescriptor.kernel_code_properties =
      MCConstantExpr::create(Value: getAmdhsaKernelCodeProperties(MF), Ctx);

  // PGM_RSRC3 is only populated on gfx90a; it must be zero elsewhere.
  assert(STM.hasGFX90AInsts() || CurrentProgramInfo.ComputePGMRSrc3GFX90A == 0);
  KernelDescriptor.compute_pgm_rsrc3 = MCConstantExpr::create(
      Value: STM.hasGFX90AInsts() ? CurrentProgramInfo.ComputePGMRSrc3GFX90A : 0, Ctx);

  KernelDescriptor.kernarg_preload = MCConstantExpr::create(
      Value: AMDGPU::hasKernargPreload(STM) ? Info->getNumKernargPreloadedSGPRs() : 0,
      Ctx);

  return KernelDescriptor;
}
472
// Main per-function driver: computes resource usage, emits OS-specific
// program info and the function body, then (verbosely) resource comments and
// the optional -dumpcode disassembly section.
bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
  // Init target streamer lazily on the first function so that previous passes
  // can set metadata.
  if (!IsTargetStreamerInitialized)
    initTargetStreamer(M&: *MF.getFunction().getParent());

  // Register/scratch usage comes from a dedicated analysis; program info is
  // recomputed from scratch for each function.
  ResourceUsage = &getAnalysis<AMDGPUResourceUsageAnalysis>();
  CurrentProgramInfo = SIProgramInfo();

  const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();

  // The starting address of all shader programs must be 256 bytes aligned.
  // Regular functions just need the basic required instruction alignment.
  MF.setAlignment(MFI->isEntryFunction() ? Align(256) : Align(4));

  SetupMachineFunction(MF);

  const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
  MCContext &Context = getObjFileLowering().getContext();
  // FIXME: This should be an explicit check for Mesa.
  if (!STM.isAmdHsaOS() && !STM.isAmdPalOS()) {
    MCSectionELF *ConfigSection =
        Context.getELFSection(Section: ".AMDGPU.config", Type: ELF::SHT_PROGBITS, Flags: 0);
    OutStreamer->switchSection(Section: ConfigSection);
  }

  if (MFI->isModuleEntryFunction()) {
    getSIProgramInfo(Out&: CurrentProgramInfo, MF);
  }

  // Program info goes into an OS-specific container: PAL metadata on AMDPAL,
  // SI config registers otherwise (HSA handles it via the kernel descriptor).
  if (STM.isAmdPalOS()) {
    if (MFI->isEntryFunction())
      EmitPALMetadata(MF, KernelInfo: CurrentProgramInfo);
    else if (MFI->isModuleEntryFunction())
      emitPALFunctionMetadata(MF);
  } else if (!STM.isAmdHsaOS()) {
    EmitProgramInfoSI(MF, KernelInfo: CurrentProgramInfo);
  }

  DumpCodeInstEmitter = nullptr;
  if (STM.dumpCode()) {
    // For -dumpcode, get the assembler out of the streamer, even if it does
    // not really want to let us have it. This only works with -filetype=obj.
    bool SaveFlag = OutStreamer->getUseAssemblerInfoForParsing();
    OutStreamer->setUseAssemblerInfoForParsing(true);
    MCAssembler *Assembler = OutStreamer->getAssemblerPtr();
    OutStreamer->setUseAssemblerInfoForParsing(SaveFlag);
    if (Assembler)
      DumpCodeInstEmitter = Assembler->getEmitterPtr();
  }

  // Fresh disassembly buffers for this function; they are filled while the
  // body is emitted and flushed to .AMDGPU.disasm below.
  DisasmLines.clear();
  HexLines.clear();
  DisasmLineMaxLen = 0;

  emitFunctionBody();

  emitResourceUsageRemarks(MF, CurrentProgramInfo, isModuleEntryFunction: MFI->isModuleEntryFunction(),
                           hasMAIInsts: STM.hasMAIInsts());

  if (isVerbose()) {
    MCSectionELF *CommentSection =
        Context.getELFSection(Section: ".AMDGPU.csdata", Type: ELF::SHT_PROGBITS, Flags: 0);
    OutStreamer->switchSection(Section: CommentSection);

    // Non-entry functions get their comments from the per-function resource
    // analysis and skip the kernel-only sections entirely.
    if (!MFI->isEntryFunction()) {
      OutStreamer->emitRawComment(T: " Function info:", TabPrefix: false);
      const AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo &Info =
          ResourceUsage->getResourceInfo(F: &MF.getFunction());
      emitCommonFunctionComments(
          NumVGPR: Info.NumVGPR,
          NumAGPR: STM.hasMAIInsts() ? Info.NumAGPR : std::optional<uint32_t>(),
          TotalNumVGPR: Info.getTotalNumVGPRs(ST: STM),
          NumSGPR: Info.getTotalNumSGPRs(ST: MF.getSubtarget<GCNSubtarget>()),
          ScratchSize: Info.PrivateSegmentSize, CodeSize: getFunctionCodeSize(MF), MFI);
      return false;
    }

    OutStreamer->emitRawComment(T: " Kernel info:", TabPrefix: false);
    emitCommonFunctionComments(
        NumVGPR: CurrentProgramInfo.NumArchVGPR,
        NumAGPR: STM.hasMAIInsts() ? CurrentProgramInfo.NumAccVGPR
                          : std::optional<uint32_t>(),
        TotalNumVGPR: CurrentProgramInfo.NumVGPR, NumSGPR: CurrentProgramInfo.NumSGPR,
        ScratchSize: CurrentProgramInfo.ScratchSize, CodeSize: getFunctionCodeSize(MF), MFI);

    OutStreamer->emitRawComment(
        T: " FloatMode: " + Twine(CurrentProgramInfo.FloatMode), TabPrefix: false);
    OutStreamer->emitRawComment(
        T: " IeeeMode: " + Twine(CurrentProgramInfo.IEEEMode), TabPrefix: false);
    OutStreamer->emitRawComment(
        T: " LDSByteSize: " + Twine(CurrentProgramInfo.LDSSize) +
        " bytes/workgroup (compile time only)", TabPrefix: false);

    OutStreamer->emitRawComment(
        T: " SGPRBlocks: " + Twine(CurrentProgramInfo.SGPRBlocks), TabPrefix: false);
    OutStreamer->emitRawComment(
        T: " VGPRBlocks: " + Twine(CurrentProgramInfo.VGPRBlocks), TabPrefix: false);

    OutStreamer->emitRawComment(
        T: " NumSGPRsForWavesPerEU: " +
        Twine(CurrentProgramInfo.NumSGPRsForWavesPerEU), TabPrefix: false);
    OutStreamer->emitRawComment(
        T: " NumVGPRsForWavesPerEU: " +
        Twine(CurrentProgramInfo.NumVGPRsForWavesPerEU), TabPrefix: false);

    if (STM.hasGFX90AInsts())
      OutStreamer->emitRawComment(
          T: " AccumOffset: " +
          Twine((CurrentProgramInfo.AccumOffset + 1) * 4), TabPrefix: false);

    OutStreamer->emitRawComment(
        T: " Occupancy: " +
        Twine(CurrentProgramInfo.Occupancy), TabPrefix: false);

    OutStreamer->emitRawComment(
        T: " WaveLimiterHint : " + Twine(MFI->needsWaveLimiter()), TabPrefix: false);

    OutStreamer->emitRawComment(T: " COMPUTE_PGM_RSRC2:SCRATCH_EN: " +
                                Twine(CurrentProgramInfo.ScratchEnable),
                                TabPrefix: false);
    OutStreamer->emitRawComment(T: " COMPUTE_PGM_RSRC2:USER_SGPR: " +
                                Twine(CurrentProgramInfo.UserSGPR),
                                TabPrefix: false);
    OutStreamer->emitRawComment(T: " COMPUTE_PGM_RSRC2:TRAP_HANDLER: " +
                                Twine(CurrentProgramInfo.TrapHandlerEnable),
                                TabPrefix: false);
    OutStreamer->emitRawComment(T: " COMPUTE_PGM_RSRC2:TGID_X_EN: " +
                                Twine(CurrentProgramInfo.TGIdXEnable),
                                TabPrefix: false);
    OutStreamer->emitRawComment(T: " COMPUTE_PGM_RSRC2:TGID_Y_EN: " +
                                Twine(CurrentProgramInfo.TGIdYEnable),
                                TabPrefix: false);
    OutStreamer->emitRawComment(T: " COMPUTE_PGM_RSRC2:TGID_Z_EN: " +
                                Twine(CurrentProgramInfo.TGIdZEnable),
                                TabPrefix: false);
    OutStreamer->emitRawComment(T: " COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: " +
                                Twine(CurrentProgramInfo.TIdIGCompCount),
                                TabPrefix: false);

    // PGM_RSRC3 is only populated on gfx90a; it must be zero elsewhere.
    assert(STM.hasGFX90AInsts() ||
           CurrentProgramInfo.ComputePGMRSrc3GFX90A == 0);
    if (STM.hasGFX90AInsts()) {
      OutStreamer->emitRawComment(
          T: " COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: " +
          Twine((AMDHSA_BITS_GET(CurrentProgramInfo.ComputePGMRSrc3GFX90A,
                 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET))),
          TabPrefix: false);
      OutStreamer->emitRawComment(
          T: " COMPUTE_PGM_RSRC3_GFX90A:TG_SPLIT: " +
          Twine((AMDHSA_BITS_GET(CurrentProgramInfo.ComputePGMRSrc3GFX90A,
                 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT))),
          TabPrefix: false);
    }
  }

  // Flush the collected disassembly text, each line followed by its encoded
  // bytes as a comment, into a dedicated section.
  if (DumpCodeInstEmitter) {

    OutStreamer->switchSection(
        Section: Context.getELFSection(Section: ".AMDGPU.disasm", Type: ELF::SHT_PROGBITS, Flags: 0));

    for (size_t i = 0; i < DisasmLines.size(); ++i) {
      std::string Comment = "\n";
      if (!HexLines[i].empty()) {
        Comment = std::string(DisasmLineMaxLen - DisasmLines[i].size(), ' ');
        Comment += " ; " + HexLines[i] + "\n";
      }

      OutStreamer->emitBytes(Data: StringRef(DisasmLines[i]));
      OutStreamer->emitBytes(Data: StringRef(Comment));
    }
  }

  return false;
}
648
// TODO: Fold this into emitFunctionBodyStart.
// Resolves the module-wide target ID (xnack / sramecc settings) from the
// global feature string, refined by the first function that pins each
// feature to On or Off.
void AMDGPUAsmPrinter::initializeTargetID(const Module &M) {
  // In the beginning all features are either 'Any' or 'NotSupported',
  // depending on global target features. This will cover empty modules.
  getTargetStreamer()->initializeTargetID(STI: *getGlobalSTI(),
                                          FeatureString: getGlobalSTI()->getFeatureString());

  // If module is empty, we are done.
  if (M.empty())
    return;

  // If module is not empty, need to find first 'Off' or 'On' feature
  // setting per feature from functions in module.
  for (auto &F : M) {
    auto &TSTargetID = getTargetStreamer()->getTargetID();
    // Stop early once every supported feature has been pinned to On or Off.
    if ((!TSTargetID->isXnackSupported() || TSTargetID->isXnackOnOrOff()) &&
        (!TSTargetID->isSramEccSupported() || TSTargetID->isSramEccOnOrOff()))
      break;

    const GCNSubtarget &STM = TM.getSubtarget<GCNSubtarget>(F);
    const IsaInfo::AMDGPUTargetID &STMTargetID = STM.getTargetID();
    if (TSTargetID->isXnackSupported())
      if (TSTargetID->getXnackSetting() == IsaInfo::TargetIDSetting::Any)
        TSTargetID->setXnackSetting(STMTargetID.getXnackSetting());
    if (TSTargetID->isSramEccSupported())
      if (TSTargetID->getSramEccSetting() == IsaInfo::TargetIDSetting::Any)
        TSTargetID->setSramEccSetting(STMTargetID.getSramEccSetting());
  }
}
678
679uint64_t AMDGPUAsmPrinter::getFunctionCodeSize(const MachineFunction &MF) const {
680 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
681 const SIInstrInfo *TII = STM.getInstrInfo();
682
683 uint64_t CodeSize = 0;
684
685 for (const MachineBasicBlock &MBB : MF) {
686 for (const MachineInstr &MI : MBB) {
687 // TODO: CodeSize should account for multiple functions.
688
689 // TODO: Should we count size of debug info?
690 if (MI.isDebugInstr())
691 continue;
692
693 CodeSize += TII->getInstSizeInBytes(MI);
694 }
695 }
696
697 return CodeSize;
698}
699
700void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
701 const MachineFunction &MF) {
702 const AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo &Info =
703 ResourceUsage->getResourceInfo(F: &MF.getFunction());
704 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
705
706 ProgInfo.NumArchVGPR = Info.NumVGPR;
707 ProgInfo.NumAccVGPR = Info.NumAGPR;
708 ProgInfo.NumVGPR = Info.getTotalNumVGPRs(ST: STM);
709 ProgInfo.AccumOffset = alignTo(Value: std::max(a: 1, b: Info.NumVGPR), Align: 4) / 4 - 1;
710 ProgInfo.TgSplit = STM.isTgSplitEnabled();
711 ProgInfo.NumSGPR = Info.NumExplicitSGPR;
712 ProgInfo.ScratchSize = Info.PrivateSegmentSize;
713 ProgInfo.VCCUsed = Info.UsesVCC;
714 ProgInfo.FlatUsed = Info.UsesFlatScratch;
715 ProgInfo.DynamicCallStack = Info.HasDynamicallySizedStack || Info.HasRecursion;
716
717 const uint64_t MaxScratchPerWorkitem =
718 STM.getMaxWaveScratchSize() / STM.getWavefrontSize();
719 if (ProgInfo.ScratchSize > MaxScratchPerWorkitem) {
720 DiagnosticInfoStackSize DiagStackSize(MF.getFunction(),
721 ProgInfo.ScratchSize,
722 MaxScratchPerWorkitem, DS_Error);
723 MF.getFunction().getContext().diagnose(DI: DiagStackSize);
724 }
725
726 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
727
728 // The calculations related to SGPR/VGPR blocks are
729 // duplicated in part in AMDGPUAsmParser::calculateGPRBlocks, and could be
730 // unified.
731 unsigned ExtraSGPRs = IsaInfo::getNumExtraSGPRs(
732 &STM, ProgInfo.VCCUsed, ProgInfo.FlatUsed,
733 getTargetStreamer()->getTargetID()->isXnackOnOrAny());
734
735 // Check the addressable register limit before we add ExtraSGPRs.
736 if (STM.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
737 !STM.hasSGPRInitBug()) {
738 unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
739 if (ProgInfo.NumSGPR > MaxAddressableNumSGPRs) {
740 // This can happen due to a compiler bug or when using inline asm.
741 LLVMContext &Ctx = MF.getFunction().getContext();
742 DiagnosticInfoResourceLimit Diag(
743 MF.getFunction(), "addressable scalar registers", ProgInfo.NumSGPR,
744 MaxAddressableNumSGPRs, DS_Error, DK_ResourceLimit);
745 Ctx.diagnose(DI: Diag);
746 ProgInfo.NumSGPR = MaxAddressableNumSGPRs - 1;
747 }
748 }
749
750 // Account for extra SGPRs and VGPRs reserved for debugger use.
751 ProgInfo.NumSGPR += ExtraSGPRs;
752
753 const Function &F = MF.getFunction();
754
755 // Ensure there are enough SGPRs and VGPRs for wave dispatch, where wave
756 // dispatch registers are function args.
757 unsigned WaveDispatchNumSGPR = 0, WaveDispatchNumVGPR = 0;
758
759 if (isShader(CC: F.getCallingConv())) {
760 bool IsPixelShader =
761 F.getCallingConv() == CallingConv::AMDGPU_PS && !STM.isAmdHsaOS();
762
763 // Calculate the number of VGPR registers based on the SPI input registers
764 uint32_t InputEna = 0;
765 uint32_t InputAddr = 0;
766 unsigned LastEna = 0;
767
768 if (IsPixelShader) {
769 // Note for IsPixelShader:
770 // By this stage, all enabled inputs are tagged in InputAddr as well.
771 // We will use InputAddr to determine whether the input counts against the
772 // vgpr total and only use the InputEnable to determine the last input
773 // that is relevant - if extra arguments are used, then we have to honour
774 // the InputAddr for any intermediate non-enabled inputs.
775 InputEna = MFI->getPSInputEnable();
776 InputAddr = MFI->getPSInputAddr();
777
778 // We only need to consider input args up to the last used arg.
779 assert((InputEna || InputAddr) &&
780 "PSInputAddr and PSInputEnable should "
781 "never both be 0 for AMDGPU_PS shaders");
782 // There are some rare circumstances where InputAddr is non-zero and
783 // InputEna can be set to 0. In this case we default to setting LastEna
784 // to 1.
785 LastEna = InputEna ? llvm::Log2_32(Value: InputEna) + 1 : 1;
786 }
787
788 // FIXME: We should be using the number of registers determined during
789 // calling convention lowering to legalize the types.
790 const DataLayout &DL = F.getParent()->getDataLayout();
791 unsigned PSArgCount = 0;
792 unsigned IntermediateVGPR = 0;
793 for (auto &Arg : F.args()) {
794 unsigned NumRegs = (DL.getTypeSizeInBits(Ty: Arg.getType()) + 31) / 32;
795 if (Arg.hasAttribute(Attribute::Kind: InReg)) {
796 WaveDispatchNumSGPR += NumRegs;
797 } else {
798 // If this is a PS shader and we're processing the PS Input args (first
799 // 16 VGPR), use the InputEna and InputAddr bits to define how many
800 // VGPRs are actually used.
801 // Any extra VGPR arguments are handled as normal arguments (and
802 // contribute to the VGPR count whether they're used or not).
803 if (IsPixelShader && PSArgCount < 16) {
804 if ((1 << PSArgCount) & InputAddr) {
805 if (PSArgCount < LastEna)
806 WaveDispatchNumVGPR += NumRegs;
807 else
808 IntermediateVGPR += NumRegs;
809 }
810 PSArgCount++;
811 } else {
812 // If there are extra arguments we have to include the allocation for
813 // the non-used (but enabled with InputAddr) input arguments
814 if (IntermediateVGPR) {
815 WaveDispatchNumVGPR += IntermediateVGPR;
816 IntermediateVGPR = 0;
817 }
818 WaveDispatchNumVGPR += NumRegs;
819 }
820 }
821 }
822 ProgInfo.NumSGPR = std::max(a: ProgInfo.NumSGPR, b: WaveDispatchNumSGPR);
823 ProgInfo.NumArchVGPR = std::max(a: ProgInfo.NumVGPR, b: WaveDispatchNumVGPR);
824 ProgInfo.NumVGPR =
825 Info.getTotalNumVGPRs(ST: STM, NumAGPR: Info.NumAGPR, NumVGPR: ProgInfo.NumArchVGPR);
826 }
827
828 // Adjust number of registers used to meet default/requested minimum/maximum
829 // number of waves per execution unit request.
830 ProgInfo.NumSGPRsForWavesPerEU = std::max(
831 a: std::max(a: ProgInfo.NumSGPR, b: 1u), b: STM.getMinNumSGPRs(WavesPerEU: MFI->getMaxWavesPerEU()));
832 ProgInfo.NumVGPRsForWavesPerEU = std::max(
833 a: std::max(a: ProgInfo.NumVGPR, b: 1u), b: STM.getMinNumVGPRs(WavesPerEU: MFI->getMaxWavesPerEU()));
834
835 if (STM.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS ||
836 STM.hasSGPRInitBug()) {
837 unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
838 if (ProgInfo.NumSGPR > MaxAddressableNumSGPRs) {
839 // This can happen due to a compiler bug or when using inline asm to use
840 // the registers which are usually reserved for vcc etc.
841 LLVMContext &Ctx = MF.getFunction().getContext();
842 DiagnosticInfoResourceLimit Diag(MF.getFunction(), "scalar registers",
843 ProgInfo.NumSGPR, MaxAddressableNumSGPRs,
844 DS_Error, DK_ResourceLimit);
845 Ctx.diagnose(DI: Diag);
846 ProgInfo.NumSGPR = MaxAddressableNumSGPRs;
847 ProgInfo.NumSGPRsForWavesPerEU = MaxAddressableNumSGPRs;
848 }
849 }
850
851 if (STM.hasSGPRInitBug()) {
852 ProgInfo.NumSGPR =
853 AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;
854 ProgInfo.NumSGPRsForWavesPerEU =
855 AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;
856 }
857
858 if (MFI->getNumUserSGPRs() > STM.getMaxNumUserSGPRs()) {
859 LLVMContext &Ctx = MF.getFunction().getContext();
860 DiagnosticInfoResourceLimit Diag(MF.getFunction(), "user SGPRs",
861 MFI->getNumUserSGPRs(),
862 STM.getMaxNumUserSGPRs(), DS_Error);
863 Ctx.diagnose(DI: Diag);
864 }
865
866 if (MFI->getLDSSize() >
867 static_cast<unsigned>(STM.getAddressableLocalMemorySize())) {
868 LLVMContext &Ctx = MF.getFunction().getContext();
869 DiagnosticInfoResourceLimit Diag(
870 MF.getFunction(), "local memory", MFI->getLDSSize(),
871 STM.getAddressableLocalMemorySize(), DS_Error);
872 Ctx.diagnose(DI: Diag);
873 }
874
875 ProgInfo.SGPRBlocks = IsaInfo::getNumSGPRBlocks(
876 &STM, ProgInfo.NumSGPRsForWavesPerEU);
877 ProgInfo.VGPRBlocks =
878 IsaInfo::getEncodedNumVGPRBlocks(&STM, ProgInfo.NumVGPRsForWavesPerEU);
879
880 const SIModeRegisterDefaults Mode = MFI->getMode();
881
882 // Set the value to initialize FP_ROUND and FP_DENORM parts of the mode
883 // register.
884 ProgInfo.FloatMode = getFPMode(Mode);
885
886 ProgInfo.IEEEMode = Mode.IEEE;
887
888 // Make clamp modifier on NaN input returns 0.
889 ProgInfo.DX10Clamp = Mode.DX10Clamp;
890
891 unsigned LDSAlignShift;
892 if (STM.getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) {
893 // LDS is allocated in 64 dword blocks.
894 LDSAlignShift = 8;
895 } else {
896 // LDS is allocated in 128 dword blocks.
897 LDSAlignShift = 9;
898 }
899
900 ProgInfo.SGPRSpill = MFI->getNumSpilledSGPRs();
901 ProgInfo.VGPRSpill = MFI->getNumSpilledVGPRs();
902
903 ProgInfo.LDSSize = MFI->getLDSSize();
904 ProgInfo.LDSBlocks =
905 alignTo(Value: ProgInfo.LDSSize, Align: 1ULL << LDSAlignShift) >> LDSAlignShift;
906
907 // Scratch is allocated in 64-dword or 256-dword blocks.
908 unsigned ScratchAlignShift =
909 STM.getGeneration() >= AMDGPUSubtarget::GFX11 ? 8 : 10;
910 // We need to program the hardware with the amount of scratch memory that
911 // is used by the entire wave. ProgInfo.ScratchSize is the amount of
912 // scratch memory used per thread.
913 ProgInfo.ScratchBlocks = divideCeil(
914 ProgInfo.ScratchSize * STM.getWavefrontSize(), 1ULL << ScratchAlignShift);
915
916 if (getIsaVersion(GPU: getGlobalSTI()->getCPU()).Major >= 10) {
917 ProgInfo.WgpMode = STM.isCuModeEnabled() ? 0 : 1;
918 ProgInfo.MemOrdered = 1;
919 }
920
921 // 0 = X, 1 = XY, 2 = XYZ
922 unsigned TIDIGCompCnt = 0;
923 if (MFI->hasWorkItemIDZ())
924 TIDIGCompCnt = 2;
925 else if (MFI->hasWorkItemIDY())
926 TIDIGCompCnt = 1;
927
928 // The private segment wave byte offset is the last of the system SGPRs. We
929 // initially assumed it was allocated, and may have used it. It shouldn't harm
930 // anything to disable it if we know the stack isn't used here. We may still
931 // have emitted code reading it to initialize scratch, but if that's unused
932 // reading garbage should be OK.
933 ProgInfo.ScratchEnable =
934 ProgInfo.ScratchBlocks > 0 || ProgInfo.DynamicCallStack;
935 ProgInfo.UserSGPR = MFI->getNumUserSGPRs();
936 // For AMDHSA, TRAP_HANDLER must be zero, as it is populated by the CP.
937 ProgInfo.TrapHandlerEnable =
938 STM.isAmdHsaOS() ? 0 : STM.isTrapHandlerEnabled();
939 ProgInfo.TGIdXEnable = MFI->hasWorkGroupIDX();
940 ProgInfo.TGIdYEnable = MFI->hasWorkGroupIDY();
941 ProgInfo.TGIdZEnable = MFI->hasWorkGroupIDZ();
942 ProgInfo.TGSizeEnable = MFI->hasWorkGroupInfo();
943 ProgInfo.TIdIGCompCount = TIDIGCompCnt;
944 ProgInfo.EXCPEnMSB = 0;
945 // For AMDHSA, LDS_SIZE must be zero, as it is populated by the CP.
946 ProgInfo.LdsSize = STM.isAmdHsaOS() ? 0 : ProgInfo.LDSBlocks;
947 ProgInfo.EXCPEnable = 0;
948
949 if (STM.hasGFX90AInsts()) {
950 AMDHSA_BITS_SET(ProgInfo.ComputePGMRSrc3GFX90A,
951 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET,
952 ProgInfo.AccumOffset);
953 AMDHSA_BITS_SET(ProgInfo.ComputePGMRSrc3GFX90A,
954 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT,
955 ProgInfo.TgSplit);
956 }
957
958 ProgInfo.Occupancy = STM.computeOccupancy(F: MF.getFunction(), LDSSize: ProgInfo.LDSSize,
959 NumSGPRs: ProgInfo.NumSGPRsForWavesPerEU,
960 NumVGPRs: ProgInfo.NumVGPRsForWavesPerEU);
961 const auto [MinWEU, MaxWEU] =
962 AMDGPU::getIntegerPairAttribute(F, Name: "amdgpu-waves-per-eu", Default: {0, 0}, OnlyFirstRequired: true);
963 if (ProgInfo.Occupancy < MinWEU) {
964 DiagnosticInfoOptimizationFailure Diag(
965 F, F.getSubprogram(),
966 "failed to meet occupancy target given by 'amdgpu-waves-per-eu' in "
967 "'" +
968 F.getName() + "': desired occupancy was " + Twine(MinWEU) +
969 ", final occupancy is " + Twine(ProgInfo.Occupancy));
970 F.getContext().diagnose(DI: Diag);
971 }
972}
973
974static unsigned getRsrcReg(CallingConv::ID CallConv) {
975 switch (CallConv) {
976 default: [[fallthrough]];
977 case CallingConv::AMDGPU_CS: return R_00B848_COMPUTE_PGM_RSRC1;
978 case CallingConv::AMDGPU_LS: return R_00B528_SPI_SHADER_PGM_RSRC1_LS;
979 case CallingConv::AMDGPU_HS: return R_00B428_SPI_SHADER_PGM_RSRC1_HS;
980 case CallingConv::AMDGPU_ES: return R_00B328_SPI_SHADER_PGM_RSRC1_ES;
981 case CallingConv::AMDGPU_GS: return R_00B228_SPI_SHADER_PGM_RSRC1_GS;
982 case CallingConv::AMDGPU_VS: return R_00B128_SPI_SHADER_PGM_RSRC1_VS;
983 case CallingConv::AMDGPU_PS: return R_00B028_SPI_SHADER_PGM_RSRC1_PS;
984 }
985}
986
// Emit the program info as a sequence of (register address, value) dword
// pairs directly into the output stream, for consumption by non-HSA drivers
// that parse these pairs to program the SPI/COMPUTE registers.
void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
                                         const SIProgramInfo &CurrentProgramInfo) {
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
  // RSRC1 register address for this function's hardware stage.
  unsigned RsrcReg = getRsrcReg(CallConv: MF.getFunction().getCallingConv());

  if (AMDGPU::isCompute(CC: MF.getFunction().getCallingConv())) {
    OutStreamer->emitInt32(R_00B848_COMPUTE_PGM_RSRC1);

    OutStreamer->emitInt32(Value: CurrentProgramInfo.getComputePGMRSrc1(ST: STM));

    OutStreamer->emitInt32(R_00B84C_COMPUTE_PGM_RSRC2);
    OutStreamer->emitInt32(Value: CurrentProgramInfo.getComputePGMRSrc2());

    // The WAVESIZE field encoding differs per generation (pre-GFX11, GFX11,
    // GFX12+), so pick the matching S_* helper.
    OutStreamer->emitInt32(R_00B860_COMPUTE_TMPRING_SIZE);
    OutStreamer->emitInt32(
        Value: STM.getGeneration() >= AMDGPUSubtarget::GFX12
            ? S_00B860_WAVESIZE_GFX12Plus(CurrentProgramInfo.ScratchBlocks)
            : STM.getGeneration() == AMDGPUSubtarget::GFX11
                  ? S_00B860_WAVESIZE_GFX11(CurrentProgramInfo.ScratchBlocks)
                  : S_00B860_WAVESIZE_PreGFX11(CurrentProgramInfo.ScratchBlocks));

    // TODO: Should probably note flat usage somewhere. SC emits a "FlatPtr32 =
    // 0" comment but I don't see a corresponding field in the register spec.
  } else {
    // Graphics shader: emit the stage-specific RSRC1 (VGPR/SGPR block counts)
    // plus the SPI scratch ring size.
    OutStreamer->emitInt32(Value: RsrcReg);
    OutStreamer->emitIntValue(S_00B028_VGPRS(CurrentProgramInfo.VGPRBlocks) |
                              S_00B028_SGPRS(CurrentProgramInfo.SGPRBlocks), Size: 4);
    OutStreamer->emitInt32(R_0286E8_SPI_TMPRING_SIZE);
    OutStreamer->emitInt32(
        Value: STM.getGeneration() >= AMDGPUSubtarget::GFX12
            ? S_0286E8_WAVESIZE_GFX12Plus(CurrentProgramInfo.ScratchBlocks)
            : STM.getGeneration() == AMDGPUSubtarget::GFX11
                  ? S_0286E8_WAVESIZE_GFX11(CurrentProgramInfo.ScratchBlocks)
                  : S_0286E8_WAVESIZE_PreGFX11(CurrentProgramInfo.ScratchBlocks));
  }

  // Pixel shaders additionally report extra LDS usage and the SPI input
  // enable/address masks.
  if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) {
    OutStreamer->emitInt32(R_00B02C_SPI_SHADER_PGM_RSRC2_PS);
    // On GFX11+ EXTRA_LDS_SIZE is counted in pairs of LDS blocks, hence the
    // divideCeil by 2.
    unsigned ExtraLDSSize = STM.getGeneration() >= AMDGPUSubtarget::GFX11
                                ? divideCeil(Numerator: CurrentProgramInfo.LDSBlocks, Denominator: 2)
                                : CurrentProgramInfo.LDSBlocks;
    OutStreamer->emitInt32(S_00B02C_EXTRA_LDS_SIZE(ExtraLDSSize));
    OutStreamer->emitInt32(R_0286CC_SPI_PS_INPUT_ENA);
    OutStreamer->emitInt32(Value: MFI->getPSInputEnable());
    OutStreamer->emitInt32(R_0286D0_SPI_PS_INPUT_ADDR);
    OutStreamer->emitInt32(Value: MFI->getPSInputAddr());
  }

  // Spill counts are reported under pseudo "register" keys.
  OutStreamer->emitInt32(R_SPILLED_SGPRS);
  OutStreamer->emitInt32(Value: MFI->getNumSpilledSGPRs());
  OutStreamer->emitInt32(R_SPILLED_VGPRS);
  OutStreamer->emitInt32(Value: MFI->getNumSpilledVGPRs());
}
1041
1042// Helper function to add common PAL Metadata 3.0+
1043static void EmitPALMetadataCommon(AMDGPUPALMetadata *MD,
1044 const SIProgramInfo &CurrentProgramInfo,
1045 CallingConv::ID CC, const GCNSubtarget &ST) {
1046 if (ST.hasIEEEMode())
1047 MD->setHwStage(CC, field: ".ieee_mode", Val: (bool)CurrentProgramInfo.IEEEMode);
1048
1049 MD->setHwStage(CC, field: ".wgp_mode", Val: (bool)CurrentProgramInfo.WgpMode);
1050 MD->setHwStage(CC, field: ".mem_ordered", Val: (bool)CurrentProgramInfo.MemOrdered);
1051
1052 if (AMDGPU::isCompute(CC)) {
1053 MD->setHwStage(CC, field: ".trap_present",
1054 Val: (bool)CurrentProgramInfo.TrapHandlerEnable);
1055 MD->setHwStage(CC, field: ".excp_en", Val: CurrentProgramInfo.EXCPEnable);
1056
1057 MD->setHwStage(CC, field: ".lds_size",
1058 Val: (unsigned)(CurrentProgramInfo.LdsSize *
1059 getLdsDwGranularity(ST) * sizeof(uint32_t)));
1060 }
1061}
1062
// This is the equivalent of EmitProgramInfoSI above, but for when the OS type
// is AMDPAL. It stores each compute/SPI register setting and other PAL
// metadata items into the PALMD::Metadata, combining with any provided by the
// frontend as LLVM metadata. Once all functions are written, the PAL metadata
// is then written as a single block in the .note section.
void AMDGPUAsmPrinter::EmitPALMetadata(const MachineFunction &MF,
                                       const SIProgramInfo &CurrentProgramInfo) {
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  auto CC = MF.getFunction().getCallingConv();
  auto MD = getTargetStreamer()->getPALMetadata();

  MD->setEntryPoint(CC, Name: MF.getFunction().getName());
  MD->setNumUsedVgprs(CC, Val: CurrentProgramInfo.NumVGPRsForWavesPerEU);

  // Only set AGPRs for supported devices
  const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
  if (STM.hasMAIInsts()) {
    MD->setNumUsedAgprs(CC, Val: CurrentProgramInfo.NumAccVGPR);
  }

  MD->setNumUsedSgprs(CC, Val: CurrentProgramInfo.NumSGPRsForWavesPerEU);
  // PAL metadata before major version 3 stores raw RSRC1/RSRC2 register
  // values; 3.0+ uses named per-hardware-stage fields instead.
  if (MD->getPALMajorVersion() < 3) {
    MD->setRsrc1(CC, Val: CurrentProgramInfo.getPGMRSrc1(CC, ST: STM));
    if (AMDGPU::isCompute(CC)) {
      MD->setRsrc2(CC, Val: CurrentProgramInfo.getComputePGMRSrc2());
    } else {
      // Graphics stages only record the scratch-enable bit in RSRC2.
      if (CurrentProgramInfo.ScratchBlocks > 0)
        MD->setRsrc2(CC, S_00B84C_SCRATCH_EN(1));
    }
  } else {
    MD->setHwStage(CC, field: ".debug_mode", Val: (bool)CurrentProgramInfo.DebugMode);
    MD->setHwStage(CC, field: ".scratch_en", Val: (bool)CurrentProgramInfo.ScratchEnable);
    EmitPALMetadataCommon(MD, CurrentProgramInfo, CC, ST: STM);
  }

  // ScratchSize is in bytes, 16 aligned.
  MD->setScratchSize(CC, Val: alignTo(Value: CurrentProgramInfo.ScratchSize, Align: 16));
  if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) {
    // On GFX11+ the extra LDS size is counted in pairs of LDS blocks, hence
    // the divideCeil by 2.
    unsigned ExtraLDSSize = STM.getGeneration() >= AMDGPUSubtarget::GFX11
                                ? divideCeil(Numerator: CurrentProgramInfo.LDSBlocks, Denominator: 2)
                                : CurrentProgramInfo.LDSBlocks;
    if (MD->getPALMajorVersion() < 3) {
      MD->setRsrc2(CC, S_00B02C_EXTRA_LDS_SIZE(ExtraLDSSize));
      MD->setSpiPsInputEna(MFI->getPSInputEnable());
      MD->setSpiPsInputAddr(MFI->getPSInputAddr());
    } else {
      // Graphics registers
      const unsigned ExtraLdsDwGranularity =
          STM.getGeneration() >= AMDGPUSubtarget::GFX11 ? 256 : 128;
      MD->setGraphicsRegisters(
          field: ".ps_extra_lds_size",
          Val: (unsigned)(ExtraLDSSize * ExtraLdsDwGranularity * sizeof(uint32_t)));

      // Set PsInputEna and PsInputAddr .spi_ps_input_ena and .spi_ps_input_addr
      // One named boolean field per bit of SPI_PS_INPUT_ENA / _ADDR, in bit
      // order (index 0 is the lowest bit).
      static StringLiteral const PsInputFields[] = {
          ".persp_sample_ena", ".persp_center_ena",
          ".persp_centroid_ena", ".persp_pull_model_ena",
          ".linear_sample_ena", ".linear_center_ena",
          ".linear_centroid_ena", ".line_stipple_tex_ena",
          ".pos_x_float_ena", ".pos_y_float_ena",
          ".pos_z_float_ena", ".pos_w_float_ena",
          ".front_face_ena", ".ancillary_ena",
          ".sample_coverage_ena", ".pos_fixed_pt_ena"};
      unsigned PSInputEna = MFI->getPSInputEnable();
      unsigned PSInputAddr = MFI->getPSInputAddr();
      for (auto [Idx, Field] : enumerate(First: PsInputFields)) {
        MD->setGraphicsRegisters(field1: ".spi_ps_input_ena", field2: Field,
                                 Val: (bool)((PSInputEna >> Idx) & 1));
        MD->setGraphicsRegisters(field1: ".spi_ps_input_addr", field2: Field,
                                 Val: (bool)((PSInputAddr >> Idx) & 1));
      }
    }
  }

  // For version 3 and above the wave front size is already set in the metadata
  if (MD->getPALMajorVersion() < 3 && STM.isWave32())
    MD->setWave32(MF.getFunction().getCallingConv());
}
1141
1142void AMDGPUAsmPrinter::emitPALFunctionMetadata(const MachineFunction &MF) {
1143 auto *MD = getTargetStreamer()->getPALMetadata();
1144 const MachineFrameInfo &MFI = MF.getFrameInfo();
1145 StringRef FnName = MF.getFunction().getName();
1146 MD->setFunctionScratchSize(FnName, Val: MFI.getStackSize());
1147 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1148
1149 if (MD->getPALMajorVersion() < 3) {
1150 // Set compute registers
1151 MD->setRsrc1(CC: CallingConv::AMDGPU_CS,
1152 Val: CurrentProgramInfo.getPGMRSrc1(CC: CallingConv::AMDGPU_CS, ST));
1153 MD->setRsrc2(CC: CallingConv::AMDGPU_CS,
1154 Val: CurrentProgramInfo.getComputePGMRSrc2());
1155 } else {
1156 EmitPALMetadataCommon(MD, CurrentProgramInfo, CC: CallingConv::AMDGPU_CS, ST);
1157 }
1158
1159 // Set optional info
1160 MD->setFunctionLdsSize(FnName, Val: CurrentProgramInfo.LDSSize);
1161 MD->setFunctionNumUsedVgprs(FnName, Val: CurrentProgramInfo.NumVGPRsForWavesPerEU);
1162 MD->setFunctionNumUsedSgprs(FnName, Val: CurrentProgramInfo.NumSGPRsForWavesPerEU);
1163}
1164
1165// This is supposed to be log2(Size)
1166static amd_element_byte_size_t getElementByteSizeValue(unsigned Size) {
1167 switch (Size) {
1168 case 4:
1169 return AMD_ELEMENT_4_BYTES;
1170 case 8:
1171 return AMD_ELEMENT_8_BYTES;
1172 case 16:
1173 return AMD_ELEMENT_16_BYTES;
1174 default:
1175 llvm_unreachable("invalid private_element_size");
1176 }
1177}
1178
1179void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out,
1180 const SIProgramInfo &CurrentProgramInfo,
1181 const MachineFunction &MF) const {
1182 const Function &F = MF.getFunction();
1183 assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
1184 F.getCallingConv() == CallingConv::SPIR_KERNEL);
1185
1186 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1187 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
1188
1189 AMDGPU::initDefaultAMDKernelCodeT(Out, &STM);
1190
1191 Out.compute_pgm_resource_registers =
1192 CurrentProgramInfo.getComputePGMRSrc1(ST: STM) |
1193 (CurrentProgramInfo.getComputePGMRSrc2() << 32);
1194 Out.code_properties |= AMD_CODE_PROPERTY_IS_PTR64;
1195
1196 if (CurrentProgramInfo.DynamicCallStack)
1197 Out.code_properties |= AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK;
1198
1199 AMD_HSA_BITS_SET(Out.code_properties,
1200 AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE,
1201 getElementByteSizeValue(STM.getMaxPrivateElementSize(true)));
1202
1203 const GCNUserSGPRUsageInfo &UserSGPRInfo = MFI->getUserSGPRInfo();
1204 if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
1205 Out.code_properties |=
1206 AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER;
1207 }
1208
1209 if (UserSGPRInfo.hasDispatchPtr())
1210 Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
1211
1212 if (UserSGPRInfo.hasQueuePtr() && CodeObjectVersion < AMDGPU::AMDHSA_COV5)
1213 Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR;
1214
1215 if (UserSGPRInfo.hasKernargSegmentPtr())
1216 Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR;
1217
1218 if (UserSGPRInfo.hasDispatchID())
1219 Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID;
1220
1221 if (UserSGPRInfo.hasFlatScratchInit())
1222 Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT;
1223
1224 if (UserSGPRInfo.hasDispatchPtr())
1225 Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
1226
1227 if (STM.isXNACKEnabled())
1228 Out.code_properties |= AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED;
1229
1230 Align MaxKernArgAlign;
1231 Out.kernarg_segment_byte_size = STM.getKernArgSegmentSize(F, MaxKernArgAlign);
1232 Out.wavefront_sgpr_count = CurrentProgramInfo.NumSGPR;
1233 Out.workitem_vgpr_count = CurrentProgramInfo.NumVGPR;
1234 Out.workitem_private_segment_byte_size = CurrentProgramInfo.ScratchSize;
1235 Out.workgroup_group_segment_byte_size = CurrentProgramInfo.LDSSize;
1236
1237 // kernarg_segment_alignment is specified as log of the alignment.
1238 // The minimum alignment is 16.
1239 // FIXME: The metadata treats the minimum as 4?
1240 Out.kernarg_segment_alignment = Log2(A: std::max(a: Align(16), b: MaxKernArgAlign));
1241}
1242
1243bool AMDGPUAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
1244 const char *ExtraCode, raw_ostream &O) {
1245 // First try the generic code, which knows about modifiers like 'c' and 'n'.
1246 if (!AsmPrinter::PrintAsmOperand(MI, OpNo, ExtraCode, OS&: O))
1247 return false;
1248
1249 if (ExtraCode && ExtraCode[0]) {
1250 if (ExtraCode[1] != 0)
1251 return true; // Unknown modifier.
1252
1253 switch (ExtraCode[0]) {
1254 case 'r':
1255 break;
1256 default:
1257 return true;
1258 }
1259 }
1260
1261 // TODO: Should be able to support other operand types like globals.
1262 const MachineOperand &MO = MI->getOperand(i: OpNo);
1263 if (MO.isReg()) {
1264 AMDGPUInstPrinter::printRegOperand(RegNo: MO.getReg(), O,
1265 MRI: *MF->getSubtarget().getRegisterInfo());
1266 return false;
1267 } else if (MO.isImm()) {
1268 int64_t Val = MO.getImm();
1269 if (AMDGPU::isInlinableIntLiteral(Literal: Val)) {
1270 O << Val;
1271 } else if (isUInt<16>(x: Val)) {
1272 O << format(Fmt: "0x%" PRIx16, Vals: static_cast<uint16_t>(Val));
1273 } else if (isUInt<32>(x: Val)) {
1274 O << format(Fmt: "0x%" PRIx32, Vals: static_cast<uint32_t>(Val));
1275 } else {
1276 O << format(Fmt: "0x%" PRIx64, Vals: static_cast<uint64_t>(Val));
1277 }
1278 return false;
1279 }
1280 return true;
1281}
1282
void AMDGPUAsmPrinter::getAnalysisUsage(AnalysisUsage &AU) const {
  // Resource usage (register counts etc.) comes from a separate analysis
  // pass; require it so emission can query its results, and preserve it
  // since printing does not modify the machine function.
  AU.addRequired<AMDGPUResourceUsageAnalysis>();
  AU.addPreserved<AMDGPUResourceUsageAnalysis>();
  AsmPrinter::getAnalysisUsage(AU);
}
1288
// Emit the per-kernel resource usage (registers, scratch, occupancy, spills,
// LDS) as a series of "kernel-resource-usage" optimization remarks, one
// remark per line of output.
void AMDGPUAsmPrinter::emitResourceUsageRemarks(
    const MachineFunction &MF, const SIProgramInfo &CurrentProgramInfo,
    bool isModuleEntryFunction, bool hasMAIInsts) {
  if (!ORE)
    return;

  const char *Name = "kernel-resource-usage";
  const char *Indent = "    ";

  // If the remark is not specifically enabled, do not output to yaml
  LLVMContext &Ctx = MF.getFunction().getContext();
  if (!Ctx.getDiagHandlerPtr()->isAnalysisRemarkEnabled(PassName: Name))
    return;

  // Emits one remark of the form "<label>: <value>", indented for every
  // line except the one carrying the kernel name.
  auto EmitResourceUsageRemark = [&](StringRef RemarkName,
                                     StringRef RemarkLabel, auto Argument) {
    // Add an indent for every line besides the line with the kernel name. This
    // makes it easier to tell which resource usage go with which kernel since
    // the kernel name will always be displayed first.
    std::string LabelStr = RemarkLabel.str() + ": ";
    if (!RemarkName.equals(RHS: "FunctionName"))
      LabelStr = Indent + LabelStr;

    ORE->emit([&]() {
      return MachineOptimizationRemarkAnalysis(Name, RemarkName,
                                               MF.getFunction().getSubprogram(),
                                               &MF.front())
             << LabelStr << ore::NV(RemarkName, Argument);
    });
  };

  // FIXME: Formatting here is pretty nasty because clang does not accept
  // newlines from diagnostics. This forces us to emit multiple diagnostic
  // remarks to simulate newlines. If and when clang does accept newlines, this
  // formatting should be aggregated into one remark with newlines to avoid
  // printing multiple diagnostic location and diag opts.
  EmitResourceUsageRemark("FunctionName", "Function Name",
                          MF.getFunction().getName());
  EmitResourceUsageRemark("NumSGPR", "SGPRs", CurrentProgramInfo.NumSGPR);
  EmitResourceUsageRemark("NumVGPR", "VGPRs", CurrentProgramInfo.NumArchVGPR);
  // AGPRs only exist on subtargets with MAI instructions.
  if (hasMAIInsts)
    EmitResourceUsageRemark("NumAGPR", "AGPRs", CurrentProgramInfo.NumAccVGPR);
  EmitResourceUsageRemark("ScratchSize", "ScratchSize [bytes/lane]",
                          CurrentProgramInfo.ScratchSize);
  StringRef DynamicStackStr =
      CurrentProgramInfo.DynamicCallStack ? "True" : "False";
  EmitResourceUsageRemark("DynamicStack", "Dynamic Stack", DynamicStackStr);
  EmitResourceUsageRemark("Occupancy", "Occupancy [waves/SIMD]",
                          CurrentProgramInfo.Occupancy);
  EmitResourceUsageRemark("SGPRSpill", "SGPRs Spill",
                          CurrentProgramInfo.SGPRSpill);
  EmitResourceUsageRemark("VGPRSpill", "VGPRs Spill",
                          CurrentProgramInfo.VGPRSpill);
  // LDS size is only meaningful for module entry functions.
  if (isModuleEntryFunction)
    EmitResourceUsageRemark("BytesLDS", "LDS Size [bytes/block]",
                            CurrentProgramInfo.LDSSize);
}
1346

source code of llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp