//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUTargetMachine.h"
#include "GCNSubtarget.h"
#include "R600Subtarget.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#undef AMDGPUSubtarget

static cl::opt<bool> EnablePowerSched(
    "amdgpu-enable-power-sched",
    cl::desc("Enable scheduling to minimize mAI power bursts"),
    cl::init(false));

static cl::opt<bool> EnableVGPRIndexMode(
    "amdgpu-vgpr-index-mode",
    cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
    cl::init(false));

static cl::opt<bool> UseAA("amdgpu-use-aa-in-codegen",
                           cl::desc("Enable the use of AA during codegen."),
                           cl::init(true));

static cl::opt<unsigned>
    NSAThreshold("amdgpu-nsa-threshold",
                 cl::desc("Number of addresses from which to enable MIMG NSA."),
                 cl::init(3), cl::Hidden);

GCNSubtarget::~GCNSubtarget() = default;

GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled

  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,");

  // Turn on features that HSA ABI requires. Also turn on FlatForGlobal by default
  if (isAmdHsaOS())
    FullFS += "+flat-for-global,+unaligned-access-mode,+trap-handler,";

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS

  // Disable mutually exclusive bits.
  if (FS.contains_insensitive("+wavefrontsize")) {
    if (!FS.contains_insensitive("wavefrontsize16"))
      FullFS += "-wavefrontsize16,";
    if (!FS.contains_insensitive("wavefrontsize32"))
      FullFS += "-wavefrontsize32,";
    if (!FS.contains_insensitive("wavefrontsize64"))
      FullFS += "-wavefrontsize64,";
  }

  FullFS += FS;

  ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);

  // Implement the "generic" processors, which act as the default when no
  // generation features are enabled (e.g. for -mcpu=''). HSA OS defaults to
  // the first amdgcn target that supports flat addressing. Other OSes default
  // to the first amdgcn target.
  if (Gen == AMDGPUSubtarget::INVALID) {
    Gen = TT.getOS() == Triple::AMDHSA ? AMDGPUSubtarget::SEA_ISLANDS
                                       : AMDGPUSubtarget::SOUTHERN_ISLANDS;
  }

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Targets must support 64-bit offsets for MUBUF instructions and/or flat
  // operations; otherwise they cannot access a 64-bit global address space.
  assert(hasAddr64() || hasFlat());
  // Unless +-flat-for-global is specified, turn on FlatForGlobal for targets
  // that do not support ADDR64 variants of MUBUF instructions. Such targets
  // cannot use a 64 bit offset with a MUBUF instruction to access the global
  // address space
  if (!hasAddr64() && !FS.contains("flat-for-global") && !FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = true;
  }
  // Unless +-flat-for-global is specified, use MUBUF instructions for global
  // address space access if flat operations are not available.
  if (!hasFlat() && !FS.contains("flat-for-global") && FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = false;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  AddressableLocalMemorySize = LocalMemorySize;

  if (AMDGPU::isGFX10Plus(*this) &&
      !getFeatureBits().test(AMDGPU::FeatureCuMode))
    LocalMemorySize *= 2;

  // Don't crash on invalid devices.
  if (WavefrontSizeLog2 == 0)
    WavefrontSizeLog2 = 5;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;
  HasSMulHi = getGeneration() >= AMDGPUSubtarget::GFX9;

  TargetID.setTargetIDFromFeaturesString(FS);

  LLVM_DEBUG(dbgs() << "xnack setting for subtarget: "
                    << TargetID.getXnackSetting() << '\n');
  LLVM_DEBUG(dbgs() << "sramecc setting for subtarget: "
                    << TargetID.getSramEccSetting() << '\n');

  return *this;
}

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) : TargetTriple(TT) {}

bool AMDGPUSubtarget::useRealTrue16Insts() const {
  return hasTrue16BitInsts() && EnableRealTrue16Insts;
}

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM)
    : // clang-format off
    AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    TargetID(*this),
    InstrItins(getInstrItineraryForCPU(GPU)),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  // clang-format on
  MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
  EUsPerCU = AMDGPU::IsaInfo::getEUsPerCU(this);
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*this));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}

unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
  if (getGeneration() < GFX10)
    return 1;

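  // The 64-bit shift opcodes listed below are the exception on GFX10+: for
  // them the constant bus limit stays at 1, i.e. only a single SGPR or
  // literal operand is allowed.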
  switch (Opcode) {
  case AMDGPU::V_LSHLREV_B64_e64:
  case AMDGPU::V_LSHLREV_B64_gfx10:
  case AMDGPU::V_LSHLREV_B64_e64_gfx11:
  case AMDGPU::V_LSHLREV_B64_e32_gfx12:
  case AMDGPU::V_LSHLREV_B64_e64_gfx12:
  case AMDGPU::V_LSHL_B64_e64:
  case AMDGPU::V_LSHRREV_B64_e64:
  case AMDGPU::V_LSHRREV_B64_gfx10:
  case AMDGPU::V_LSHRREV_B64_e64_gfx11:
  case AMDGPU::V_LSHRREV_B64_e64_gfx12:
  case AMDGPU::V_LSHR_B64_e64:
  case AMDGPU::V_ASHRREV_I64_e64:
  case AMDGPU::V_ASHRREV_I64_gfx10:
  case AMDGPU::V_ASHRREV_I64_e64_gfx11:
  case AMDGPU::V_ASHRREV_I64_e64_gfx12:
  case AMDGPU::V_ASHR_I64_e64:
    return 1;
  }

  return 2;
}

/// This list was mostly derived from experimentation.
bool GCNSubtarget::zeroesHigh16BitsOfDest(unsigned Opcode) const {
  switch (Opcode) {
  case AMDGPU::V_CVT_F16_F32_e32:
  case AMDGPU::V_CVT_F16_F32_e64:
  case AMDGPU::V_CVT_F16_U16_e32:
  case AMDGPU::V_CVT_F16_U16_e64:
  case AMDGPU::V_CVT_F16_I16_e32:
  case AMDGPU::V_CVT_F16_I16_e64:
  case AMDGPU::V_RCP_F16_e64:
  case AMDGPU::V_RCP_F16_e32:
  case AMDGPU::V_RSQ_F16_e64:
  case AMDGPU::V_RSQ_F16_e32:
  case AMDGPU::V_SQRT_F16_e64:
  case AMDGPU::V_SQRT_F16_e32:
  case AMDGPU::V_LOG_F16_e64:
  case AMDGPU::V_LOG_F16_e32:
  case AMDGPU::V_EXP_F16_e64:
  case AMDGPU::V_EXP_F16_e32:
  case AMDGPU::V_SIN_F16_e64:
  case AMDGPU::V_SIN_F16_e32:
  case AMDGPU::V_COS_F16_e64:
  case AMDGPU::V_COS_F16_e32:
  case AMDGPU::V_FLOOR_F16_e64:
  case AMDGPU::V_FLOOR_F16_e32:
  case AMDGPU::V_CEIL_F16_e64:
  case AMDGPU::V_CEIL_F16_e32:
  case AMDGPU::V_TRUNC_F16_e64:
  case AMDGPU::V_TRUNC_F16_e32:
  case AMDGPU::V_RNDNE_F16_e64:
  case AMDGPU::V_RNDNE_F16_e32:
  case AMDGPU::V_FRACT_F16_e64:
  case AMDGPU::V_FRACT_F16_e32:
  case AMDGPU::V_FREXP_MANT_F16_e64:
  case AMDGPU::V_FREXP_MANT_F16_e32:
  case AMDGPU::V_FREXP_EXP_I16_F16_e64:
  case AMDGPU::V_FREXP_EXP_I16_F16_e32:
  case AMDGPU::V_LDEXP_F16_e64:
  case AMDGPU::V_LDEXP_F16_e32:
  case AMDGPU::V_LSHLREV_B16_e64:
  case AMDGPU::V_LSHLREV_B16_e32:
  case AMDGPU::V_LSHRREV_B16_e64:
  case AMDGPU::V_LSHRREV_B16_e32:
  case AMDGPU::V_ASHRREV_I16_e64:
  case AMDGPU::V_ASHRREV_I16_e32:
  case AMDGPU::V_ADD_U16_e64:
  case AMDGPU::V_ADD_U16_e32:
  case AMDGPU::V_SUB_U16_e64:
  case AMDGPU::V_SUB_U16_e32:
  case AMDGPU::V_SUBREV_U16_e64:
  case AMDGPU::V_SUBREV_U16_e32:
  case AMDGPU::V_MUL_LO_U16_e64:
  case AMDGPU::V_MUL_LO_U16_e32:
  case AMDGPU::V_ADD_F16_e64:
  case AMDGPU::V_ADD_F16_e32:
  case AMDGPU::V_SUB_F16_e64:
  case AMDGPU::V_SUB_F16_e32:
  case AMDGPU::V_SUBREV_F16_e64:
  case AMDGPU::V_SUBREV_F16_e32:
  case AMDGPU::V_MUL_F16_e64:
  case AMDGPU::V_MUL_F16_e32:
  case AMDGPU::V_MAX_F16_e64:
  case AMDGPU::V_MAX_F16_e32:
  case AMDGPU::V_MIN_F16_e64:
  case AMDGPU::V_MIN_F16_e32:
  case AMDGPU::V_MAX_U16_e64:
  case AMDGPU::V_MAX_U16_e32:
  case AMDGPU::V_MIN_U16_e64:
  case AMDGPU::V_MIN_U16_e32:
  case AMDGPU::V_MAX_I16_e64:
  case AMDGPU::V_MAX_I16_e32:
  case AMDGPU::V_MIN_I16_e64:
  case AMDGPU::V_MIN_I16_e32:
  case AMDGPU::V_MAD_F16_e64:
  case AMDGPU::V_MAD_U16_e64:
  case AMDGPU::V_MAD_I16_e64:
  case AMDGPU::V_FMA_F16_e64:
  case AMDGPU::V_DIV_FIXUP_F16_e64:
    // On gfx10, all 16-bit instructions preserve the high bits.
    return getGeneration() <= AMDGPUSubtarget::GFX9;
  case AMDGPU::V_MADAK_F16:
  case AMDGPU::V_MADMK_F16:
  case AMDGPU::V_MAC_F16_e64:
  case AMDGPU::V_MAC_F16_e32:
  case AMDGPU::V_FMAMK_F16:
  case AMDGPU::V_FMAAK_F16:
  case AMDGPU::V_FMAC_F16_e64:
  case AMDGPU::V_FMAC_F16_e32:
    // In gfx9, the preferred handling of the unused high 16-bits changed. Most
    // instructions maintain the legacy behavior of zeroing them. Some
    // instructions changed to preserving the high bits.
    return getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
  case AMDGPU::V_MAD_MIXLO_F16:
  case AMDGPU::V_MAD_MIXHI_F16:
  default:
    return false;
  }
}

// Returns the maximum per-workgroup LDS allocation size (in bytes) that still
// allows the given function to achieve an occupancy of NWaves waves per
// SIMD / EU, taking into account only the function's *maximum* workgroup size.
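//
// A rough worked example (illustrative numbers only, not tied to a particular
// subtarget): with wave64, a maximum workgroup size of 256 and 4 EUs per CU,
// asking for NWaves = 8 gives WavesPerWorkgroup = 4 and
// WorkGroupsPerCU = (8 * 4) / 4 = 8, so each workgroup may use at most
// LocalMemorySize / 8 bytes of LDS.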
unsigned
AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
                                                 const Function &F) const {
  const unsigned WaveSize = getWavefrontSize();
  const unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  const unsigned WavesPerWorkgroup =
      std::max(1u, (WorkGroupSize + WaveSize - 1) / WaveSize);

  const unsigned WorkGroupsPerCU =
      std::max(1u, (NWaves * getEUsPerCU()) / WavesPerWorkgroup);

  return getLocalMemorySize() / WorkGroupsPerCU;
}

// FIXME: Should return min,max range.
//
// Returns the maximum occupancy, in number of waves per SIMD / EU, that can
// be achieved when only the given function is running on the machine; and
// taking into account the overall number of wave slots, the (maximum) workgroup
// size, and the per-workgroup LDS allocation size.
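//
// Illustrative example (hypothetical numbers): with 65536 bytes of LDS per CU,
// a request of 16384 bytes per workgroup limits NumGroups to 4; with a
// 256-lane workgroup and wave64 that is 4 * 4 = 16 waves per CU, and with
// 4 EUs per CU divideCeil(16, 4) = 4 waves per EU, before clamping to
// getMaxWavesPerEU().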
unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second;
  const unsigned MaxWorkGroupsPerCu = getMaxWorkGroupsPerCU(MaxWorkGroupSize);
  if (!MaxWorkGroupsPerCu)
    return 0;

  const unsigned WaveSize = getWavefrontSize();

  // FIXME: Do we need to account for alignment requirement of LDS rounding the
  // size up?
  // Compute restriction based on LDS usage
  unsigned NumGroups = getLocalMemorySize() / (Bytes ? Bytes : 1u);

  // This can be queried with more LDS than is possible, so just assume the
  // worst.
  if (NumGroups == 0)
    return 1;

  NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);

  // Round to the number of waves per CU.
  const unsigned MaxGroupNumWaves = divideCeil(MaxWorkGroupSize, WaveSize);
  unsigned MaxWaves = NumGroups * MaxGroupNumWaves;

  // Number of waves per EU (SIMD).
  MaxWaves = divideCeil(MaxWaves, getEUsPerCU());

  // Clamp to the maximum possible number of waves.
  MaxWaves = std::min(MaxWaves, getMaxWavesPerEU());

  // FIXME: Needs to be a multiple of the group size?
  //MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves);

  assert(MaxWaves > 0 && MaxWaves <= getMaxWavesPerEU() &&
         "computed invalid occupancy");
  return MaxWaves;
}

unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::pair(1, getWavefrontSize());
  default:
    return std::pair(1u, getMaxFlatWorkGroupSize());
  }
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
    const Function &F) const {
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
      getDefaultFlatWorkGroupSize(F.getCallingConv());

  // Requested minimum/maximum flat work group sizes.
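  // For example, a kernel carrying the IR attribute
  //   "amdgpu-flat-work-group-size"="128,256"
  // requests a minimum of 128 and a maximum of 256 lanes per workgroup;
  // values that violate the checks below fall back to the defaults.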
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
      F, "amdgpu-flat-work-group-size", Default);

  // Make sure requested minimum is less than requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getEffectiveWavesPerEU(
    std::pair<unsigned, unsigned> Requested,
    std::pair<unsigned, unsigned> FlatWorkGroupSizes) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // If minimum/maximum flat work group sizes were explicitly requested using
  // the "amdgpu-flat-work-group-size" attribute, then set default
  // minimum/maximum number of waves per execution unit to values implied by
  // the requested minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
      getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
  Default.first = MinImpliedByFlatWorkGroupSize;

  // Make sure requested minimum is less than requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
    const Function &F, std::pair<unsigned, unsigned> FlatWorkGroupSizes) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Requested minimum/maximum number of waves per execution unit.
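  // The attribute takes either "min" or "min,max", e.g.
  //   "amdgpu-waves-per-eu"="2,4"
  // requests between 2 and 4 waves per EU; the maximum is optional, hence
  // /*OnlyFirstRequired=*/true below.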
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
      F, "amdgpu-waves-per-eu", Default, /*OnlyFirstRequired=*/true);
  return getEffectiveWavesPerEU(Requested, FlatWorkGroupSizes);
}

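// The !reqd_work_group_size metadata, when present, is expected to carry one
// i32 per dimension, e.g. !{i32 64, i32 1, i32 1}; any other shape is treated
// as unknown.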
static unsigned getReqdWorkGroupSize(const Function &Kernel, unsigned Dim) {
  auto Node = Kernel.getMetadata("reqd_work_group_size");
  if (Node && Node->getNumOperands() == 3)
    return mdconst::extract<ConstantInt>(Node->getOperand(Dim))->getZExtValue();
  return std::numeric_limits<unsigned>::max();
}

bool AMDGPUSubtarget::isMesaKernel(const Function &F) const {
  return isMesa3DOS() && !AMDGPU::isShader(F.getCallingConv());
}

unsigned AMDGPUSubtarget::getMaxWorkitemID(const Function &Kernel,
                                           unsigned Dimension) const {
  unsigned ReqdSize = getReqdWorkGroupSize(Kernel, Dimension);
  if (ReqdSize != std::numeric_limits<unsigned>::max())
    return ReqdSize - 1;
  return getFlatWorkGroupSizes(Kernel).second - 1;
}

bool AMDGPUSubtarget::isSingleLaneExecution(const Function &Func) const {
  for (int I = 0; I < 3; ++I) {
    if (getMaxWorkitemID(Func, I) > 0)
      return false;
  }

  return true;
}

bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        [[fallthrough]];
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        [[fallthrough]];
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        [[fallthrough]];
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }

      if (Dim <= 3) {
        unsigned ReqdSize = getReqdWorkGroupSize(*Kernel, Dim);
        if (ReqdSize != std::numeric_limits<unsigned>::max())
          MinSize = MaxSize = ReqdSize;
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
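  // For example, with a required size of 64 in this dimension, a workitem-id
  // query gets !range !{i32 0, i32 64} while a local-size query gets
  // !range !{i32 64, i32 65}.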
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

unsigned AMDGPUSubtarget::getImplicitArgNumBytes(const Function &F) const {
  assert(AMDGPU::isKernel(F.getCallingConv()));

  // We don't allocate the segment if we know the implicit arguments weren't
  // used, even if the ABI implies we need them.
  if (F.hasFnAttribute("amdgpu-no-implicitarg-ptr"))
    return 0;

  if (isMesaKernel(F))
    return 16;

  // Assume all implicit inputs are used by default
  const Module *M = F.getParent();
  unsigned NBytes =
      AMDGPU::getAMDHSACodeObjectVersion(*M) >= AMDGPU::AMDHSA_COV5 ? 256 : 56;
  return F.getFnAttributeAsParsedInteger("amdgpu-implicitarg-num-bytes",
                                         NBytes);
}

uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 Align &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = Align(1);

  for (const Argument &Arg : F.args()) {
    const bool IsByRef = Arg.hasByRefAttr();
    Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
    Align Alignment = DL.getValueOrABITypeAlignment(
        IsByRef ? Arg.getParamAlign() : std::nullopt, ArgTy);
    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
    MaxAlign = std::max(MaxAlign, Alignment);
  }

  return ExplicitArgBytes;
}

unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                Align &MaxAlign) const {
  if (F.getCallingConv() != CallingConv::AMDGPU_KERNEL &&
      F.getCallingConv() != CallingConv::SPIR_KERNEL)
    return 0;

  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset();

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    const Align Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
    MaxAlign = std::max(MaxAlign, Alignment);
  }
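
  // Illustrative example (numbers assumed, not target-specific): with 36 bytes
  // of explicit args and an 8-byte implicit-arg alignment, the implicit block
  // starts at alignTo(36, 8) = 40 bytes, and the final size below is rounded
  // up to a multiple of 4.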

  // Being able to dereference past the end is useful for emitting scalar loads.
  return alignTo(TotalSize, 4);
}

AMDGPUDwarfFlavour AMDGPUSubtarget::getAMDGPUDwarfFlavour() const {
  return getWavefrontSize() == 32 ? AMDGPUDwarfFlavour::Wave32
                                  : AMDGPUDwarfFlavour::Wave64;
}

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit()
  Policy.ShouldTrackPressure = true;

  // Enabling both top-down and bottom-up scheduling seems to give us fewer
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

void GCNSubtarget::mirFileLoaded(MachineFunction &MF) const {
  if (isWave32()) {
    // Fix implicit $vcc operands after MIParser has verified that they match
    // the instruction definitions.
    for (auto &MBB : MF) {
      for (auto &MI : MBB)
        InstrInfo.fixImplicitOperands(MI);
    }
  }
}

bool GCNSubtarget::hasMadF16() const {
  return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16_e64) != -1;
}

bool GCNSubtarget::useVGPRIndexMode() const {
  return !hasMovrel() || (EnableVGPRIndexMode && hasVGPRIndexMode());
}

bool GCNSubtarget::useAA() const { return UseAA; }

unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return getMaxWavesPerEU();

  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned NumVGPRs) const {
  return AMDGPU::IsaInfo::getNumWavesPerEUWithNumVGPRs(this, NumVGPRs);
}

unsigned
GCNSubtarget::getBaseReservedNumSGPRs(const bool HasFlatScratch) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.

  if (HasFlatScratch || HasArchitectedFlatScratch) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseReservedNumSGPRs(MFI.getUserSGPRInfo().hasFlatScratchInit());
}

unsigned GCNSubtarget::getReservedNumSGPRs(const Function &F) const {
  // In principle we do not need to reserve SGPR pair used for flat_scratch if
  // we know flat instructions do not access the stack anywhere in the
  // program. For now assume it's needed if we have flat instructions.
  const bool KernelUsesFlatScratch = hasFlatAddressSpace();
  return getBaseReservedNumSGPRs(KernelUsesFlatScratch);
}

unsigned GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
                                        unsigned NumSGPRs,
                                        unsigned NumVGPRs) const {
  unsigned Occupancy =
      std::min(getMaxWavesPerEU(),
               getOccupancyWithLocalMemSize(LDSSize, F));
  if (NumSGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
  if (NumVGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
  return Occupancy;
}

unsigned GCNSubtarget::getBaseMaxNumSGPRs(
    const Function &F, std::pair<unsigned, unsigned> WavesPerEU,
    unsigned PreloadedSGPRs, unsigned ReservedNumSGPRs) const {
  // Compute maximum number of SGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  unsigned MaxNumSGPRs =
      getMaxNumSGPRs(WavesPerEU.first, /*Addressable=*/false);
  unsigned MaxAddressableNumSGPRs =
      getMaxNumSGPRs(WavesPerEU.first, /*Addressable=*/true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested =
        F.getFnAttributeAsParsedInteger("amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && (Requested <= ReservedNumSGPRs))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = PreloadedSGPRs;
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - ReservedNumSGPRs, MaxAddressableNumSGPRs);
}

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseMaxNumSGPRs(F, MFI.getWavesPerEU(), MFI.getNumPreloadedSGPRs(),
                            getReservedNumSGPRs(MF));
}

static unsigned getMaxNumPreloadedSGPRs() {
  using USI = GCNUserSGPRUsageInfo;
  // Max number of user SGPRs
  const unsigned MaxUserSGPRs =
      USI::getNumUserSGPRForField(USI::PrivateSegmentBufferID) +
      USI::getNumUserSGPRForField(USI::DispatchPtrID) +
      USI::getNumUserSGPRForField(USI::QueuePtrID) +
      USI::getNumUserSGPRForField(USI::KernargSegmentPtrID) +
      USI::getNumUserSGPRForField(USI::DispatchIdID) +
      USI::getNumUserSGPRForField(USI::FlatScratchInitID) +
      USI::getNumUserSGPRForField(USI::ImplicitBufferPtrID);

  // Max number of system SGPRs
  const unsigned MaxSystemSGPRs = 1 + // WorkGroupIDX
                                  1 + // WorkGroupIDY
                                  1 + // WorkGroupIDZ
                                  1 + // WorkGroupInfo
                                  1;  // private segment wave byte offset

  // Max number of synthetic SGPRs
  const unsigned SyntheticSGPRs = 1; // LDSKernelId

  return MaxUserSGPRs + MaxSystemSGPRs + SyntheticSGPRs;
}

unsigned GCNSubtarget::getMaxNumSGPRs(const Function &F) const {
  return getBaseMaxNumSGPRs(F, getWavesPerEU(F), getMaxNumPreloadedSGPRs(),
                            getReservedNumSGPRs(F));
}

unsigned GCNSubtarget::getBaseMaxNumVGPRs(
    const Function &F, std::pair<unsigned, unsigned> WavesPerEU) const {
  // Compute maximum number of VGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if maximum number of VGPRs was explicitly requested using
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested =
        F.getFnAttributeAsParsedInteger("amdgpu-num-vgpr", MaxNumVGPRs);

    if (hasGFX90AInsts())
      Requested *= 2;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}

unsigned GCNSubtarget::getMaxNumVGPRs(const Function &F) const {
  return getBaseMaxNumVGPRs(F, getWavesPerEU(F));
}

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseMaxNumVGPRs(F, MFI.getWavesPerEU());
}

void GCNSubtarget::adjustSchedDependency(
    SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx, SDep &Dep,
    const TargetSchedModel *SchedModel) const {
  if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() ||
      !Def->isInstr() || !Use->isInstr())
    return;

  MachineInstr *DefI = Def->getInstr();
  MachineInstr *UseI = Use->getInstr();

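  // When either end of the dependence is a bundle, approximate the latency by
  // walking the bundled instructions: start from the latency of the defining
  // instruction and discount one cycle for each bundled instruction issued
  // between the def and the use of the register.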
  if (DefI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(DefI->getIterator());
    MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end());
    unsigned Lat = 0;
    for (++I; I != E && I->isBundledWithPred(); ++I) {
      if (I->modifiesRegister(Reg, TRI))
        Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
      else if (Lat)
        --Lat;
    }
    Dep.setLatency(Lat);
  } else if (UseI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(UseI->getIterator());
    MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end());
    unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI);
    for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
      if (I->readsRegister(Reg, TRI))
        break;
      --Lat;
    }
    Dep.setLatency(Lat);
  } else if (Dep.getLatency() == 0 && Dep.getReg() == AMDGPU::VCC_LO) {
    // Work around the fact that SIInstrInfo::fixImplicitOperands modifies
    // implicit operands which come from the MCInstrDesc, which can fool
    // ScheduleDAGInstrs::addPhysRegDataDeps into treating them as implicit
    // pseudo operands.
    Dep.setLatency(InstrInfo.getSchedModel().computeOperandLatency(
        DefI, DefOpIdx, UseI, UseOpIdx));
  }
}

namespace {
struct FillMFMAShadowMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  ScheduleDAGMI *DAG;

  FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}

  bool isSALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isSALU(*MI) && !MI->isTerminator();
  }

  bool isVALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isVALU(*MI);
  }

  // Link as many SALU instructions in chain as possible. Return the size
  // of the chain. Links up to MaxChain instructions.
  unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
                         SmallPtrSetImpl<SUnit *> &Visited) const {
    SmallVector<SUnit *, 8> Worklist({To});
    unsigned Linked = 0;

    while (!Worklist.empty() && MaxChain-- > 0) {
      SUnit *SU = Worklist.pop_back_val();
      if (!Visited.insert(SU).second)
        continue;

      LLVM_DEBUG(dbgs() << "Inserting edge from\n"; DAG->dumpNode(*From);
                 dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');

      if (SU != From && From != &DAG->ExitSU && DAG->canAddEdge(SU, From))
        if (DAG->addEdge(SU, SDep(From, SDep::Artificial)))
          ++Linked;

      for (SDep &SI : From->Succs) {
        SUnit *SUv = SI.getSUnit();
        if (SUv != From && SU != &DAG->ExitSU && isVALU(SUv) &&
            DAG->canAddEdge(SUv, SU))
          DAG->addEdge(SUv, SDep(SU, SDep::Artificial));
      }

      for (SDep &SI : SU->Succs) {
        SUnit *Succ = SI.getSUnit();
        if (Succ != SU && isSALU(Succ))
          Worklist.push_back(Succ);
      }
    }

    return Linked;
  }

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
    if (!ST.hasMAIInsts())
      return;
    DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
    const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
    if (!TSchedModel || DAG->SUnits.empty())
      return;

    // Scan for MFMA long latency instructions and try to add a dependency
    // of available SALU instructions to give them a chance to fill MFMA
    // shadow. That is desirable to fill MFMA shadow with SALU instructions
    // rather than VALU to prevent power consumption bursts and throttle.
    auto LastSALU = DAG->SUnits.begin();
    auto E = DAG->SUnits.end();
    SmallPtrSet<SUnit*, 32> Visited;
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MAI = *SU.getInstr();
      if (!TII->isMAI(MAI) ||
          MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
          MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64)
        continue;

      unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;

      LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
                 dbgs() << "Need " << Lat
                        << " instructions to cover latency.\n");

      // Find up to Lat independent scalar instructions as early as
      // possible such that they can be scheduled after this MFMA.
      for ( ; Lat && LastSALU != E; ++LastSALU) {
        if (Visited.count(&*LastSALU))
          continue;

        if (&SU == &DAG->ExitSU || &SU == &*LastSALU || !isSALU(&*LastSALU) ||
            !DAG->canAddEdge(&*LastSALU, &SU))
          continue;

        Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
      }
    }
  }
};
} // namespace

void GCNSubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
}

std::unique_ptr<ScheduleDAGMutation>
GCNSubtarget::createFillMFMAShadowMutation(const TargetInstrInfo *TII) const {
  return EnablePowerSched ? std::make_unique<FillMFMAShadowMutation>(&InstrInfo)
                          : nullptr;
}

unsigned GCNSubtarget::getNSAThreshold(const MachineFunction &MF) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX12)
    return 0; // Not MIMG encoding.

  if (NSAThreshold.getNumOccurrences() > 0)
    return std::max(NSAThreshold.getValue(), 2u);

  int Value = MF.getFunction().getFnAttributeAsParsedInteger(
      "amdgpu-nsa-threshold", -1);
  if (Value > 0)
    return std::max(Value, 2);

  return 3;
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM,
                                            const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}

GCNUserSGPRUsageInfo::GCNUserSGPRUsageInfo(const Function &F,
                                           const GCNSubtarget &ST)
    : ST(ST) {
  const CallingConv::ID CC = F.getCallingConv();
  const bool IsKernel =
      CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL;
  // FIXME: Should have analysis or something rather than attribute to detect
  // calls.
  const bool HasCalls = F.hasFnAttribute("amdgpu-calls");
  // FIXME: This attribute is a hack, we just need an analysis on the function
  // to look for allocas.
  const bool HasStackObjects = F.hasFnAttribute("amdgpu-stack-objects");

  if (IsKernel && (!F.arg_empty() || ST.getImplicitArgNumBytes(F) != 0))
    KernargSegmentPtr = true;

  bool IsAmdHsaOrMesa = ST.isAmdHsaOrMesa(F);
  if (IsAmdHsaOrMesa && !ST.enableFlatScratch())
    PrivateSegmentBuffer = true;
  else if (ST.isMesaGfxShader(F))
    ImplicitBufferPtr = true;

  if (!AMDGPU::isGraphics(CC)) {
    if (!F.hasFnAttribute("amdgpu-no-dispatch-ptr"))
      DispatchPtr = true;

    // FIXME: Can this always be disabled with < COv5?
    if (!F.hasFnAttribute("amdgpu-no-queue-ptr"))
      QueuePtr = true;

    if (!F.hasFnAttribute("amdgpu-no-dispatch-id"))
      DispatchID = true;
  }

  // TODO: This could be refined a lot. The attribute is a poor way of
  // detecting calls or stack objects that may require it before argument
  // lowering.
  if (ST.hasFlatAddressSpace() && AMDGPU::isEntryFunctionCC(CC) &&
      (IsAmdHsaOrMesa || ST.enableFlatScratch()) &&
      (HasCalls || HasStackObjects || ST.enableFlatScratch()) &&
      !ST.flatScratchIsArchitected()) {
    FlatScratchInit = true;
  }

  if (hasImplicitBufferPtr())
    NumUsedUserSGPRs += getNumUserSGPRForField(ImplicitBufferPtrID);

  if (hasPrivateSegmentBuffer())
    NumUsedUserSGPRs += getNumUserSGPRForField(PrivateSegmentBufferID);

  if (hasDispatchPtr())
    NumUsedUserSGPRs += getNumUserSGPRForField(DispatchPtrID);

  if (hasQueuePtr())
    NumUsedUserSGPRs += getNumUserSGPRForField(QueuePtrID);

  if (hasKernargSegmentPtr())
    NumUsedUserSGPRs += getNumUserSGPRForField(KernargSegmentPtrID);

  if (hasDispatchID())
    NumUsedUserSGPRs += getNumUserSGPRForField(DispatchIdID);

  if (hasFlatScratchInit())
    NumUsedUserSGPRs += getNumUserSGPRForField(FlatScratchInitID);
}

void GCNUserSGPRUsageInfo::allocKernargPreloadSGPRs(unsigned NumSGPRs) {
  assert(NumKernargPreloadSGPRs + NumSGPRs <= AMDGPU::getMaxNumUserSGPRs(ST));
  NumKernargPreloadSGPRs += NumSGPRs;
  NumUsedUserSGPRs += NumSGPRs;
}

unsigned GCNUserSGPRUsageInfo::getNumFreeUserSGPRs() {
  return AMDGPU::getMaxNumUserSGPRs(ST) - NumUsedUserSGPRs;
}

SmallVector<unsigned>
AMDGPUSubtarget::getMaxNumWorkGroups(const Function &F) const {
  return AMDGPU::getIntegerVecAttribute(F, "amdgpu-max-num-workgroups", 3);
}