GCNSubtarget.h source code [llvm/lib/Target/AMDGPU/GCNSubtarget.h]

//=====-- GCNSubtarget.h - Define GCN Subtarget for AMDGPU ------*- C++ -*-====//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//==-----------------------------------------------------------------------===//
8	//
9	/// \file
10	/// AMD GCN specific subclass of TargetSubtarget.
11	//
12	//===----------------------------------------------------------------------===//
13
14	#ifndef LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H
15	#define LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H
16
17	#include "AMDGPUCallLowering.h"
18	#include "AMDGPURegisterBankInfo.h"
19	#include "AMDGPUSubtarget.h"
20	#include "SIFrameLowering.h"
21	#include "SIISelLowering.h"
22	#include "SIInstrInfo.h"
23	#include "Utils/AMDGPUBaseInfo.h"
24	#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
25	#include "llvm/Support/ErrorHandling.h"
26
27	#define GET_SUBTARGETINFO_HEADER
28	#include "AMDGPUGenSubtargetInfo.inc"
29
30	namespace llvm {
31
32	class GCNTargetMachine;
33
34	class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
35	public AMDGPUSubtarget {
36	public:
37	using AMDGPUSubtarget::getMaxWavesPerEU;
38
// The following two enums are documented at:
// - https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
enum class TrapHandlerAbi {
  NONE = 0x00,
  AMDHSA = 0x01,
};
45
/// Trap identifiers with explicitly assigned values.
enum class TrapID {
  LLVMAMDHSATrap = 0x02,
  LLVMAMDHSADebugTrap = 0x03,
};
50
51	private:
52	/// GlobalISel related APIs.
53	std::unique_ptr<AMDGPUCallLowering> CallLoweringInfo;
54	std::unique_ptr<InlineAsmLowering> InlineAsmLoweringInfo;
55	std::unique_ptr<InstructionSelector> InstSelector;
56	std::unique_ptr<LegalizerInfo> Legalizer;
57	std::unique_ptr<AMDGPURegisterBankInfo> RegBankInfo;
58
59	protected:
60	// Basic subtarget description.
61	Triple TargetTriple;
62	AMDGPU::IsaInfo::AMDGPUTargetID TargetID;
63	unsigned Gen = INVALID;
64	InstrItineraryData InstrItins;
int LDSBankCount = 0;                // Reported by getLDSBankCount().
unsigned MaxPrivateElementSize = 0;  // Reported by getMaxPrivateElementSize().
67
68	// Possibly statically set by tablegen, but may want to be overridden.
69	bool FastDenormalF32 = false;
70	bool HalfRate64Ops = false;
71	bool FullRate64Ops = false;
72
73	// Dynamically set bits that enable features.
74	bool FlatForGlobal = false;
75	bool AutoWaitcntBeforeBarrier = false;
76	bool BackOffBarrier = false;
77	bool UnalignedScratchAccess = false;
78	bool UnalignedAccessMode = false;
79	bool HasApertureRegs = false;
80	bool SupportsXNACK = false;
81	bool KernargPreload = false;
82
83	// This should not be used directly. 'TargetID' tracks the dynamic settings
84	// for XNACK.
85	bool EnableXNACK = false;
86
87	bool EnableTgSplit = false;
88	bool EnableCuMode = false;
89	bool TrapHandler = false;
90
91	// Used as options.
92	bool EnableLoadStoreOpt = false;
93	bool EnableUnsafeDSOffsetFolding = false;
94	bool EnableSIScheduler = false;
95	bool EnableDS128 = false;
96	bool EnablePRTStrictNull = false;
97	bool DumpCode = false;
98
// Subtarget properties statically set by tablegen.
100	bool FP64 = false;
101	bool FMA = false;
102	bool MIMG_R128 = false;
103	bool CIInsts = false;
104	bool GFX8Insts = false;
105	bool GFX9Insts = false;
106	bool GFX90AInsts = false;
107	bool GFX940Insts = false;
108	bool GFX10Insts = false;
109	bool GFX11Insts = false;
110	bool GFX12Insts = false;
111	bool GFX10_3Insts = false;
112	bool GFX7GFX8GFX9Insts = false;
113	bool SGPRInitBug = false;
114	bool UserSGPRInit16Bug = false;
115	bool NegativeScratchOffsetBug = false;
116	bool NegativeUnalignedScratchOffsetBug = false;
117	bool HasSMemRealTime = false;
118	bool HasIntClamp = false;
119	bool HasFmaMixInsts = false;
120	bool HasMovrel = false;
121	bool HasVGPRIndexMode = false;
122	bool HasScalarDwordx3Loads = false;
123	bool HasScalarStores = false;
124	bool HasScalarAtomics = false;
125	bool HasSDWAOmod = false;
126	bool HasSDWAScalar = false;
127	bool HasSDWASdst = false;
128	bool HasSDWAMac = false;
129	bool HasSDWAOutModsVOPC = false;
130	bool HasDPP = false;
131	bool HasDPP8 = false;
132	bool HasDPALU_DPP = false;
133	bool HasDPPSrc1SGPR = false;
134	bool HasPackedFP32Ops = false;
135	bool HasImageInsts = false;
136	bool HasExtendedImageInsts = false;
137	bool HasR128A16 = false;
138	bool HasA16 = false;
139	bool HasG16 = false;
140	bool HasNSAEncoding = false;
141	bool HasPartialNSAEncoding = false;
142	bool GFX10_AEncoding = false;
143	bool GFX10_BEncoding = false;
144	bool HasDLInsts = false;
145	bool HasFmacF64Inst = false;
146	bool HasDot1Insts = false;
147	bool HasDot2Insts = false;
148	bool HasDot3Insts = false;
149	bool HasDot4Insts = false;
150	bool HasDot5Insts = false;
151	bool HasDot6Insts = false;
152	bool HasDot7Insts = false;
153	bool HasDot8Insts = false;
154	bool HasDot9Insts = false;
155	bool HasDot10Insts = false;
156	bool HasMAIInsts = false;
157	bool HasFP8Insts = false;
158	bool HasFP8ConversionInsts = false;
159	bool HasPkFmacF16Inst = false;
160	bool HasAtomicDsPkAdd16Insts = false;
161	bool HasAtomicFlatPkAdd16Insts = false;
162	bool HasAtomicFaddRtnInsts = false;
163	bool HasAtomicFaddNoRtnInsts = false;
164	bool HasAtomicBufferGlobalPkAddF16NoRtnInsts = false;
165	bool HasAtomicBufferGlobalPkAddF16Insts = false;
166	bool HasAtomicCSubNoRtnInsts = false;
167	bool HasAtomicGlobalPkAddBF16Inst = false;
168	bool HasFlatAtomicFaddF32Inst = false;
169	bool HasDefaultComponentZero = false;
170	bool HasDefaultComponentBroadcast = false;
171	bool SupportsSRAMECC = false;
172
173	// This should not be used directly. 'TargetID' tracks the dynamic settings
174	// for SRAMECC.
175	bool EnableSRAMECC = false;
176
177	bool HasNoSdstCMPX = false;
178	bool HasVscnt = false;
179	bool HasGetWaveIdInst = false;
180	bool HasSMemTimeInst = false;
181	bool HasShaderCyclesRegister = false;
182	bool HasShaderCyclesHiLoRegisters = false;
183	bool HasVOP3Literal = false;
184	bool HasNoDataDepHazard = false;
185	bool FlatAddressSpace = false;
186	bool FlatInstOffsets = false;
187	bool FlatGlobalInsts = false;
188	bool FlatScratchInsts = false;
189	bool ScalarFlatScratchInsts = false;
190	bool HasArchitectedFlatScratch = false;
191	bool EnableFlatScratch = false;
192	bool HasArchitectedSGPRs = false;
193	bool HasGDS = false;
194	bool HasGWS = false;
195	bool AddNoCarryInsts = false;
196	bool HasUnpackedD16VMem = false;
197	bool LDSMisalignedBug = false;
198	bool HasMFMAInlineLiteralBug = false;
199	bool UnalignedBufferAccess = false;
200	bool UnalignedDSAccess = false;
201	bool HasPackedTID = false;
202	bool ScalarizeGlobal = false;
203	bool HasSALUFloatInsts = false;
204	bool HasVGPRSingleUseHintInsts = false;
205	bool HasPseudoScalarTrans = false;
206	bool HasRestrictedSOffset = false;
207
208	bool HasVcmpxPermlaneHazard = false;
209	bool HasVMEMtoScalarWriteHazard = false;
210	bool HasSMEMtoVectorWriteHazard = false;
211	bool HasInstFwdPrefetchBug = false;
212	bool HasVcmpxExecWARHazard = false;
213	bool HasLdsBranchVmemWARHazard = false;
214	bool HasNSAtoVMEMBug = false;
215	bool HasNSAClauseBug = false;
216	bool HasOffset3fBug = false;
217	bool HasFlatSegmentOffsetBug = false;
218	bool HasImageStoreD16Bug = false;
219	bool HasImageGather4D16Bug = false;
220	bool HasMSAALoadDstSelBug = false;
221	bool HasGFX11FullVGPRs = false;
222	bool HasMADIntraFwdBug = false;
223	bool HasVOPDInsts = false;
224	bool HasVALUTransUseHazard = false;
225	bool HasForceStoreSC0SC1 = false;
226
227	bool RequiresCOV6 = false;
228
229	// Dummy feature to use for assembler in tablegen.
230	bool FeatureDisable = false;
231
232	SelectionDAGTargetInfo TSInfo;
233	private:
234	SIInstrInfo InstrInfo;
235	SITargetLowering TLInfo;
236	SIFrameLowering FrameLowering;
237
238	public:
239	GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
240	const GCNTargetMachine &TM);
241	~GCNSubtarget() override;
242
243	GCNSubtarget &initializeSubtargetDependencies(const Triple &TT,
244	StringRef GPU, StringRef FS);
245
246	const SIInstrInfo getInstrInfo() const* override {
247	return &InstrInfo;
248	}
249
250	const SIFrameLowering getFrameLowering() const* override {
251	return &FrameLowering;
252	}
253
254	const SITargetLowering getTargetLowering() const* override {
255	return &TLInfo;
256	}
257
258	const SIRegisterInfo getRegisterInfo() const* override {
259	return &InstrInfo.getRegisterInfo();
260	}
261
262	const CallLowering getCallLowering() const* override {
263	return CallLoweringInfo.get();
264	}
265
266	const InlineAsmLowering getInlineAsmLowering() const* override {
267	return InlineAsmLoweringInfo.get();
268	}
269
270	InstructionSelector getInstructionSelector() const* override {
271	return InstSelector.get();
272	}
273
274	const LegalizerInfo getLegalizerInfo() const* override {
275	return Legalizer.get();
276	}
277
278	const AMDGPURegisterBankInfo getRegBankInfo() const* override {
279	return RegBankInfo.get();
280	}
281
282	const AMDGPU::IsaInfo::AMDGPUTargetID &getTargetID() const {
283	return TargetID;
284	}
285
286	// Nothing implemented, just prevent crashes on use.
287	const SelectionDAGTargetInfo getSelectionDAGInfo() const* override {
288	return &TSInfo;
289	}
290
291	const InstrItineraryData getInstrItineraryData() const* override {
292	return &InstrItins;
293	}
294
295	void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS);
296
297	Generation getGeneration() const {
298	return (Generation)Gen;
299	}
300
301	unsigned getMaxWaveScratchSize() const {
302	// See COMPUTE_TMPRING_SIZE.WAVESIZE.
303	if (getGeneration() >= GFX12) {
304	// 18-bit field in units of 64-dword.
305	return (`64` * `4`) * ((`1` << `18`) - `1`);
306	}
307	if (getGeneration() == GFX11) {
308	// 15-bit field in units of 64-dword.
309	return (`64` * `4`) * ((`1` << `15`) - `1`);
310	}
311	// 13-bit field in units of 256-dword.
312	return (`256` * `4`) * ((`1` << `13`) - `1`);
313	}
314
315	/// Return the number of high bits known to be zero for a frame index.
316	unsigned getKnownHighZeroBitsForFrameIndex() const {
317	return llvm::countl_zero(getMaxWaveScratchSize()) + getWavefrontSizeLog2();
318	}
319
320	int getLDSBankCount() const {
321	return LDSBankCount;
322	}
323
324	unsigned getMaxPrivateElementSize(bool ForBufferRSrc = false) const {
325	return (ForBufferRSrc \|\| !enableFlatScratch()) ? MaxPrivateElementSize : `16`;
326	}
327
328	unsigned getConstantBusLimit(unsigned Opcode) const;
329
330	/// Returns if the result of this instruction with a 16-bit result returned in
331	/// a 32-bit register implicitly zeroes the high 16-bits, rather than preserve
332	/// the original value.
333	bool zeroesHigh16BitsOfDest(unsigned Opcode) const;
334
335	bool supportsWGP() const { return getGeneration() >= GFX10; }
336
337	bool hasIntClamp() const {
338	return HasIntClamp;
339	}
340
341	bool hasFP64() const {
342	return FP64;
343	}
344
345	bool hasMIMG_R128() const {
346	return MIMG_R128;
347	}
348
349	bool hasHWFP64() const {
350	return FP64;
351	}
352
353	bool hasHalfRate64Ops() const {
354	return HalfRate64Ops;
355	}
356
357	bool hasFullRate64Ops() const {
358	return FullRate64Ops;
359	}
360
361	bool hasAddr64() const {
362	return (getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS);
363	}
364
365	bool hasFlat() const {
366	return (getGeneration() > AMDGPUSubtarget::SOUTHERN_ISLANDS);
367	}
368
369	// Return true if the target only has the reverse operand versions of VALU
370	// shift instructions (e.g. v_lshrrev_b32, and no v_lshr_b32).
371	bool hasOnlyRevVALUShifts() const {
372	return getGeneration() >= VOLCANIC_ISLANDS;
373	}
374
375	bool hasFractBug() const {
376	return getGeneration() == SOUTHERN_ISLANDS;
377	}
378
379	bool hasBFE() const {
380	return true;
381	}
382
383	bool hasBFI() const {
384	return true;
385	}
386
387	bool hasBFM() const {
388	return hasBFE();
389	}
390
391	bool hasBCNT(unsigned Size) const {
392	return true;
393	}
394
395	bool hasFFBL() const {
396	return true;
397	}
398
399	bool hasFFBH() const {
400	return true;
401	}
402
403	bool hasMed3_16() const {
404	return getGeneration() >= AMDGPUSubtarget::GFX9;
405	}
406
407	bool hasMin3Max3_16() const {
408	return getGeneration() >= AMDGPUSubtarget::GFX9;
409	}
410
411	bool hasFmaMixInsts() const {
412	return HasFmaMixInsts;
413	}
414
415	bool hasCARRY() const {
416	return true;
417	}
418
419	bool hasFMA() const {
420	return FMA;
421	}
422
423	bool hasSwap() const {
424	return GFX9Insts;
425	}
426
427	bool hasScalarPackInsts() const {
428	return GFX9Insts;
429	}
430
431	bool hasScalarMulHiInsts() const {
432	return GFX9Insts;
433	}
434
435	bool hasScalarSubwordLoads() const { return getGeneration() >= GFX12; }
436
437	TrapHandlerAbi getTrapHandlerAbi() const {
438	return isAmdHsaOS() ? TrapHandlerAbi::AMDHSA : TrapHandlerAbi::NONE;
439	}
440
441	bool supportsGetDoorbellID() const {
442	// The S_GETREG DOORBELL_ID is supported by all GFX9 onward targets.
443	return getGeneration() >= GFX9;
444	}
445
446	/// True if the offset field of DS instructions works as expected. On SI, the
447	/// offset uses a 16-bit adder and does not always wrap properly.
448	bool hasUsableDSOffset() const {
449	return getGeneration() >= SEA_ISLANDS;
450	}
451
452	bool unsafeDSOffsetFoldingEnabled() const {
453	return EnableUnsafeDSOffsetFolding;
454	}
455
456	/// Condition output from div_scale is usable.
457	bool hasUsableDivScaleConditionOutput() const {
458	return getGeneration() != SOUTHERN_ISLANDS;
459	}
460
461	/// Extra wait hazard is needed in some cases before
462	/// s_cbranch_vccnz/s_cbranch_vccz.
463	bool hasReadVCCZBug() const {
464	return getGeneration() <= SEA_ISLANDS;
465	}
466
467	/// Writes to VCC_LO/VCC_HI update the VCCZ flag.
468	bool partialVCCWritesUpdateVCCZ() const {
469	return getGeneration() >= GFX10;
470	}
471
472	/// A read of an SGPR by SMRD instruction requires 4 wait states when the SGPR
473	/// was written by a VALU instruction.
474	bool hasSMRDReadVALUDefHazard() const {
475	return getGeneration() == SOUTHERN_ISLANDS;
476	}
477
478	/// A read of an SGPR by a VMEM instruction requires 5 wait states when the
479	/// SGPR was written by a VALU Instruction.
480	bool hasVMEMReadSGPRVALUDefHazard() const {
481	return getGeneration() >= VOLCANIC_ISLANDS;
482	}
483
484	bool hasRFEHazards() const {
485	return getGeneration() >= VOLCANIC_ISLANDS;
486	}
487
488	/// Number of hazard wait states for s_setreg_b32/s_setreg_imm32_b32.
489	unsigned getSetRegWaitStates() const {
490	return getGeneration() <= SEA_ISLANDS ? `1` : `2`;
491	}
492
493	bool dumpCode() const {
494	return DumpCode;
495	}
496
497	/// Return the amount of LDS that can be used that will not restrict the
498	/// occupancy lower than WaveCount.
499	unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount,
500	const Function &) const;
501
502	bool supportsMinMaxDenormModes() const {
503	return getGeneration() >= AMDGPUSubtarget::GFX9;
504	}
505
506	/// \returns If target supports S_DENORM_MODE.
507	bool hasDenormModeInst() const {
508	return getGeneration() >= AMDGPUSubtarget::GFX10;
509	}
510
511	bool useFlatForGlobal() const {
512	return FlatForGlobal;
513	}
514
515	/// \returns If target supports ds_read/write_b128 and user enables generation
516	/// of ds_read/write_b128.
517	bool useDS128() const {
518	return CIInsts && EnableDS128;
519	}
520
521	/// \return If target supports ds_read/write_b96/128.
522	bool hasDS96AndDS128() const {
523	return CIInsts;
524	}
525
526	/// Have v_trunc_f64, v_ceil_f64, v_rndne_f64
527	bool haveRoundOpsF64() const {
528	return CIInsts;
529	}
530
531	/// \returns If MUBUF instructions always perform range checking, even for
532	/// buffer resources used for private memory access.
533	bool privateMemoryResourceIsRangeChecked() const {
534	return getGeneration() < AMDGPUSubtarget::GFX9;
535	}
536
537	/// \returns If target requires PRT Struct NULL support (zero result registers
538	/// for sparse texture support).
539	bool usePRTStrictNull() const {
540	return EnablePRTStrictNull;
541	}
542
543	bool hasAutoWaitcntBeforeBarrier() const {
544	return AutoWaitcntBeforeBarrier;
545	}
546
547	/// \returns true if the target supports backing off of s_barrier instructions
548	/// when an exception is raised.
549	bool supportsBackOffBarrier() const {
550	return BackOffBarrier;
551	}
552
553	bool hasUnalignedBufferAccess() const {
554	return UnalignedBufferAccess;
555	}
556
557	bool hasUnalignedBufferAccessEnabled() const {
558	return UnalignedBufferAccess && UnalignedAccessMode;
559	}
560
561	bool hasUnalignedDSAccess() const {
562	return UnalignedDSAccess;
563	}
564
565	bool hasUnalignedDSAccessEnabled() const {
566	return UnalignedDSAccess && UnalignedAccessMode;
567	}
568
569	bool hasUnalignedScratchAccess() const {
570	return UnalignedScratchAccess;
571	}
572
573	bool hasUnalignedAccessMode() const {
574	return UnalignedAccessMode;
575	}
576
577	bool hasApertureRegs() const {
578	return HasApertureRegs;
579	}
580
581	bool isTrapHandlerEnabled() const {
582	return TrapHandler;
583	}
584
585	bool isXNACKEnabled() const {
586	return TargetID.isXnackOnOrAny();
587	}
588
589	bool isTgSplitEnabled() const {
590	return EnableTgSplit;
591	}
592
593	bool isCuModeEnabled() const {
594	return EnableCuMode;
595	}
596
597	bool hasFlatAddressSpace() const {
598	return FlatAddressSpace;
599	}
600
601	bool hasFlatScrRegister() const {
602	return hasFlatAddressSpace();
603	}
604
605	bool hasFlatInstOffsets() const {
606	return FlatInstOffsets;
607	}
608
609	bool hasFlatGlobalInsts() const {
610	return FlatGlobalInsts;
611	}
612
613	bool hasFlatScratchInsts() const {
614	return FlatScratchInsts;
615	}
616
617	// Check if target supports ST addressing mode with FLAT scratch instructions.
618	// The ST addressing mode means no registers are used, either VGPR or SGPR,
619	// but only immediate offset is swizzled and added to the FLAT scratch base.
620	bool hasFlatScratchSTMode() const {
621	return hasFlatScratchInsts() && (hasGFX10_3Insts() \|\| hasGFX940Insts());
622	}
623
624	bool hasFlatScratchSVSMode() const { return GFX940Insts \|\| GFX11Insts; }
625
626	bool hasScalarFlatScratchInsts() const {
627	return ScalarFlatScratchInsts;
628	}
629
630	bool enableFlatScratch() const {
631	return flatScratchIsArchitected() \|\|
632	(EnableFlatScratch && hasFlatScratchInsts());
633	}
634
635	bool hasGlobalAddTidInsts() const {
636	return GFX10_BEncoding;
637	}
638
639	bool hasAtomicCSub() const {
640	return GFX10_BEncoding;
641	}
642
643	// BUFFER/FLAT/GLOBAL_ATOMIC_ADD/MIN/MAX_F64
644	bool hasBufferFlatGlobalAtomicsF64() const { return hasGFX90AInsts(); }
645
646	// DS_ADD_F64/DS_ADD_RTN_F64
647	bool hasLdsAtomicAddF64() const { return hasGFX90AInsts(); }
648
649	bool hasMultiDwordFlatScratchAddressing() const {
650	return getGeneration() >= GFX9;
651	}
652
653	bool hasFlatSegmentOffsetBug() const {
654	return HasFlatSegmentOffsetBug;
655	}
656
657	bool hasFlatLgkmVMemCountInOrder() const {
658	return getGeneration() > GFX9;
659	}
660
661	bool hasD16LoadStore() const {
662	return getGeneration() >= GFX9;
663	}
664
665	bool d16PreservesUnusedBits() const {
666	return hasD16LoadStore() && !TargetID.isSramEccOnOrAny();
667	}
668
669	bool hasD16Images() const {
670	return getGeneration() >= VOLCANIC_ISLANDS;
671	}
672
673	/// Return if most LDS instructions have an m0 use that require m0 to be
674	/// initialized.
675	bool ldsRequiresM0Init() const {
676	return getGeneration() < GFX9;
677	}
678
679	// True if the hardware rewinds and replays GWS operations if a wave is
680	// preempted.
681	//
682	// If this is false, a GWS operation requires testing if a nack set the
683	// MEM_VIOL bit, and repeating if so.
684	bool hasGWSAutoReplay() const {
685	return getGeneration() >= GFX9;
686	}
687
688	/// \returns if target has ds_gws_sema_release_all instruction.
689	bool hasGWSSemaReleaseAll() const {
690	return CIInsts;
691	}
692
693	/// \returns true if the target has integer add/sub instructions that do not
694	/// produce a carry-out. This includes v_add_[iu]32, v_sub_[iu]32,
695	/// v_add_[iu]16, and v_sub_[iu]16, all of which support the clamp modifier
696	/// for saturation.
697	bool hasAddNoCarry() const {
698	return AddNoCarryInsts;
699	}
700
701	bool hasScalarAddSub64() const { return getGeneration() >= GFX12; }
702
703	bool hasScalarSMulU64() const { return getGeneration() >= GFX12; }
704
705	bool hasUnpackedD16VMem() const {
706	return HasUnpackedD16VMem;
707	}
708
709	// Covers VS/PS/CS graphics shaders
710	bool isMesaGfxShader(const Function &F) const {
711	return isMesa3DOS() && AMDGPU::isShader(CC: F.getCallingConv());
712	}
713
714	bool hasMad64_32() const {
715	return getGeneration() >= SEA_ISLANDS;
716	}
717
718	bool hasSDWAOmod() const {
719	return HasSDWAOmod;
720	}
721
722	bool hasSDWAScalar() const {
723	return HasSDWAScalar;
724	}
725
726	bool hasSDWASdst() const {
727	return HasSDWASdst;
728	}
729
730	bool hasSDWAMac() const {
731	return HasSDWAMac;
732	}
733
734	bool hasSDWAOutModsVOPC() const {
735	return HasSDWAOutModsVOPC;
736	}
737
738	bool hasDLInsts() const {
739	return HasDLInsts;
740	}
741
742	bool hasFmacF64Inst() const { return HasFmacF64Inst; }
743
744	bool hasDot1Insts() const {
745	return HasDot1Insts;
746	}
747
748	bool hasDot2Insts() const {
749	return HasDot2Insts;
750	}
751
752	bool hasDot3Insts() const {
753	return HasDot3Insts;
754	}
755
756	bool hasDot4Insts() const {
757	return HasDot4Insts;
758	}
759
760	bool hasDot5Insts() const {
761	return HasDot5Insts;
762	}
763
764	bool hasDot6Insts() const {
765	return HasDot6Insts;
766	}
767
768	bool hasDot7Insts() const {
769	return HasDot7Insts;
770	}
771
772	bool hasDot8Insts() const {
773	return HasDot8Insts;
774	}
775
776	bool hasDot9Insts() const {
777	return HasDot9Insts;
778	}
779
780	bool hasDot10Insts() const {
781	return HasDot10Insts;
782	}
783
784	bool hasMAIInsts() const {
785	return HasMAIInsts;
786	}
787
788	bool hasFP8Insts() const {
789	return HasFP8Insts;
790	}
791
792	bool hasFP8ConversionInsts() const { return HasFP8ConversionInsts; }
793
794	bool hasPkFmacF16Inst() const {
795	return HasPkFmacF16Inst;
796	}
797
798	bool hasAtomicDsPkAdd16Insts() const { return HasAtomicDsPkAdd16Insts; }
799
800	bool hasAtomicFlatPkAdd16Insts() const { return HasAtomicFlatPkAdd16Insts; }
801
802	bool hasAtomicFaddInsts() const {
803	return HasAtomicFaddRtnInsts \|\| HasAtomicFaddNoRtnInsts;
804	}
805
806	bool hasAtomicFaddRtnInsts() const { return HasAtomicFaddRtnInsts; }
807
808	bool hasAtomicFaddNoRtnInsts() const { return HasAtomicFaddNoRtnInsts; }
809
810	bool hasAtomicBufferGlobalPkAddF16NoRtnInsts() const {
811	return HasAtomicBufferGlobalPkAddF16NoRtnInsts;
812	}
813
814	bool hasAtomicBufferGlobalPkAddF16Insts() const {
815	return HasAtomicBufferGlobalPkAddF16Insts;
816	}
817
818	bool hasAtomicGlobalPkAddBF16Inst() const {
819	return HasAtomicGlobalPkAddBF16Inst;
820	}
821
822	bool hasFlatAtomicFaddF32Inst() const { return HasFlatAtomicFaddF32Inst; }
823
824	bool hasDefaultComponentZero() const { return HasDefaultComponentZero; }
825
826	bool hasDefaultComponentBroadcast() const {
827	return HasDefaultComponentBroadcast;
828	}
829
830	bool hasNoSdstCMPX() const {
831	return HasNoSdstCMPX;
832	}
833
834	bool hasVscnt() const {
835	return HasVscnt;
836	}
837
838	bool hasGetWaveIdInst() const {
839	return HasGetWaveIdInst;
840	}
841
842	bool hasSMemTimeInst() const {
843	return HasSMemTimeInst;
844	}
845
846	bool hasShaderCyclesRegister() const {
847	return HasShaderCyclesRegister;
848	}
849
850	bool hasShaderCyclesHiLoRegisters() const {
851	return HasShaderCyclesHiLoRegisters;
852	}
853
854	bool hasVOP3Literal() const {
855	return HasVOP3Literal;
856	}
857
858	bool hasNoDataDepHazard() const {
859	return HasNoDataDepHazard;
860	}
861
862	bool vmemWriteNeedsExpWaitcnt() const {
863	return getGeneration() < SEA_ISLANDS;
864	}
865
866	bool hasInstPrefetch() const {
867	return getGeneration() == GFX10 \|\| getGeneration() == GFX11;
868	}
869
870	bool hasPrefetch() const { return GFX12Insts; }
871
872	// Has s_cmpk_ instructions.*
873	bool hasSCmpK() const { return getGeneration() < GFX12; }
874
875	// Scratch is allocated in 256 dword per wave blocks for the entire
876	// wavefront. When viewed from the perspective of an arbitrary workitem, this
877	// is 4-byte aligned.
878	//
879	// Only 4-byte alignment is really needed to access anything. Transformations
880	// on the pointer value itself may rely on the alignment / known low bits of
881	// the pointer. Set this to something above the minimum to avoid needing
882	// dynamic realignment in common cases.
883	Align getStackAlignment() const { return Align (`16`); }
884
885	bool enableMachineScheduler() const override {
886	return true;
887	}
888
889	bool useAA() const override;
890
891	bool enableSubRegLiveness() const override {
892	return true;
893	}
894
895	void setScalarizeGlobalBehavior(bool b) { ScalarizeGlobal = b; }
896	bool getScalarizeGlobalBehavior() const { return ScalarizeGlobal; }
897
898	// static wrappers
899	static bool hasHalfRate64Ops(const TargetSubtargetInfo &STI);
900
901	// XXX - Why is this here if it isn't in the default pass set?
902	bool enableEarlyIfConversion() const override {
903	return true;
904	}
905
906	void overrideSchedPolicy(MachineSchedPolicy &Policy,
907	unsigned NumRegionInstrs) const override;
908
909	unsigned getMaxNumUserSGPRs() const {
910	return AMDGPU::getMaxNumUserSGPRs(*this);
911	}
912
913	bool hasSMemRealTime() const {
914	return HasSMemRealTime;
915	}
916
917	bool hasMovrel() const {
918	return HasMovrel;
919	}
920
921	bool hasVGPRIndexMode() const {
922	return HasVGPRIndexMode;
923	}
924
925	bool useVGPRIndexMode() const;
926
927	bool hasScalarCompareEq64() const {
928	return getGeneration() >= VOLCANIC_ISLANDS;
929	}
930
931	bool hasScalarDwordx3Loads() const { return HasScalarDwordx3Loads; }
932
933	bool hasScalarStores() const {
934	return HasScalarStores;
935	}
936
937	bool hasScalarAtomics() const {
938	return HasScalarAtomics;
939	}
940
941	bool hasLDSFPAtomicAdd() const { return GFX8Insts; }
942
943	/// \returns true if the subtarget has the v_permlanex16_b32 instruction.
944	bool hasPermLaneX16() const { return getGeneration() >= GFX10; }
945
946	/// \returns true if the subtarget has the v_permlane64_b32 instruction.
947	bool hasPermLane64() const { return getGeneration() >= GFX11; }
948
949	bool hasDPP() const {
950	return HasDPP;
951	}
952
953	bool hasDPPBroadcasts() const {
954	return HasDPP && getGeneration() < GFX10;
955	}
956
957	bool hasDPPWavefrontShifts() const {
958	return HasDPP && getGeneration() < GFX10;
959	}
960
961	bool hasDPP8() const {
962	return HasDPP8;
963	}
964
965	bool hasDPALU_DPP() const {
966	return HasDPALU_DPP;
967	}
968
969	bool hasDPPSrc1SGPR() const { return HasDPPSrc1SGPR; }
970
971	bool hasPackedFP32Ops() const {
972	return HasPackedFP32Ops;
973	}
974
975	// Has V_PK_MOV_B32 opcode
976	bool hasPkMovB32() const {
977	return GFX90AInsts;
978	}
979
980	bool hasFmaakFmamkF32Insts() const {
981	return getGeneration() >= GFX10 \|\| hasGFX940Insts();
982	}
983
984	bool hasImageInsts() const {
985	return HasImageInsts;
986	}
987
988	bool hasExtendedImageInsts() const {
989	return HasExtendedImageInsts;
990	}
991
992	bool hasR128A16() const {
993	return HasR128A16;
994	}
995
996	bool hasA16() const { return HasA16; }
997
998	bool hasG16() const { return HasG16; }
999
1000	bool hasOffset3fBug() const {
1001	return HasOffset3fBug;
1002	}
1003
1004	bool hasImageStoreD16Bug() const { return HasImageStoreD16Bug; }
1005
1006	bool hasImageGather4D16Bug() const { return HasImageGather4D16Bug; }
1007
1008	bool hasMADIntraFwdBug() const { return HasMADIntraFwdBug; }
1009
1010	bool hasMSAALoadDstSelBug() const { return HasMSAALoadDstSelBug; }
1011
1012	bool hasNSAEncoding() const { return HasNSAEncoding; }
1013
1014	bool hasNonNSAEncoding() const { return getGeneration() < GFX12; }
1015
1016	bool hasPartialNSAEncoding() const { return HasPartialNSAEncoding; }
1017
1018	unsigned getNSAMaxSize(bool HasSampler = false) const {
1019	return AMDGPU::getNSAMaxSize(*this, HasSampler);
1020	}
1021
1022	bool hasGFX10_AEncoding() const {
1023	return GFX10_AEncoding;
1024	}
1025
1026	bool hasGFX10_BEncoding() const {
1027	return GFX10_BEncoding;
1028	}
1029
1030	bool hasGFX10_3Insts() const {
1031	return GFX10_3Insts;
1032	}
1033
1034	bool hasMadF16() const;
1035
1036	bool hasMovB64() const { return GFX940Insts; }
1037
1038	bool hasLshlAddB64() const { return GFX940Insts; }
1039
1040	bool enableSIScheduler() const {
1041	return EnableSIScheduler;
1042	}
1043
1044	bool loadStoreOptEnabled() const {
1045	return EnableLoadStoreOpt;
1046	}
1047
1048	bool hasSGPRInitBug() const {
1049	return SGPRInitBug;
1050	}
1051
1052	bool hasUserSGPRInit16Bug() const {
1053	return UserSGPRInit16Bug && isWave32();
1054	}
1055
1056	bool hasNegativeScratchOffsetBug() const { return NegativeScratchOffsetBug; }
1057
1058	bool hasNegativeUnalignedScratchOffsetBug() const {
1059	return NegativeUnalignedScratchOffsetBug;
1060	}
1061
1062	bool hasMFMAInlineLiteralBug() const {
1063	return HasMFMAInlineLiteralBug;
1064	}
1065
1066	bool has12DWordStoreHazard() const {
1067	return getGeneration() != AMDGPUSubtarget::SOUTHERN_ISLANDS;
1068	}
1069
1070	// \returns true if the subtarget supports DWORDX3 load/store instructions.
1071	bool hasDwordx3LoadStores() const {
1072	return CIInsts;
1073	}
1074
1075	bool hasReadM0MovRelInterpHazard() const {
1076	return getGeneration() == AMDGPUSubtarget::GFX9;
1077	}
1078
1079	bool hasReadM0SendMsgHazard() const {
1080	return getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
1081	getGeneration() <= AMDGPUSubtarget::GFX9;
1082	}
1083
1084	bool hasReadM0LdsDmaHazard() const {
1085	return getGeneration() == AMDGPUSubtarget::GFX9;
1086	}
1087
1088	bool hasReadM0LdsDirectHazard() const {
1089	return getGeneration() == AMDGPUSubtarget::GFX9;
1090	}
1091
1092	bool hasVcmpxPermlaneHazard() const {
1093	return HasVcmpxPermlaneHazard;
1094	}
1095
1096	bool hasVMEMtoScalarWriteHazard() const {
1097	return HasVMEMtoScalarWriteHazard;
1098	}
1099
1100	bool hasSMEMtoVectorWriteHazard() const {
1101	return HasSMEMtoVectorWriteHazard;
1102	}
1103
1104	bool hasLDSMisalignedBug() const {
1105	return LDSMisalignedBug && !EnableCuMode;
1106	}
1107
1108	bool hasInstFwdPrefetchBug() const {
1109	return HasInstFwdPrefetchBug;
1110	}
1111
1112	bool hasVcmpxExecWARHazard() const {
1113	return HasVcmpxExecWARHazard;
1114	}
1115
1116	bool hasLdsBranchVmemWARHazard() const {
1117	return HasLdsBranchVmemWARHazard;
1118	}
1119
1120	// Shift amount of a 64 bit shift cannot be a highest allocated register
1121	// if also at the end of the allocation block.
1122	bool hasShift64HighRegBug() const {
1123	return GFX90AInsts && !GFX940Insts;
1124	}
1125
1126	// Has one cycle hazard on transcendental instruction feeding a
1127	// non transcendental VALU.
1128	bool hasTransForwardingHazard() const { return GFX940Insts; }
1129
1130	// Has one cycle hazard on a VALU instruction partially writing dst with
1131	// a shift of result bits feeding another VALU instruction.
1132	bool hasDstSelForwardingHazard() const { return GFX940Insts; }
1133
1134	// Cannot use op_sel with v_dot instructions.
1135	bool hasDOTOpSelHazard() const { return GFX940Insts \|\| GFX11Insts; }
1136
1137	// Does not have HW interlocs for VALU writing and then reading SGPRs.
1138	bool hasVDecCoExecHazard() const {
1139	return GFX940Insts;
1140	}
1141
1142	bool hasNSAtoVMEMBug() const {
1143	return HasNSAtoVMEMBug;
1144	}
1145
1146	bool hasNSAClauseBug() const { return HasNSAClauseBug; }
1147
1148	bool hasHardClauses() const { return getGeneration() >= GFX10; }
1149
1150	bool hasGFX90AInsts() const { return GFX90AInsts; }
1151
1152	bool hasFPAtomicToDenormModeHazard() const {
1153	return getGeneration() == GFX10;
1154	}
1155
1156	bool hasVOP3DPP() const { return getGeneration() >= GFX11; }
1157
1158	bool hasLdsDirect() const { return getGeneration() >= GFX11; }
1159
1160	bool hasLdsWaitVMSRC() const { return getGeneration() >= GFX12; }
1161
1162	bool hasVALUPartialForwardingHazard() const {
1163	return getGeneration() == GFX11;
1164	}
1165
1166	bool hasVALUTransUseHazard() const { return HasVALUTransUseHazard; }
1167
1168	bool hasForceStoreSC0SC1() const { return HasForceStoreSC0SC1; }
1169
1170	bool requiresCodeObjectV6() const { return RequiresCOV6; }
1171
1172	bool hasVALUMaskWriteHazard() const { return getGeneration() == GFX11; }
1173
1174	/// Return if operations acting on VGPR tuples require even alignment.
1175	bool needsAlignedVGPRs() const { return GFX90AInsts; }
1176
1177	/// Return true if the target has the S_PACK_HL_B32_B16 instruction.
1178	bool hasSPackHL() const { return GFX11Insts; }
1179
1180	/// Return true if the target's EXP instruction has the COMPR flag, which
1181	/// affects the meaning of the EN (enable) bits.
1182	bool hasCompressedExport() const { return !GFX11Insts; }
1183
1184	/// Return true if the target's EXP instruction supports the NULL export
1185	/// target.
1186	bool hasNullExportTarget() const { return !GFX11Insts; }
1187
1188	bool hasGFX11FullVGPRs() const { return HasGFX11FullVGPRs; }
1189
1190	bool hasVOPDInsts() const { return HasVOPDInsts; }
1191
1192	bool hasFlatScratchSVSSwizzleBug() const { return getGeneration() == GFX11; }
1193
1194	/// Return true if the target has the S_DELAY_ALU instruction.
1195	bool hasDelayAlu() const { return GFX11Insts; }
1196
1197	bool hasPackedTID() const { return HasPackedTID; }
1198
1199	// GFX940 is a derivation to GFX90A. hasGFX940Insts() being true implies that
1200	// hasGFX90AInsts is also true.
1201	bool hasGFX940Insts() const { return GFX940Insts; }
1202
1203	bool hasSALUFloatInsts() const { return HasSALUFloatInsts; }
1204
1205	bool hasVGPRSingleUseHintInsts() const { return HasVGPRSingleUseHintInsts; }
1206
1207	bool hasPseudoScalarTrans() const { return HasPseudoScalarTrans; }
1208
1209	bool hasRestrictedSOffset() const { return HasRestrictedSOffset; }
1210
1211	/// \returns true if the target uses LOADcnt/SAMPLEcnt/BVHcnt, DScnt/KMcnt
1212	/// and STOREcnt rather than VMcnt, LGKMcnt and VScnt respectively.
1213	bool hasExtendedWaitCounts() const { return getGeneration() >= GFX12; }
1214
1215	/// Return the maximum number of waves per SIMD for kernels using \p SGPRs
1216	/// SGPRs
1217	unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const;
1218
1219	/// Return the maximum number of waves per SIMD for kernels using \p VGPRs
1220	/// VGPRs
1221	unsigned getOccupancyWithNumVGPRs(unsigned VGPRs) const;
1222
1223	/// Return occupancy for the given function. Used LDS and a number of
1224	/// registers if provided.
1225	/// Note, occupancy can be affected by the scratch allocation as well, but
1226	/// we do not have enough information to compute it.
1227	unsigned computeOccupancy(const Function &F, unsigned LDSSize = `0`,
1228	unsigned NumSGPRs = `0`, unsigned NumVGPRs = `0`) const;
1229
1230	/// \returns true if the flat_scratch register should be initialized with the
1231	/// pointer to the wave's scratch memory rather than a size and offset.
1232	bool flatScratchIsPointer() const {
1233	return getGeneration() >= AMDGPUSubtarget::GFX9;
1234	}
1235
1236	/// \returns true if the flat_scratch register is initialized by the HW.
1237	/// In this case it is readonly.
1238	bool flatScratchIsArchitected() const { return HasArchitectedFlatScratch; }
1239
1240	/// \returns true if the architected SGPRs are enabled.
1241	bool hasArchitectedSGPRs() const { return HasArchitectedSGPRs; }
1242
1243	/// \returns true if Global Data Share is supported.
1244	bool hasGDS() const { return HasGDS; }
1245
1246	/// \returns true if Global Wave Sync is supported.
1247	bool hasGWS() const { return HasGWS; }
1248
1249	/// \returns true if the machine has merged shaders in which s0-s7 are
1250	/// reserved by the hardware and user SGPRs start at s8
1251	bool hasMergedShaders() const {
1252	return getGeneration() >= GFX9;
1253	}
1254
1255	// \returns true if the target supports the pre-NGG legacy geometry path.
1256	bool hasLegacyGeometry() const { return getGeneration() < GFX11; }
1257
1258	// \returns true if preloading kernel arguments is supported.
1259	bool hasKernargPreload() const { return KernargPreload; }
1260
1261	// \returns true if the target has split barriers feature
1262	bool hasSplitBarriers() const { return getGeneration() >= GFX12; }
1263
1264	// \returns true if FP8/BF8 VOP1 form of conversion to F32 is unreliable.
1265	bool hasCvtFP8VOP1Bug() const { return true; }
1266
1267	// \returns true if CSUB (a.k.a. SUB_CLAMP on GFX12) atomics support a
1268	// no-return form.
1269	bool hasAtomicCSubNoRtnInsts() const { return HasAtomicCSubNoRtnInsts; }
1270
1271	// \returns true if the target has DX10_CLAMP kernel descriptor mode bit
1272	bool hasDX10ClampMode() const { return getGeneration() < GFX12; }
1273
1274	// \returns true if the target has IEEE kernel descriptor mode bit
1275	bool hasIEEEMode() const { return getGeneration() < GFX12; }
1276
1277	// \returns true if the target has IEEE fminimum/fmaximum instructions
1278	bool hasIEEEMinMax() const { return getGeneration() >= GFX12; }
1279
1280	// \returns true if the target has WG_RR_MODE kernel descriptor mode bit
1281	bool hasRrWGMode() const { return getGeneration() >= GFX12; }
1282
1283	/// \returns true if VADDR and SADDR fields in VSCRATCH can use negative
1284	/// values.
1285	bool hasSignedScratchOffsets() const { return getGeneration() >= GFX12; }
1286
1287	// \returns true if S_GETPC_B64 zero-extends the result from 48 bits instead
1288	// of sign-extending.
1289	bool hasGetPCZeroExtension() const { return GFX12Insts; }
1290
1291	/// \returns SGPR allocation granularity supported by the subtarget.
1292	unsigned getSGPRAllocGranule() const {
1293	return AMDGPU::IsaInfo::getSGPRAllocGranule(this);
1294	}
1295
1296	/// \returns SGPR encoding granularity supported by the subtarget.
1297	unsigned getSGPREncodingGranule() const {
1298	return AMDGPU::IsaInfo::getSGPREncodingGranule(this);
1299	}
1300
1301	/// \returns Total number of SGPRs supported by the subtarget.
1302	unsigned getTotalNumSGPRs() const {
1303	return AMDGPU::IsaInfo::getTotalNumSGPRs(this);
1304	}
1305
1306	/// \returns Addressable number of SGPRs supported by the subtarget.
1307	unsigned getAddressableNumSGPRs() const {
1308	return AMDGPU::IsaInfo::getAddressableNumSGPRs(this);
1309	}
1310
1311	/// \returns Minimum number of SGPRs that meets the given number of waves per
1312	/// execution unit requirement supported by the subtarget.
1313	unsigned getMinNumSGPRs(unsigned WavesPerEU) const {
1314	return AMDGPU::IsaInfo::getMinNumSGPRs(this, WavesPerEU);
1315	}
1316
1317	/// \returns Maximum number of SGPRs that meets the given number of waves per
1318	/// execution unit requirement supported by the subtarget.
1319	unsigned getMaxNumSGPRs(unsigned WavesPerEU, bool Addressable) const {
1320	return AMDGPU::IsaInfo::getMaxNumSGPRs(this, WavesPerEU, Addressable);
1321	}
1322
1323	/// \returns Reserved number of SGPRs. This is common
1324	/// utility function called by MachineFunction and
1325	/// Function variants of getReservedNumSGPRs.
1326	unsigned getBaseReservedNumSGPRs(const bool HasFlatScratch) const;
1327	/// \returns Reserved number of SGPRs for given machine function \p MF.
1328	unsigned getReservedNumSGPRs(const MachineFunction &MF) const;
1329
1330	/// \returns Reserved number of SGPRs for given function \p F.
1331	unsigned getReservedNumSGPRs(const Function &F) const;
1332
1333	/// \returns max num SGPRs. This is the common utility
1334	/// function called by MachineFunction and Function
1335	/// variants of getMaxNumSGPRs.
1336	unsigned getBaseMaxNumSGPRs(const Function &F,
1337	std::pair<unsigned, unsigned> WavesPerEU,
1338	unsigned PreloadedSGPRs,
1339	unsigned ReservedNumSGPRs) const;
1340
1341	/// \returns Maximum number of SGPRs that meets number of waves per execution
1342	/// unit requirement for function \p MF, or number of SGPRs explicitly
1343	/// requested using "amdgpu-num-sgpr" attribute attached to function \p MF.
1344	///
1345	/// \returns Value that meets number of waves per execution unit requirement
1346	/// if explicitly requested value cannot be converted to integer, violates
1347	/// subtarget's specifications, or does not meet number of waves per execution
1348	/// unit requirement.
1349	unsigned getMaxNumSGPRs(const MachineFunction &MF) const;
1350
1351	/// \returns Maximum number of SGPRs that meets number of waves per execution
1352	/// unit requirement for function \p F, or number of SGPRs explicitly
1353	/// requested using "amdgpu-num-sgpr" attribute attached to function \p F.
1354	///
1355	/// \returns Value that meets number of waves per execution unit requirement
1356	/// if explicitly requested value cannot be converted to integer, violates
1357	/// subtarget's specifications, or does not meet number of waves per execution
1358	/// unit requirement.
1359	unsigned getMaxNumSGPRs(const Function &F) const;
1360
1361	/// \returns VGPR allocation granularity supported by the subtarget.
1362	unsigned getVGPRAllocGranule() const {
1363	return AMDGPU::IsaInfo::getVGPRAllocGranule(this);
1364	}
1365
1366	/// \returns VGPR encoding granularity supported by the subtarget.
1367	unsigned getVGPREncodingGranule() const {
1368	return AMDGPU::IsaInfo::getVGPREncodingGranule(this);
1369	}
1370
1371	/// \returns Total number of VGPRs supported by the subtarget.
1372	unsigned getTotalNumVGPRs() const {
1373	return AMDGPU::IsaInfo::getTotalNumVGPRs(this);
1374	}
1375
1376	/// \returns Addressable number of VGPRs supported by the subtarget.
1377	unsigned getAddressableNumVGPRs() const {
1378	return AMDGPU::IsaInfo::getAddressableNumVGPRs(this);
1379	}
1380
1381	/// \returns the minimum number of VGPRs that will prevent achieving more than
1382	/// the specified number of waves \p WavesPerEU.
1383	unsigned getMinNumVGPRs(unsigned WavesPerEU) const {
1384	return AMDGPU::IsaInfo::getMinNumVGPRs(this, WavesPerEU);
1385	}
1386
1387	/// \returns the maximum number of VGPRs that can be used and still achieved
1388	/// at least the specified number of waves \p WavesPerEU.
1389	unsigned getMaxNumVGPRs(unsigned WavesPerEU) const {
1390	return AMDGPU::IsaInfo::getMaxNumVGPRs(this, WavesPerEU);
1391	}
1392
1393	/// \returns max num VGPRs. This is the common utility function
1394	/// called by MachineFunction and Function variants of getMaxNumVGPRs.
1395	unsigned getBaseMaxNumVGPRs(const Function &F,
1396	std::pair<unsigned, unsigned> WavesPerEU) const;
1397	/// \returns Maximum number of VGPRs that meets number of waves per execution
1398	/// unit requirement for function \p F, or number of VGPRs explicitly
1399	/// requested using "amdgpu-num-vgpr" attribute attached to function \p F.
1400	///
1401	/// \returns Value that meets number of waves per execution unit requirement
1402	/// if explicitly requested value cannot be converted to integer, violates
1403	/// subtarget's specifications, or does not meet number of waves per execution
1404	/// unit requirement.
1405	unsigned getMaxNumVGPRs(const Function &F) const;
1406
1407	unsigned getMaxNumAGPRs(const Function &F) const {
1408	return getMaxNumVGPRs(F);
1409	}
1410
1411	/// \returns Maximum number of VGPRs that meets number of waves per execution
1412	/// unit requirement for function \p MF, or number of VGPRs explicitly
1413	/// requested using "amdgpu-num-vgpr" attribute attached to function \p MF.
1414	///
1415	/// \returns Value that meets number of waves per execution unit requirement
1416	/// if explicitly requested value cannot be converted to integer, violates
1417	/// subtarget's specifications, or does not meet number of waves per execution
1418	/// unit requirement.
1419	unsigned getMaxNumVGPRs(const MachineFunction &MF) const;
1420
1421	void getPostRAMutations(
1422	std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations)
1423	const override;
1424
1425	std::unique_ptr<ScheduleDAGMutation>
1426	createFillMFMAShadowMutation(const TargetInstrInfo TII) const*;
1427
1428	bool isWave32() const {
1429	return getWavefrontSize() == `32`;
1430	}
1431
1432	bool isWave64() const {
1433	return getWavefrontSize() == `64`;
1434	}
1435
1436	const TargetRegisterClass getBoolRC() const* {
1437	return getRegisterInfo()->getBoolRC();
1438	}
1439
1440	/// \returns Maximum number of work groups per compute unit supported by the
1441	/// subtarget and limited by given \p FlatWorkGroupSize.
1442	unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const override {
1443	return AMDGPU::IsaInfo::getMaxWorkGroupsPerCU(this, FlatWorkGroupSize);
1444	}
1445
1446	/// \returns Minimum flat work group size supported by the subtarget.
1447	unsigned getMinFlatWorkGroupSize() const override {
1448	return AMDGPU::IsaInfo::getMinFlatWorkGroupSize(this);
1449	}
1450
1451	/// \returns Maximum flat work group size supported by the subtarget.
1452	unsigned getMaxFlatWorkGroupSize() const override {
1453	return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize(this);
1454	}
1455
1456	/// \returns Number of waves per execution unit required to support the given
1457	/// \p FlatWorkGroupSize.
1458	unsigned
1459	getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const override {
1460	return AMDGPU::IsaInfo::getWavesPerEUForWorkGroup(this, FlatWorkGroupSize);
1461	}
1462
1463	/// \returns Minimum number of waves per execution unit supported by the
1464	/// subtarget.
1465	unsigned getMinWavesPerEU() const override {
1466	return AMDGPU::IsaInfo::getMinWavesPerEU(this);
1467	}
1468
1469	void adjustSchedDependency(SUnit Def, int* DefOpIdx, SUnit Use, int* UseOpIdx,
1470	SDep &Dep) const override;
1471
1472	// \returns true if it's beneficial on this subtarget for the scheduler to
1473	// cluster stores as well as loads.
1474	bool shouldClusterStores() const { return getGeneration() >= GFX11; }
1475
1476	// \returns the number of address arguments from which to enable MIMG NSA
1477	// on supported architectures.
1478	unsigned getNSAThreshold(const MachineFunction &MF) const;
1479
1480	// \returns true if the subtarget has a hazard requiring an "s_nop 0"
1481	// instruction before "s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)".
1482	bool requiresNopBeforeDeallocVGPRs() const {
1483	// Currently all targets that support the dealloc VGPRs message also require
1484	// the nop.
1485	return true;
1486	}
1487	};
1488
1489	class GCNUserSGPRUsageInfo {
1490	public:
1491	bool hasImplicitBufferPtr() const { return ImplicitBufferPtr; }
1492
1493	bool hasPrivateSegmentBuffer() const { return PrivateSegmentBuffer; }
1494
1495	bool hasDispatchPtr() const { return DispatchPtr; }
1496
1497	bool hasQueuePtr() const { return QueuePtr; }
1498
1499	bool hasKernargSegmentPtr() const { return KernargSegmentPtr; }
1500
1501	bool hasDispatchID() const { return DispatchID; }
1502
1503	bool hasFlatScratchInit() const { return FlatScratchInit; }
1504
1505	unsigned getNumKernargPreloadSGPRs() const { return NumKernargPreloadSGPRs; }
1506
1507	unsigned getNumUsedUserSGPRs() const { return NumUsedUserSGPRs; }
1508
1509	unsigned getNumFreeUserSGPRs();
1510
1511	void allocKernargPreloadSGPRs(unsigned NumSGPRs);
1512
1513	enum UserSGPRID : unsigned {
1514	ImplicitBufferPtrID = `0`,
1515	PrivateSegmentBufferID = `1`,
1516	DispatchPtrID = `2`,
1517	QueuePtrID = `3`,
1518	KernargSegmentPtrID = `4`,
1519	DispatchIdID = `5`,
1520	FlatScratchInitID = `6`,
1521	PrivateSegmentSizeID = `7`
1522	};
1523
1524	// Returns the size in number of SGPRs for preload user SGPR field.
1525	static unsigned getNumUserSGPRForField(UserSGPRID ID) {
1526	switch (ID) {
1527	case ImplicitBufferPtrID:
1528	return `2`;
1529	case PrivateSegmentBufferID:
1530	return `4`;
1531	case DispatchPtrID:
1532	return `2`;
1533	case QueuePtrID:
1534	return `2`;
1535	case KernargSegmentPtrID:
1536	return `2`;
1537	case DispatchIdID:
1538	return `2`;
1539	case FlatScratchInitID:
1540	return `2`;
1541	case PrivateSegmentSizeID:
1542	return `1`;
1543	}
1544	llvm_unreachable("Unknown UserSGPRID.");
1545	}
1546
1547	GCNUserSGPRUsageInfo(const Function &F, const GCNSubtarget &ST);
1548
1549	private:
1550	const GCNSubtarget &ST;
1551
1552	// Private memory buffer
1553	// Compute directly in sgpr[0:1]
1554	// Other shaders indirect 64-bits at sgpr[0:1]
1555	bool ImplicitBufferPtr = false;
1556
1557	bool PrivateSegmentBuffer = false;
1558
1559	bool DispatchPtr = false;
1560
1561	bool QueuePtr = false;
1562
1563	bool KernargSegmentPtr = false;
1564
1565	bool DispatchID = false;
1566
1567	bool FlatScratchInit = false;
1568
1569	unsigned NumKernargPreloadSGPRs = `0`;
1570
1571	unsigned NumUsedUserSGPRs = `0`;
1572	};
1573
1574	} // end namespace llvm
1575
1576	#endif // LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H
1577

// Source: llvm/lib/Target/AMDGPU/GCNSubtarget.h