1//=====-- GCNSubtarget.h - Define GCN Subtarget for AMDGPU ------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//==-----------------------------------------------------------------------===//
8//
9/// \file
10/// AMD GCN specific subclass of TargetSubtarget.
11//
12//===----------------------------------------------------------------------===//
13
14#ifndef LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H
15#define LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H
16
17#include "AMDGPUCallLowering.h"
18#include "AMDGPURegisterBankInfo.h"
19#include "AMDGPUSubtarget.h"
20#include "SIFrameLowering.h"
21#include "SIISelLowering.h"
22#include "SIInstrInfo.h"
23#include "Utils/AMDGPUBaseInfo.h"
24#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
25#include "llvm/Support/ErrorHandling.h"
26
27#define GET_SUBTARGETINFO_HEADER
28#include "AMDGPUGenSubtargetInfo.inc"
29
30namespace llvm {
31
32class GCNTargetMachine;
33
34class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
35 public AMDGPUSubtarget {
36public:
37 using AMDGPUSubtarget::getMaxWavesPerEU;
38
39 // Following 2 enums are documented at:
40 // - https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
/// Trap handler ABI selector; enumerator values are fixed explicitly.
enum class TrapHandlerAbi {
  NONE = 0,
  AMDHSA = 1,
};
45
/// Trap identifiers; enumerator values are fixed explicitly.
enum class TrapID {
  LLVMAMDHSATrap = 2,
  LLVMAMDHSADebugTrap = 3,
};
50
51private:
52 /// GlobalISel related APIs.
53 std::unique_ptr<AMDGPUCallLowering> CallLoweringInfo;
54 std::unique_ptr<InlineAsmLowering> InlineAsmLoweringInfo;
55 std::unique_ptr<InstructionSelector> InstSelector;
56 std::unique_ptr<LegalizerInfo> Legalizer;
57 std::unique_ptr<AMDGPURegisterBankInfo> RegBankInfo;
58
59protected:
60 // Basic subtarget description.
61 Triple TargetTriple;
62 AMDGPU::IsaInfo::AMDGPUTargetID TargetID;
63 unsigned Gen = INVALID;
64 InstrItineraryData InstrItins;
65 int LDSBankCount = 0;
66 unsigned MaxPrivateElementSize = 0;
67
68 // Possibly statically set by tablegen, but may want to be overridden.
69 bool FastDenormalF32 = false;
70 bool HalfRate64Ops = false;
71 bool FullRate64Ops = false;
72
73 // Dynamically set bits that enable features.
74 bool FlatForGlobal = false;
75 bool AutoWaitcntBeforeBarrier = false;
76 bool BackOffBarrier = false;
77 bool UnalignedScratchAccess = false;
78 bool UnalignedAccessMode = false;
79 bool HasApertureRegs = false;
80 bool SupportsXNACK = false;
81 bool KernargPreload = false;
82
83 // This should not be used directly. 'TargetID' tracks the dynamic settings
84 // for XNACK.
85 bool EnableXNACK = false;
86
87 bool EnableTgSplit = false;
88 bool EnableCuMode = false;
89 bool TrapHandler = false;
90
91 // Used as options.
92 bool EnableLoadStoreOpt = false;
93 bool EnableUnsafeDSOffsetFolding = false;
94 bool EnableSIScheduler = false;
95 bool EnableDS128 = false;
96 bool EnablePRTStrictNull = false;
97 bool DumpCode = false;
98
// Subtarget properties statically set by tablegen.
100 bool FP64 = false;
101 bool FMA = false;
102 bool MIMG_R128 = false;
103 bool CIInsts = false;
104 bool GFX8Insts = false;
105 bool GFX9Insts = false;
106 bool GFX90AInsts = false;
107 bool GFX940Insts = false;
108 bool GFX10Insts = false;
109 bool GFX11Insts = false;
110 bool GFX12Insts = false;
111 bool GFX10_3Insts = false;
112 bool GFX7GFX8GFX9Insts = false;
113 bool SGPRInitBug = false;
114 bool UserSGPRInit16Bug = false;
115 bool NegativeScratchOffsetBug = false;
116 bool NegativeUnalignedScratchOffsetBug = false;
117 bool HasSMemRealTime = false;
118 bool HasIntClamp = false;
119 bool HasFmaMixInsts = false;
120 bool HasMovrel = false;
121 bool HasVGPRIndexMode = false;
122 bool HasScalarDwordx3Loads = false;
123 bool HasScalarStores = false;
124 bool HasScalarAtomics = false;
125 bool HasSDWAOmod = false;
126 bool HasSDWAScalar = false;
127 bool HasSDWASdst = false;
128 bool HasSDWAMac = false;
129 bool HasSDWAOutModsVOPC = false;
130 bool HasDPP = false;
131 bool HasDPP8 = false;
132 bool HasDPALU_DPP = false;
133 bool HasDPPSrc1SGPR = false;
134 bool HasPackedFP32Ops = false;
135 bool HasImageInsts = false;
136 bool HasExtendedImageInsts = false;
137 bool HasR128A16 = false;
138 bool HasA16 = false;
139 bool HasG16 = false;
140 bool HasNSAEncoding = false;
141 bool HasPartialNSAEncoding = false;
142 bool GFX10_AEncoding = false;
143 bool GFX10_BEncoding = false;
144 bool HasDLInsts = false;
145 bool HasFmacF64Inst = false;
146 bool HasDot1Insts = false;
147 bool HasDot2Insts = false;
148 bool HasDot3Insts = false;
149 bool HasDot4Insts = false;
150 bool HasDot5Insts = false;
151 bool HasDot6Insts = false;
152 bool HasDot7Insts = false;
153 bool HasDot8Insts = false;
154 bool HasDot9Insts = false;
155 bool HasDot10Insts = false;
156 bool HasMAIInsts = false;
157 bool HasFP8Insts = false;
158 bool HasFP8ConversionInsts = false;
159 bool HasPkFmacF16Inst = false;
160 bool HasAtomicDsPkAdd16Insts = false;
161 bool HasAtomicFlatPkAdd16Insts = false;
162 bool HasAtomicFaddRtnInsts = false;
163 bool HasAtomicFaddNoRtnInsts = false;
164 bool HasAtomicBufferGlobalPkAddF16NoRtnInsts = false;
165 bool HasAtomicBufferGlobalPkAddF16Insts = false;
166 bool HasAtomicCSubNoRtnInsts = false;
167 bool HasAtomicGlobalPkAddBF16Inst = false;
168 bool HasFlatAtomicFaddF32Inst = false;
169 bool HasDefaultComponentZero = false;
170 bool HasDefaultComponentBroadcast = false;
171 bool SupportsSRAMECC = false;
172
173 // This should not be used directly. 'TargetID' tracks the dynamic settings
174 // for SRAMECC.
175 bool EnableSRAMECC = false;
176
177 bool HasNoSdstCMPX = false;
178 bool HasVscnt = false;
179 bool HasGetWaveIdInst = false;
180 bool HasSMemTimeInst = false;
181 bool HasShaderCyclesRegister = false;
182 bool HasShaderCyclesHiLoRegisters = false;
183 bool HasVOP3Literal = false;
184 bool HasNoDataDepHazard = false;
185 bool FlatAddressSpace = false;
186 bool FlatInstOffsets = false;
187 bool FlatGlobalInsts = false;
188 bool FlatScratchInsts = false;
189 bool ScalarFlatScratchInsts = false;
190 bool HasArchitectedFlatScratch = false;
191 bool EnableFlatScratch = false;
192 bool HasArchitectedSGPRs = false;
193 bool HasGDS = false;
194 bool HasGWS = false;
195 bool AddNoCarryInsts = false;
196 bool HasUnpackedD16VMem = false;
197 bool LDSMisalignedBug = false;
198 bool HasMFMAInlineLiteralBug = false;
199 bool UnalignedBufferAccess = false;
200 bool UnalignedDSAccess = false;
201 bool HasPackedTID = false;
202 bool ScalarizeGlobal = false;
203 bool HasSALUFloatInsts = false;
204 bool HasVGPRSingleUseHintInsts = false;
205 bool HasPseudoScalarTrans = false;
206 bool HasRestrictedSOffset = false;
207
208 bool HasVcmpxPermlaneHazard = false;
209 bool HasVMEMtoScalarWriteHazard = false;
210 bool HasSMEMtoVectorWriteHazard = false;
211 bool HasInstFwdPrefetchBug = false;
212 bool HasVcmpxExecWARHazard = false;
213 bool HasLdsBranchVmemWARHazard = false;
214 bool HasNSAtoVMEMBug = false;
215 bool HasNSAClauseBug = false;
216 bool HasOffset3fBug = false;
217 bool HasFlatSegmentOffsetBug = false;
218 bool HasImageStoreD16Bug = false;
219 bool HasImageGather4D16Bug = false;
220 bool HasMSAALoadDstSelBug = false;
221 bool HasGFX11FullVGPRs = false;
222 bool HasMADIntraFwdBug = false;
223 bool HasVOPDInsts = false;
224 bool HasVALUTransUseHazard = false;
225 bool HasForceStoreSC0SC1 = false;
226
227 bool RequiresCOV6 = false;
228
229 // Dummy feature to use for assembler in tablegen.
230 bool FeatureDisable = false;
231
232 SelectionDAGTargetInfo TSInfo;
233private:
234 SIInstrInfo InstrInfo;
235 SITargetLowering TLInfo;
236 SIFrameLowering FrameLowering;
237
238public:
239 GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
240 const GCNTargetMachine &TM);
241 ~GCNSubtarget() override;
242
243 GCNSubtarget &initializeSubtargetDependencies(const Triple &TT,
244 StringRef GPU, StringRef FS);
245
246 const SIInstrInfo *getInstrInfo() const override {
247 return &InstrInfo;
248 }
249
250 const SIFrameLowering *getFrameLowering() const override {
251 return &FrameLowering;
252 }
253
254 const SITargetLowering *getTargetLowering() const override {
255 return &TLInfo;
256 }
257
258 const SIRegisterInfo *getRegisterInfo() const override {
259 return &InstrInfo.getRegisterInfo();
260 }
261
262 const CallLowering *getCallLowering() const override {
263 return CallLoweringInfo.get();
264 }
265
266 const InlineAsmLowering *getInlineAsmLowering() const override {
267 return InlineAsmLoweringInfo.get();
268 }
269
270 InstructionSelector *getInstructionSelector() const override {
271 return InstSelector.get();
272 }
273
274 const LegalizerInfo *getLegalizerInfo() const override {
275 return Legalizer.get();
276 }
277
278 const AMDGPURegisterBankInfo *getRegBankInfo() const override {
279 return RegBankInfo.get();
280 }
281
282 const AMDGPU::IsaInfo::AMDGPUTargetID &getTargetID() const {
283 return TargetID;
284 }
285
286 // Nothing implemented, just prevent crashes on use.
287 const SelectionDAGTargetInfo *getSelectionDAGInfo() const override {
288 return &TSInfo;
289 }
290
291 const InstrItineraryData *getInstrItineraryData() const override {
292 return &InstrItins;
293 }
294
295 void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS);
296
297 Generation getGeneration() const {
298 return (Generation)Gen;
299 }
300
301 unsigned getMaxWaveScratchSize() const {
302 // See COMPUTE_TMPRING_SIZE.WAVESIZE.
303 if (getGeneration() >= GFX12) {
304 // 18-bit field in units of 64-dword.
305 return (64 * 4) * ((1 << 18) - 1);
306 }
307 if (getGeneration() == GFX11) {
308 // 15-bit field in units of 64-dword.
309 return (64 * 4) * ((1 << 15) - 1);
310 }
311 // 13-bit field in units of 256-dword.
312 return (256 * 4) * ((1 << 13) - 1);
313 }
314
315 /// Return the number of high bits known to be zero for a frame index.
316 unsigned getKnownHighZeroBitsForFrameIndex() const {
317 return llvm::countl_zero(getMaxWaveScratchSize()) + getWavefrontSizeLog2();
318 }
319
320 int getLDSBankCount() const {
321 return LDSBankCount;
322 }
323
324 unsigned getMaxPrivateElementSize(bool ForBufferRSrc = false) const {
325 return (ForBufferRSrc || !enableFlatScratch()) ? MaxPrivateElementSize : 16;
326 }
327
328 unsigned getConstantBusLimit(unsigned Opcode) const;
329
330 /// Returns if the result of this instruction with a 16-bit result returned in
331 /// a 32-bit register implicitly zeroes the high 16-bits, rather than preserve
332 /// the original value.
333 bool zeroesHigh16BitsOfDest(unsigned Opcode) const;
334
335 bool supportsWGP() const { return getGeneration() >= GFX10; }
336
337 bool hasIntClamp() const {
338 return HasIntClamp;
339 }
340
341 bool hasFP64() const {
342 return FP64;
343 }
344
345 bool hasMIMG_R128() const {
346 return MIMG_R128;
347 }
348
349 bool hasHWFP64() const {
350 return FP64;
351 }
352
353 bool hasHalfRate64Ops() const {
354 return HalfRate64Ops;
355 }
356
357 bool hasFullRate64Ops() const {
358 return FullRate64Ops;
359 }
360
361 bool hasAddr64() const {
362 return (getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS);
363 }
364
365 bool hasFlat() const {
366 return (getGeneration() > AMDGPUSubtarget::SOUTHERN_ISLANDS);
367 }
368
369 // Return true if the target only has the reverse operand versions of VALU
370 // shift instructions (e.g. v_lshrrev_b32, and no v_lshr_b32).
371 bool hasOnlyRevVALUShifts() const {
372 return getGeneration() >= VOLCANIC_ISLANDS;
373 }
374
375 bool hasFractBug() const {
376 return getGeneration() == SOUTHERN_ISLANDS;
377 }
378
379 bool hasBFE() const {
380 return true;
381 }
382
383 bool hasBFI() const {
384 return true;
385 }
386
387 bool hasBFM() const {
388 return hasBFE();
389 }
390
391 bool hasBCNT(unsigned Size) const {
392 return true;
393 }
394
395 bool hasFFBL() const {
396 return true;
397 }
398
399 bool hasFFBH() const {
400 return true;
401 }
402
403 bool hasMed3_16() const {
404 return getGeneration() >= AMDGPUSubtarget::GFX9;
405 }
406
407 bool hasMin3Max3_16() const {
408 return getGeneration() >= AMDGPUSubtarget::GFX9;
409 }
410
411 bool hasFmaMixInsts() const {
412 return HasFmaMixInsts;
413 }
414
415 bool hasCARRY() const {
416 return true;
417 }
418
419 bool hasFMA() const {
420 return FMA;
421 }
422
423 bool hasSwap() const {
424 return GFX9Insts;
425 }
426
427 bool hasScalarPackInsts() const {
428 return GFX9Insts;
429 }
430
431 bool hasScalarMulHiInsts() const {
432 return GFX9Insts;
433 }
434
435 bool hasScalarSubwordLoads() const { return getGeneration() >= GFX12; }
436
437 TrapHandlerAbi getTrapHandlerAbi() const {
438 return isAmdHsaOS() ? TrapHandlerAbi::AMDHSA : TrapHandlerAbi::NONE;
439 }
440
441 bool supportsGetDoorbellID() const {
442 // The S_GETREG DOORBELL_ID is supported by all GFX9 onward targets.
443 return getGeneration() >= GFX9;
444 }
445
446 /// True if the offset field of DS instructions works as expected. On SI, the
447 /// offset uses a 16-bit adder and does not always wrap properly.
448 bool hasUsableDSOffset() const {
449 return getGeneration() >= SEA_ISLANDS;
450 }
451
452 bool unsafeDSOffsetFoldingEnabled() const {
453 return EnableUnsafeDSOffsetFolding;
454 }
455
456 /// Condition output from div_scale is usable.
457 bool hasUsableDivScaleConditionOutput() const {
458 return getGeneration() != SOUTHERN_ISLANDS;
459 }
460
461 /// Extra wait hazard is needed in some cases before
462 /// s_cbranch_vccnz/s_cbranch_vccz.
463 bool hasReadVCCZBug() const {
464 return getGeneration() <= SEA_ISLANDS;
465 }
466
467 /// Writes to VCC_LO/VCC_HI update the VCCZ flag.
468 bool partialVCCWritesUpdateVCCZ() const {
469 return getGeneration() >= GFX10;
470 }
471
472 /// A read of an SGPR by SMRD instruction requires 4 wait states when the SGPR
473 /// was written by a VALU instruction.
474 bool hasSMRDReadVALUDefHazard() const {
475 return getGeneration() == SOUTHERN_ISLANDS;
476 }
477
478 /// A read of an SGPR by a VMEM instruction requires 5 wait states when the
479 /// SGPR was written by a VALU Instruction.
480 bool hasVMEMReadSGPRVALUDefHazard() const {
481 return getGeneration() >= VOLCANIC_ISLANDS;
482 }
483
484 bool hasRFEHazards() const {
485 return getGeneration() >= VOLCANIC_ISLANDS;
486 }
487
488 /// Number of hazard wait states for s_setreg_b32/s_setreg_imm32_b32.
489 unsigned getSetRegWaitStates() const {
490 return getGeneration() <= SEA_ISLANDS ? 1 : 2;
491 }
492
493 bool dumpCode() const {
494 return DumpCode;
495 }
496
497 /// Return the amount of LDS that can be used that will not restrict the
498 /// occupancy lower than WaveCount.
499 unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount,
500 const Function &) const;
501
502 bool supportsMinMaxDenormModes() const {
503 return getGeneration() >= AMDGPUSubtarget::GFX9;
504 }
505
506 /// \returns If target supports S_DENORM_MODE.
507 bool hasDenormModeInst() const {
508 return getGeneration() >= AMDGPUSubtarget::GFX10;
509 }
510
511 bool useFlatForGlobal() const {
512 return FlatForGlobal;
513 }
514
515 /// \returns If target supports ds_read/write_b128 and user enables generation
516 /// of ds_read/write_b128.
517 bool useDS128() const {
518 return CIInsts && EnableDS128;
519 }
520
521 /// \return If target supports ds_read/write_b96/128.
522 bool hasDS96AndDS128() const {
523 return CIInsts;
524 }
525
526 /// Have v_trunc_f64, v_ceil_f64, v_rndne_f64
527 bool haveRoundOpsF64() const {
528 return CIInsts;
529 }
530
531 /// \returns If MUBUF instructions always perform range checking, even for
532 /// buffer resources used for private memory access.
533 bool privateMemoryResourceIsRangeChecked() const {
534 return getGeneration() < AMDGPUSubtarget::GFX9;
535 }
536
/// \returns If target requires PRT strict NULL support (zero result registers
/// for sparse texture support).
539 bool usePRTStrictNull() const {
540 return EnablePRTStrictNull;
541 }
542
543 bool hasAutoWaitcntBeforeBarrier() const {
544 return AutoWaitcntBeforeBarrier;
545 }
546
547 /// \returns true if the target supports backing off of s_barrier instructions
548 /// when an exception is raised.
549 bool supportsBackOffBarrier() const {
550 return BackOffBarrier;
551 }
552
553 bool hasUnalignedBufferAccess() const {
554 return UnalignedBufferAccess;
555 }
556
557 bool hasUnalignedBufferAccessEnabled() const {
558 return UnalignedBufferAccess && UnalignedAccessMode;
559 }
560
561 bool hasUnalignedDSAccess() const {
562 return UnalignedDSAccess;
563 }
564
565 bool hasUnalignedDSAccessEnabled() const {
566 return UnalignedDSAccess && UnalignedAccessMode;
567 }
568
569 bool hasUnalignedScratchAccess() const {
570 return UnalignedScratchAccess;
571 }
572
573 bool hasUnalignedAccessMode() const {
574 return UnalignedAccessMode;
575 }
576
577 bool hasApertureRegs() const {
578 return HasApertureRegs;
579 }
580
581 bool isTrapHandlerEnabled() const {
582 return TrapHandler;
583 }
584
585 bool isXNACKEnabled() const {
586 return TargetID.isXnackOnOrAny();
587 }
588
589 bool isTgSplitEnabled() const {
590 return EnableTgSplit;
591 }
592
593 bool isCuModeEnabled() const {
594 return EnableCuMode;
595 }
596
597 bool hasFlatAddressSpace() const {
598 return FlatAddressSpace;
599 }
600
601 bool hasFlatScrRegister() const {
602 return hasFlatAddressSpace();
603 }
604
605 bool hasFlatInstOffsets() const {
606 return FlatInstOffsets;
607 }
608
609 bool hasFlatGlobalInsts() const {
610 return FlatGlobalInsts;
611 }
612
613 bool hasFlatScratchInsts() const {
614 return FlatScratchInsts;
615 }
616
617 // Check if target supports ST addressing mode with FLAT scratch instructions.
618 // The ST addressing mode means no registers are used, either VGPR or SGPR,
619 // but only immediate offset is swizzled and added to the FLAT scratch base.
620 bool hasFlatScratchSTMode() const {
621 return hasFlatScratchInsts() && (hasGFX10_3Insts() || hasGFX940Insts());
622 }
623
624 bool hasFlatScratchSVSMode() const { return GFX940Insts || GFX11Insts; }
625
626 bool hasScalarFlatScratchInsts() const {
627 return ScalarFlatScratchInsts;
628 }
629
630 bool enableFlatScratch() const {
631 return flatScratchIsArchitected() ||
632 (EnableFlatScratch && hasFlatScratchInsts());
633 }
634
635 bool hasGlobalAddTidInsts() const {
636 return GFX10_BEncoding;
637 }
638
639 bool hasAtomicCSub() const {
640 return GFX10_BEncoding;
641 }
642
643 // BUFFER/FLAT/GLOBAL_ATOMIC_ADD/MIN/MAX_F64
644 bool hasBufferFlatGlobalAtomicsF64() const { return hasGFX90AInsts(); }
645
646 // DS_ADD_F64/DS_ADD_RTN_F64
647 bool hasLdsAtomicAddF64() const { return hasGFX90AInsts(); }
648
649 bool hasMultiDwordFlatScratchAddressing() const {
650 return getGeneration() >= GFX9;
651 }
652
653 bool hasFlatSegmentOffsetBug() const {
654 return HasFlatSegmentOffsetBug;
655 }
656
657 bool hasFlatLgkmVMemCountInOrder() const {
658 return getGeneration() > GFX9;
659 }
660
661 bool hasD16LoadStore() const {
662 return getGeneration() >= GFX9;
663 }
664
665 bool d16PreservesUnusedBits() const {
666 return hasD16LoadStore() && !TargetID.isSramEccOnOrAny();
667 }
668
669 bool hasD16Images() const {
670 return getGeneration() >= VOLCANIC_ISLANDS;
671 }
672
673 /// Return if most LDS instructions have an m0 use that require m0 to be
674 /// initialized.
675 bool ldsRequiresM0Init() const {
676 return getGeneration() < GFX9;
677 }
678
679 // True if the hardware rewinds and replays GWS operations if a wave is
680 // preempted.
681 //
682 // If this is false, a GWS operation requires testing if a nack set the
683 // MEM_VIOL bit, and repeating if so.
684 bool hasGWSAutoReplay() const {
685 return getGeneration() >= GFX9;
686 }
687
688 /// \returns if target has ds_gws_sema_release_all instruction.
689 bool hasGWSSemaReleaseAll() const {
690 return CIInsts;
691 }
692
693 /// \returns true if the target has integer add/sub instructions that do not
694 /// produce a carry-out. This includes v_add_[iu]32, v_sub_[iu]32,
695 /// v_add_[iu]16, and v_sub_[iu]16, all of which support the clamp modifier
696 /// for saturation.
697 bool hasAddNoCarry() const {
698 return AddNoCarryInsts;
699 }
700
701 bool hasScalarAddSub64() const { return getGeneration() >= GFX12; }
702
703 bool hasScalarSMulU64() const { return getGeneration() >= GFX12; }
704
705 bool hasUnpackedD16VMem() const {
706 return HasUnpackedD16VMem;
707 }
708
709 // Covers VS/PS/CS graphics shaders
710 bool isMesaGfxShader(const Function &F) const {
711 return isMesa3DOS() && AMDGPU::isShader(CC: F.getCallingConv());
712 }
713
714 bool hasMad64_32() const {
715 return getGeneration() >= SEA_ISLANDS;
716 }
717
718 bool hasSDWAOmod() const {
719 return HasSDWAOmod;
720 }
721
722 bool hasSDWAScalar() const {
723 return HasSDWAScalar;
724 }
725
726 bool hasSDWASdst() const {
727 return HasSDWASdst;
728 }
729
730 bool hasSDWAMac() const {
731 return HasSDWAMac;
732 }
733
734 bool hasSDWAOutModsVOPC() const {
735 return HasSDWAOutModsVOPC;
736 }
737
738 bool hasDLInsts() const {
739 return HasDLInsts;
740 }
741
742 bool hasFmacF64Inst() const { return HasFmacF64Inst; }
743
744 bool hasDot1Insts() const {
745 return HasDot1Insts;
746 }
747
748 bool hasDot2Insts() const {
749 return HasDot2Insts;
750 }
751
752 bool hasDot3Insts() const {
753 return HasDot3Insts;
754 }
755
756 bool hasDot4Insts() const {
757 return HasDot4Insts;
758 }
759
760 bool hasDot5Insts() const {
761 return HasDot5Insts;
762 }
763
764 bool hasDot6Insts() const {
765 return HasDot6Insts;
766 }
767
768 bool hasDot7Insts() const {
769 return HasDot7Insts;
770 }
771
772 bool hasDot8Insts() const {
773 return HasDot8Insts;
774 }
775
776 bool hasDot9Insts() const {
777 return HasDot9Insts;
778 }
779
780 bool hasDot10Insts() const {
781 return HasDot10Insts;
782 }
783
784 bool hasMAIInsts() const {
785 return HasMAIInsts;
786 }
787
788 bool hasFP8Insts() const {
789 return HasFP8Insts;
790 }
791
792 bool hasFP8ConversionInsts() const { return HasFP8ConversionInsts; }
793
794 bool hasPkFmacF16Inst() const {
795 return HasPkFmacF16Inst;
796 }
797
798 bool hasAtomicDsPkAdd16Insts() const { return HasAtomicDsPkAdd16Insts; }
799
800 bool hasAtomicFlatPkAdd16Insts() const { return HasAtomicFlatPkAdd16Insts; }
801
802 bool hasAtomicFaddInsts() const {
803 return HasAtomicFaddRtnInsts || HasAtomicFaddNoRtnInsts;
804 }
805
806 bool hasAtomicFaddRtnInsts() const { return HasAtomicFaddRtnInsts; }
807
808 bool hasAtomicFaddNoRtnInsts() const { return HasAtomicFaddNoRtnInsts; }
809
810 bool hasAtomicBufferGlobalPkAddF16NoRtnInsts() const {
811 return HasAtomicBufferGlobalPkAddF16NoRtnInsts;
812 }
813
814 bool hasAtomicBufferGlobalPkAddF16Insts() const {
815 return HasAtomicBufferGlobalPkAddF16Insts;
816 }
817
818 bool hasAtomicGlobalPkAddBF16Inst() const {
819 return HasAtomicGlobalPkAddBF16Inst;
820 }
821
822 bool hasFlatAtomicFaddF32Inst() const { return HasFlatAtomicFaddF32Inst; }
823
824 bool hasDefaultComponentZero() const { return HasDefaultComponentZero; }
825
826 bool hasDefaultComponentBroadcast() const {
827 return HasDefaultComponentBroadcast;
828 }
829
830 bool hasNoSdstCMPX() const {
831 return HasNoSdstCMPX;
832 }
833
834 bool hasVscnt() const {
835 return HasVscnt;
836 }
837
838 bool hasGetWaveIdInst() const {
839 return HasGetWaveIdInst;
840 }
841
842 bool hasSMemTimeInst() const {
843 return HasSMemTimeInst;
844 }
845
846 bool hasShaderCyclesRegister() const {
847 return HasShaderCyclesRegister;
848 }
849
850 bool hasShaderCyclesHiLoRegisters() const {
851 return HasShaderCyclesHiLoRegisters;
852 }
853
854 bool hasVOP3Literal() const {
855 return HasVOP3Literal;
856 }
857
858 bool hasNoDataDepHazard() const {
859 return HasNoDataDepHazard;
860 }
861
862 bool vmemWriteNeedsExpWaitcnt() const {
863 return getGeneration() < SEA_ISLANDS;
864 }
865
866 bool hasInstPrefetch() const {
867 return getGeneration() == GFX10 || getGeneration() == GFX11;
868 }
869
870 bool hasPrefetch() const { return GFX12Insts; }
871
872 // Has s_cmpk_* instructions.
873 bool hasSCmpK() const { return getGeneration() < GFX12; }
874
875 // Scratch is allocated in 256 dword per wave blocks for the entire
876 // wavefront. When viewed from the perspective of an arbitrary workitem, this
877 // is 4-byte aligned.
878 //
879 // Only 4-byte alignment is really needed to access anything. Transformations
880 // on the pointer value itself may rely on the alignment / known low bits of
881 // the pointer. Set this to something above the minimum to avoid needing
882 // dynamic realignment in common cases.
883 Align getStackAlignment() const { return Align(16); }
884
885 bool enableMachineScheduler() const override {
886 return true;
887 }
888
889 bool useAA() const override;
890
891 bool enableSubRegLiveness() const override {
892 return true;
893 }
894
895 void setScalarizeGlobalBehavior(bool b) { ScalarizeGlobal = b; }
896 bool getScalarizeGlobalBehavior() const { return ScalarizeGlobal; }
897
898 // static wrappers
899 static bool hasHalfRate64Ops(const TargetSubtargetInfo &STI);
900
901 // XXX - Why is this here if it isn't in the default pass set?
902 bool enableEarlyIfConversion() const override {
903 return true;
904 }
905
906 void overrideSchedPolicy(MachineSchedPolicy &Policy,
907 unsigned NumRegionInstrs) const override;
908
909 unsigned getMaxNumUserSGPRs() const {
910 return AMDGPU::getMaxNumUserSGPRs(*this);
911 }
912
913 bool hasSMemRealTime() const {
914 return HasSMemRealTime;
915 }
916
917 bool hasMovrel() const {
918 return HasMovrel;
919 }
920
921 bool hasVGPRIndexMode() const {
922 return HasVGPRIndexMode;
923 }
924
925 bool useVGPRIndexMode() const;
926
927 bool hasScalarCompareEq64() const {
928 return getGeneration() >= VOLCANIC_ISLANDS;
929 }
930
931 bool hasScalarDwordx3Loads() const { return HasScalarDwordx3Loads; }
932
933 bool hasScalarStores() const {
934 return HasScalarStores;
935 }
936
937 bool hasScalarAtomics() const {
938 return HasScalarAtomics;
939 }
940
941 bool hasLDSFPAtomicAdd() const { return GFX8Insts; }
942
943 /// \returns true if the subtarget has the v_permlanex16_b32 instruction.
944 bool hasPermLaneX16() const { return getGeneration() >= GFX10; }
945
946 /// \returns true if the subtarget has the v_permlane64_b32 instruction.
947 bool hasPermLane64() const { return getGeneration() >= GFX11; }
948
949 bool hasDPP() const {
950 return HasDPP;
951 }
952
953 bool hasDPPBroadcasts() const {
954 return HasDPP && getGeneration() < GFX10;
955 }
956
957 bool hasDPPWavefrontShifts() const {
958 return HasDPP && getGeneration() < GFX10;
959 }
960
961 bool hasDPP8() const {
962 return HasDPP8;
963 }
964
965 bool hasDPALU_DPP() const {
966 return HasDPALU_DPP;
967 }
968
969 bool hasDPPSrc1SGPR() const { return HasDPPSrc1SGPR; }
970
971 bool hasPackedFP32Ops() const {
972 return HasPackedFP32Ops;
973 }
974
975 // Has V_PK_MOV_B32 opcode
976 bool hasPkMovB32() const {
977 return GFX90AInsts;
978 }
979
980 bool hasFmaakFmamkF32Insts() const {
981 return getGeneration() >= GFX10 || hasGFX940Insts();
982 }
983
984 bool hasImageInsts() const {
985 return HasImageInsts;
986 }
987
988 bool hasExtendedImageInsts() const {
989 return HasExtendedImageInsts;
990 }
991
992 bool hasR128A16() const {
993 return HasR128A16;
994 }
995
996 bool hasA16() const { return HasA16; }
997
998 bool hasG16() const { return HasG16; }
999
1000 bool hasOffset3fBug() const {
1001 return HasOffset3fBug;
1002 }
1003
1004 bool hasImageStoreD16Bug() const { return HasImageStoreD16Bug; }
1005
1006 bool hasImageGather4D16Bug() const { return HasImageGather4D16Bug; }
1007
1008 bool hasMADIntraFwdBug() const { return HasMADIntraFwdBug; }
1009
1010 bool hasMSAALoadDstSelBug() const { return HasMSAALoadDstSelBug; }
1011
1012 bool hasNSAEncoding() const { return HasNSAEncoding; }
1013
1014 bool hasNonNSAEncoding() const { return getGeneration() < GFX12; }
1015
1016 bool hasPartialNSAEncoding() const { return HasPartialNSAEncoding; }
1017
1018 unsigned getNSAMaxSize(bool HasSampler = false) const {
1019 return AMDGPU::getNSAMaxSize(*this, HasSampler);
1020 }
1021
1022 bool hasGFX10_AEncoding() const {
1023 return GFX10_AEncoding;
1024 }
1025
1026 bool hasGFX10_BEncoding() const {
1027 return GFX10_BEncoding;
1028 }
1029
1030 bool hasGFX10_3Insts() const {
1031 return GFX10_3Insts;
1032 }
1033
1034 bool hasMadF16() const;
1035
1036 bool hasMovB64() const { return GFX940Insts; }
1037
1038 bool hasLshlAddB64() const { return GFX940Insts; }
1039
1040 bool enableSIScheduler() const {
1041 return EnableSIScheduler;
1042 }
1043
1044 bool loadStoreOptEnabled() const {
1045 return EnableLoadStoreOpt;
1046 }
1047
1048 bool hasSGPRInitBug() const {
1049 return SGPRInitBug;
1050 }
1051
1052 bool hasUserSGPRInit16Bug() const {
1053 return UserSGPRInit16Bug && isWave32();
1054 }
1055
1056 bool hasNegativeScratchOffsetBug() const { return NegativeScratchOffsetBug; }
1057
1058 bool hasNegativeUnalignedScratchOffsetBug() const {
1059 return NegativeUnalignedScratchOffsetBug;
1060 }
1061
1062 bool hasMFMAInlineLiteralBug() const {
1063 return HasMFMAInlineLiteralBug;
1064 }
1065
1066 bool has12DWordStoreHazard() const {
1067 return getGeneration() != AMDGPUSubtarget::SOUTHERN_ISLANDS;
1068 }
1069
1070 // \returns true if the subtarget supports DWORDX3 load/store instructions.
1071 bool hasDwordx3LoadStores() const {
1072 return CIInsts;
1073 }
1074
1075 bool hasReadM0MovRelInterpHazard() const {
1076 return getGeneration() == AMDGPUSubtarget::GFX9;
1077 }
1078
1079 bool hasReadM0SendMsgHazard() const {
1080 return getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
1081 getGeneration() <= AMDGPUSubtarget::GFX9;
1082 }
1083
1084 bool hasReadM0LdsDmaHazard() const {
1085 return getGeneration() == AMDGPUSubtarget::GFX9;
1086 }
1087
1088 bool hasReadM0LdsDirectHazard() const {
1089 return getGeneration() == AMDGPUSubtarget::GFX9;
1090 }
1091
1092 bool hasVcmpxPermlaneHazard() const {
1093 return HasVcmpxPermlaneHazard;
1094 }
1095
1096 bool hasVMEMtoScalarWriteHazard() const {
1097 return HasVMEMtoScalarWriteHazard;
1098 }
1099
1100 bool hasSMEMtoVectorWriteHazard() const {
1101 return HasSMEMtoVectorWriteHazard;
1102 }
1103
1104 bool hasLDSMisalignedBug() const {
1105 return LDSMisalignedBug && !EnableCuMode;
1106 }
1107
1108 bool hasInstFwdPrefetchBug() const {
1109 return HasInstFwdPrefetchBug;
1110 }
1111
1112 bool hasVcmpxExecWARHazard() const {
1113 return HasVcmpxExecWARHazard;
1114 }
1115
1116 bool hasLdsBranchVmemWARHazard() const {
1117 return HasLdsBranchVmemWARHazard;
1118 }
1119
// The shift amount of a 64-bit shift cannot be the highest allocated register
// if it is also at the end of the allocation block.
1122 bool hasShift64HighRegBug() const {
1123 return GFX90AInsts && !GFX940Insts;
1124 }
1125
1126 // Has one cycle hazard on transcendental instruction feeding a
1127 // non transcendental VALU.
1128 bool hasTransForwardingHazard() const { return GFX940Insts; }
1129
1130 // Has one cycle hazard on a VALU instruction partially writing dst with
1131 // a shift of result bits feeding another VALU instruction.
1132 bool hasDstSelForwardingHazard() const { return GFX940Insts; }
1133
1134 // Cannot use op_sel with v_dot instructions.
1135 bool hasDOTOpSelHazard() const { return GFX940Insts || GFX11Insts; }
1136
// Does not have HW interlocks for VALU writing and then reading SGPRs.
1138 bool hasVDecCoExecHazard() const {
1139 return GFX940Insts;
1140 }
1141
1142 bool hasNSAtoVMEMBug() const {
1143 return HasNSAtoVMEMBug;
1144 }
1145
1146 bool hasNSAClauseBug() const { return HasNSAClauseBug; }
1147
1148 bool hasHardClauses() const { return getGeneration() >= GFX10; }
1149
1150 bool hasGFX90AInsts() const { return GFX90AInsts; }
1151
1152 bool hasFPAtomicToDenormModeHazard() const {
1153 return getGeneration() == GFX10;
1154 }
1155
1156 bool hasVOP3DPP() const { return getGeneration() >= GFX11; }
1157
1158 bool hasLdsDirect() const { return getGeneration() >= GFX11; }
1159
1160 bool hasLdsWaitVMSRC() const { return getGeneration() >= GFX12; }
1161
1162 bool hasVALUPartialForwardingHazard() const {
1163 return getGeneration() == GFX11;
1164 }
1165
1166 bool hasVALUTransUseHazard() const { return HasVALUTransUseHazard; }
1167
1168 bool hasForceStoreSC0SC1() const { return HasForceStoreSC0SC1; }
1169
1170 bool requiresCodeObjectV6() const { return RequiresCOV6; }
1171
1172 bool hasVALUMaskWriteHazard() const { return getGeneration() == GFX11; }
1173
1174 /// Return if operations acting on VGPR tuples require even alignment.
1175 bool needsAlignedVGPRs() const { return GFX90AInsts; }
1176
1177 /// Return true if the target has the S_PACK_HL_B32_B16 instruction.
1178 bool hasSPackHL() const { return GFX11Insts; }
1179
1180 /// Return true if the target's EXP instruction has the COMPR flag, which
1181 /// affects the meaning of the EN (enable) bits.
1182 bool hasCompressedExport() const { return !GFX11Insts; }
1183
1184 /// Return true if the target's EXP instruction supports the NULL export
1185 /// target.
1186 bool hasNullExportTarget() const { return !GFX11Insts; }
1187
1188 bool hasGFX11FullVGPRs() const { return HasGFX11FullVGPRs; }
1189
1190 bool hasVOPDInsts() const { return HasVOPDInsts; }
1191
1192 bool hasFlatScratchSVSSwizzleBug() const { return getGeneration() == GFX11; }
1193
1194 /// Return true if the target has the S_DELAY_ALU instruction.
1195 bool hasDelayAlu() const { return GFX11Insts; }
1196
1197 bool hasPackedTID() const { return HasPackedTID; }
1198
1199 // GFX940 is a derivation to GFX90A. hasGFX940Insts() being true implies that
1200 // hasGFX90AInsts is also true.
1201 bool hasGFX940Insts() const { return GFX940Insts; }
1202
1203 bool hasSALUFloatInsts() const { return HasSALUFloatInsts; }
1204
1205 bool hasVGPRSingleUseHintInsts() const { return HasVGPRSingleUseHintInsts; }
1206
1207 bool hasPseudoScalarTrans() const { return HasPseudoScalarTrans; }
1208
1209 bool hasRestrictedSOffset() const { return HasRestrictedSOffset; }
1210
1211 /// \returns true if the target uses LOADcnt/SAMPLEcnt/BVHcnt, DScnt/KMcnt
1212 /// and STOREcnt rather than VMcnt, LGKMcnt and VScnt respectively.
1213 bool hasExtendedWaitCounts() const { return getGeneration() >= GFX12; }
1214
  /// \returns the maximum number of waves per SIMD for kernels using \p SGPRs
  /// SGPRs. Only declared here; defined out of line.
  unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const;
1218
  /// \returns the maximum number of waves per SIMD for kernels using \p VGPRs
  /// VGPRs. Only declared here; defined out of line.
  unsigned getOccupancyWithNumVGPRs(unsigned VGPRs) const;
1222
  /// \returns the occupancy for function \p F. The LDS usage \p LDSSize and
  /// the register counts \p NumSGPRs / \p NumVGPRs are taken into account if
  /// provided; each defaults to 0.
  /// Note, occupancy can be affected by the scratch allocation as well, but
  /// we do not have enough information to compute it.
  unsigned computeOccupancy(const Function &F, unsigned LDSSize = 0,
                            unsigned NumSGPRs = 0, unsigned NumVGPRs = 0) const;
1229
1230 /// \returns true if the flat_scratch register should be initialized with the
1231 /// pointer to the wave's scratch memory rather than a size and offset.
1232 bool flatScratchIsPointer() const {
1233 return getGeneration() >= AMDGPUSubtarget::GFX9;
1234 }
1235
1236 /// \returns true if the flat_scratch register is initialized by the HW.
1237 /// In this case it is readonly.
1238 bool flatScratchIsArchitected() const { return HasArchitectedFlatScratch; }
1239
1240 /// \returns true if the architected SGPRs are enabled.
1241 bool hasArchitectedSGPRs() const { return HasArchitectedSGPRs; }
1242
1243 /// \returns true if Global Data Share is supported.
1244 bool hasGDS() const { return HasGDS; }
1245
1246 /// \returns true if Global Wave Sync is supported.
1247 bool hasGWS() const { return HasGWS; }
1248
1249 /// \returns true if the machine has merged shaders in which s0-s7 are
1250 /// reserved by the hardware and user SGPRs start at s8
1251 bool hasMergedShaders() const {
1252 return getGeneration() >= GFX9;
1253 }
1254
1255 // \returns true if the target supports the pre-NGG legacy geometry path.
1256 bool hasLegacyGeometry() const { return getGeneration() < GFX11; }
1257
1258 // \returns true if preloading kernel arguments is supported.
1259 bool hasKernargPreload() const { return KernargPreload; }
1260
1261 // \returns true if the target has split barriers feature
1262 bool hasSplitBarriers() const { return getGeneration() >= GFX12; }
1263
1264 // \returns true if FP8/BF8 VOP1 form of conversion to F32 is unreliable.
1265 bool hasCvtFP8VOP1Bug() const { return true; }
1266
1267 // \returns true if CSUB (a.k.a. SUB_CLAMP on GFX12) atomics support a
1268 // no-return form.
1269 bool hasAtomicCSubNoRtnInsts() const { return HasAtomicCSubNoRtnInsts; }
1270
1271 // \returns true if the target has DX10_CLAMP kernel descriptor mode bit
1272 bool hasDX10ClampMode() const { return getGeneration() < GFX12; }
1273
1274 // \returns true if the target has IEEE kernel descriptor mode bit
1275 bool hasIEEEMode() const { return getGeneration() < GFX12; }
1276
1277 // \returns true if the target has IEEE fminimum/fmaximum instructions
1278 bool hasIEEEMinMax() const { return getGeneration() >= GFX12; }
1279
1280 // \returns true if the target has WG_RR_MODE kernel descriptor mode bit
1281 bool hasRrWGMode() const { return getGeneration() >= GFX12; }
1282
1283 /// \returns true if VADDR and SADDR fields in VSCRATCH can use negative
1284 /// values.
1285 bool hasSignedScratchOffsets() const { return getGeneration() >= GFX12; }
1286
1287 // \returns true if S_GETPC_B64 zero-extends the result from 48 bits instead
1288 // of sign-extending.
1289 bool hasGetPCZeroExtension() const { return GFX12Insts; }
1290
1291 /// \returns SGPR allocation granularity supported by the subtarget.
1292 unsigned getSGPRAllocGranule() const {
1293 return AMDGPU::IsaInfo::getSGPRAllocGranule(this);
1294 }
1295
1296 /// \returns SGPR encoding granularity supported by the subtarget.
1297 unsigned getSGPREncodingGranule() const {
1298 return AMDGPU::IsaInfo::getSGPREncodingGranule(this);
1299 }
1300
1301 /// \returns Total number of SGPRs supported by the subtarget.
1302 unsigned getTotalNumSGPRs() const {
1303 return AMDGPU::IsaInfo::getTotalNumSGPRs(this);
1304 }
1305
1306 /// \returns Addressable number of SGPRs supported by the subtarget.
1307 unsigned getAddressableNumSGPRs() const {
1308 return AMDGPU::IsaInfo::getAddressableNumSGPRs(this);
1309 }
1310
1311 /// \returns Minimum number of SGPRs that meets the given number of waves per
1312 /// execution unit requirement supported by the subtarget.
1313 unsigned getMinNumSGPRs(unsigned WavesPerEU) const {
1314 return AMDGPU::IsaInfo::getMinNumSGPRs(this, WavesPerEU);
1315 }
1316
1317 /// \returns Maximum number of SGPRs that meets the given number of waves per
1318 /// execution unit requirement supported by the subtarget.
1319 unsigned getMaxNumSGPRs(unsigned WavesPerEU, bool Addressable) const {
1320 return AMDGPU::IsaInfo::getMaxNumSGPRs(this, WavesPerEU, Addressable);
1321 }
1322
  /// \returns the reserved number of SGPRs. This is the common utility
  /// function called by the MachineFunction and Function variants of
  /// getReservedNumSGPRs. Only declared here; defined out of line.
  unsigned getBaseReservedNumSGPRs(const bool HasFlatScratch) const;
  /// \returns the reserved number of SGPRs for the given machine function
  /// \p MF. Only declared here; defined out of line.
  unsigned getReservedNumSGPRs(const MachineFunction &MF) const;
1329
  /// \returns the reserved number of SGPRs for the given function \p F.
  /// Only declared here; defined out of line.
  unsigned getReservedNumSGPRs(const Function &F) const;
1332
  /// \returns the maximum number of SGPRs. This is the common utility function
  /// called by the MachineFunction and Function variants of getMaxNumSGPRs.
  /// NOTE(review): \p WavesPerEU is presumably a (min, max) pair; confirm
  /// against the out-of-line definition.
  unsigned getBaseMaxNumSGPRs(const Function &F,
                              std::pair<unsigned, unsigned> WavesPerEU,
                              unsigned PreloadedSGPRs,
                              unsigned ReservedNumSGPRs) const;
1340
  /// \returns Maximum number of SGPRs that meets the number of waves per
  /// execution unit requirement for machine function \p MF, or the number of
  /// SGPRs explicitly requested using the "amdgpu-num-sgpr" attribute attached
  /// to \p MF.
  ///
  /// \returns Value that meets the number of waves per execution unit
  /// requirement if the explicitly requested value cannot be converted to an
  /// integer, violates the subtarget's specifications, or does not meet the
  /// number of waves per execution unit requirement.
  unsigned getMaxNumSGPRs(const MachineFunction &MF) const;
1350
  /// \returns Maximum number of SGPRs that meets the number of waves per
  /// execution unit requirement for function \p F, or the number of SGPRs
  /// explicitly requested using the "amdgpu-num-sgpr" attribute attached to
  /// \p F.
  ///
  /// \returns Value that meets the number of waves per execution unit
  /// requirement if the explicitly requested value cannot be converted to an
  /// integer, violates the subtarget's specifications, or does not meet the
  /// number of waves per execution unit requirement.
  unsigned getMaxNumSGPRs(const Function &F) const;
1360
1361 /// \returns VGPR allocation granularity supported by the subtarget.
1362 unsigned getVGPRAllocGranule() const {
1363 return AMDGPU::IsaInfo::getVGPRAllocGranule(this);
1364 }
1365
1366 /// \returns VGPR encoding granularity supported by the subtarget.
1367 unsigned getVGPREncodingGranule() const {
1368 return AMDGPU::IsaInfo::getVGPREncodingGranule(this);
1369 }
1370
1371 /// \returns Total number of VGPRs supported by the subtarget.
1372 unsigned getTotalNumVGPRs() const {
1373 return AMDGPU::IsaInfo::getTotalNumVGPRs(this);
1374 }
1375
1376 /// \returns Addressable number of VGPRs supported by the subtarget.
1377 unsigned getAddressableNumVGPRs() const {
1378 return AMDGPU::IsaInfo::getAddressableNumVGPRs(this);
1379 }
1380
1381 /// \returns the minimum number of VGPRs that will prevent achieving more than
1382 /// the specified number of waves \p WavesPerEU.
1383 unsigned getMinNumVGPRs(unsigned WavesPerEU) const {
1384 return AMDGPU::IsaInfo::getMinNumVGPRs(this, WavesPerEU);
1385 }
1386
1387 /// \returns the maximum number of VGPRs that can be used and still achieved
1388 /// at least the specified number of waves \p WavesPerEU.
1389 unsigned getMaxNumVGPRs(unsigned WavesPerEU) const {
1390 return AMDGPU::IsaInfo::getMaxNumVGPRs(this, WavesPerEU);
1391 }
1392
  /// \returns the maximum number of VGPRs. This is the common utility function
  /// called by the MachineFunction and Function variants of getMaxNumVGPRs.
  /// NOTE(review): \p WavesPerEU is presumably a (min, max) pair; confirm
  /// against the out-of-line definition.
  unsigned getBaseMaxNumVGPRs(const Function &F,
                              std::pair<unsigned, unsigned> WavesPerEU) const;
  /// \returns Maximum number of VGPRs that meets the number of waves per
  /// execution unit requirement for function \p F, or the number of VGPRs
  /// explicitly requested using the "amdgpu-num-vgpr" attribute attached to
  /// \p F.
  ///
  /// \returns Value that meets the number of waves per execution unit
  /// requirement if the explicitly requested value cannot be converted to an
  /// integer, violates the subtarget's specifications, or does not meet the
  /// number of waves per execution unit requirement.
  unsigned getMaxNumVGPRs(const Function &F) const;
1406
1407 unsigned getMaxNumAGPRs(const Function &F) const {
1408 return getMaxNumVGPRs(F);
1409 }
1410
  /// \returns Maximum number of VGPRs that meets the number of waves per
  /// execution unit requirement for machine function \p MF, or the number of
  /// VGPRs explicitly requested using the "amdgpu-num-vgpr" attribute attached
  /// to \p MF.
  ///
  /// \returns Value that meets the number of waves per execution unit
  /// requirement if the explicitly requested value cannot be converted to an
  /// integer, violates the subtarget's specifications, or does not meet the
  /// number of waves per execution unit requirement.
  unsigned getMaxNumVGPRs(const MachineFunction &MF) const;
1420
  /// Overrides the base-class hook of the same name. Only declared here;
  /// defined out of line.
  /// NOTE(review): presumably appends this subtarget's post-RA scheduling DAG
  /// mutations to \p Mutations; confirm against the definition.
  void getPostRAMutations(
      std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations)
      const override;
1424
  /// \returns a newly created ScheduleDAGMutation, owned by the caller. Only
  /// declared here; defined out of line.
  /// NOTE(review): judging by the name this is the "fill MFMA shadow"
  /// mutation built on \p TII; confirm against the definition.
  std::unique_ptr<ScheduleDAGMutation>
  createFillMFMAShadowMutation(const TargetInstrInfo *TII) const;
1427
1428 bool isWave32() const {
1429 return getWavefrontSize() == 32;
1430 }
1431
1432 bool isWave64() const {
1433 return getWavefrontSize() == 64;
1434 }
1435
1436 const TargetRegisterClass *getBoolRC() const {
1437 return getRegisterInfo()->getBoolRC();
1438 }
1439
1440 /// \returns Maximum number of work groups per compute unit supported by the
1441 /// subtarget and limited by given \p FlatWorkGroupSize.
1442 unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const override {
1443 return AMDGPU::IsaInfo::getMaxWorkGroupsPerCU(this, FlatWorkGroupSize);
1444 }
1445
1446 /// \returns Minimum flat work group size supported by the subtarget.
1447 unsigned getMinFlatWorkGroupSize() const override {
1448 return AMDGPU::IsaInfo::getMinFlatWorkGroupSize(this);
1449 }
1450
1451 /// \returns Maximum flat work group size supported by the subtarget.
1452 unsigned getMaxFlatWorkGroupSize() const override {
1453 return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize(this);
1454 }
1455
1456 /// \returns Number of waves per execution unit required to support the given
1457 /// \p FlatWorkGroupSize.
1458 unsigned
1459 getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const override {
1460 return AMDGPU::IsaInfo::getWavesPerEUForWorkGroup(this, FlatWorkGroupSize);
1461 }
1462
1463 /// \returns Minimum number of waves per execution unit supported by the
1464 /// subtarget.
1465 unsigned getMinWavesPerEU() const override {
1466 return AMDGPU::IsaInfo::getMinWavesPerEU(this);
1467 }
1468
  /// Overrides the base-class scheduling hook of the same name. Only declared
  /// here; defined out of line.
  /// NOTE(review): presumably adjusts \p Dep, the dependency between operand
  /// \p DefOpIdx of \p Def and operand \p UseOpIdx of \p Use; confirm against
  /// the definition.
  void adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx,
                             SDep &Dep) const override;
1471
1472 // \returns true if it's beneficial on this subtarget for the scheduler to
1473 // cluster stores as well as loads.
1474 bool shouldClusterStores() const { return getGeneration() >= GFX11; }
1475
  /// \returns the number of address arguments from which to enable MIMG NSA
  /// on supported architectures, for machine function \p MF. Only declared
  /// here; defined out of line.
  unsigned getNSAThreshold(const MachineFunction &MF) const;
1479
1480 // \returns true if the subtarget has a hazard requiring an "s_nop 0"
1481 // instruction before "s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)".
1482 bool requiresNopBeforeDeallocVGPRs() const {
1483 // Currently all targets that support the dealloc VGPRs message also require
1484 // the nop.
1485 return true;
1486 }
1487};
1488
1489class GCNUserSGPRUsageInfo {
1490public:
1491 bool hasImplicitBufferPtr() const { return ImplicitBufferPtr; }
1492
1493 bool hasPrivateSegmentBuffer() const { return PrivateSegmentBuffer; }
1494
1495 bool hasDispatchPtr() const { return DispatchPtr; }
1496
1497 bool hasQueuePtr() const { return QueuePtr; }
1498
1499 bool hasKernargSegmentPtr() const { return KernargSegmentPtr; }
1500
1501 bool hasDispatchID() const { return DispatchID; }
1502
1503 bool hasFlatScratchInit() const { return FlatScratchInit; }
1504
1505 unsigned getNumKernargPreloadSGPRs() const { return NumKernargPreloadSGPRs; }
1506
1507 unsigned getNumUsedUserSGPRs() const { return NumUsedUserSGPRs; }
1508
1509 unsigned getNumFreeUserSGPRs();
1510
1511 void allocKernargPreloadSGPRs(unsigned NumSGPRs);
1512
1513 enum UserSGPRID : unsigned {
1514 ImplicitBufferPtrID = 0,
1515 PrivateSegmentBufferID = 1,
1516 DispatchPtrID = 2,
1517 QueuePtrID = 3,
1518 KernargSegmentPtrID = 4,
1519 DispatchIdID = 5,
1520 FlatScratchInitID = 6,
1521 PrivateSegmentSizeID = 7
1522 };
1523
1524 // Returns the size in number of SGPRs for preload user SGPR field.
1525 static unsigned getNumUserSGPRForField(UserSGPRID ID) {
1526 switch (ID) {
1527 case ImplicitBufferPtrID:
1528 return 2;
1529 case PrivateSegmentBufferID:
1530 return 4;
1531 case DispatchPtrID:
1532 return 2;
1533 case QueuePtrID:
1534 return 2;
1535 case KernargSegmentPtrID:
1536 return 2;
1537 case DispatchIdID:
1538 return 2;
1539 case FlatScratchInitID:
1540 return 2;
1541 case PrivateSegmentSizeID:
1542 return 1;
1543 }
1544 llvm_unreachable("Unknown UserSGPRID.");
1545 }
1546
1547 GCNUserSGPRUsageInfo(const Function &F, const GCNSubtarget &ST);
1548
1549private:
1550 const GCNSubtarget &ST;
1551
1552 // Private memory buffer
1553 // Compute directly in sgpr[0:1]
1554 // Other shaders indirect 64-bits at sgpr[0:1]
1555 bool ImplicitBufferPtr = false;
1556
1557 bool PrivateSegmentBuffer = false;
1558
1559 bool DispatchPtr = false;
1560
1561 bool QueuePtr = false;
1562
1563 bool KernargSegmentPtr = false;
1564
1565 bool DispatchID = false;
1566
1567 bool FlatScratchInit = false;
1568
1569 unsigned NumKernargPreloadSGPRs = 0;
1570
1571 unsigned NumUsedUserSGPRs = 0;
1572};
1573
1574} // end namespace llvm
1575
1576#endif // LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H
1577

// Source: llvm/lib/Target/AMDGPU/GCNSubtarget.h