//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUTargetMachine.h"
#include "GCNSubtarget.h"
#include "R600Subtarget.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#undef AMDGPUSubtarget

static cl::opt<bool> EnablePowerSched(
    "amdgpu-enable-power-sched",
    cl::desc("Enable scheduling to minimize mAI power bursts"),
    cl::init(false));

static cl::opt<bool> EnableVGPRIndexMode(
    "amdgpu-vgpr-index-mode",
    cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
    cl::init(false));

static cl::opt<bool> UseAA("amdgpu-use-aa-in-codegen",
                           cl::desc("Enable the use of AA during codegen."),
                           cl::init(true));

static cl::opt<unsigned>
    NSAThreshold("amdgpu-nsa-threshold",
                 cl::desc("Number of addresses from which to enable MIMG NSA."),
                 cl::init(3), cl::Hidden);

GCNSubtarget::~GCNSubtarget() = default;

GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled

  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,");

  // Turn on features that HSA ABI requires. Also turn on FlatForGlobal by default
  if (isAmdHsaOS())
    FullFS += "+flat-for-global,+unaligned-access-mode,+trap-handler,";

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS

  // Disable mutually exclusive bits.
  if (FS.contains_insensitive("+wavefrontsize")) {
    if (!FS.contains_insensitive("wavefrontsize16"))
      FullFS += "-wavefrontsize16,";
    if (!FS.contains_insensitive("wavefrontsize32"))
      FullFS += "-wavefrontsize32,";
    if (!FS.contains_insensitive("wavefrontsize64"))
      FullFS += "-wavefrontsize64,";
  }

  FullFS += FS;

  ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);

  // Implement the "generic" processors, which act as the default when no
  // generation features are enabled (e.g. for -mcpu=''). HSA OS defaults to
  // the first amdgcn target that supports flat addressing. Other OSes default
  // to the first amdgcn target.
  if (Gen == AMDGPUSubtarget::INVALID) {
    Gen = TT.getOS() == Triple::AMDHSA ? AMDGPUSubtarget::SEA_ISLANDS
                                       : AMDGPUSubtarget::SOUTHERN_ISLANDS;
  }

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Targets must support 64-bit offsets for MUBUF instructions and/or flat
  // operations; otherwise they cannot access a 64-bit global address space.
  assert(hasAddr64() || hasFlat());
  // Unless +-flat-for-global is specified, turn on FlatForGlobal for targets
  // that do not support ADDR64 variants of MUBUF instructions. Such targets
  // cannot use a 64 bit offset with a MUBUF instruction to access the global
  // address space
  if (!hasAddr64() && !FS.contains("flat-for-global") && !FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = true;
  }
  // Unless +-flat-for-global is specified, use MUBUF instructions for global
  // address space access if flat operations are not available.
  if (!hasFlat() && !FS.contains("flat-for-global") && FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = false;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  AddressableLocalMemorySize = LocalMemorySize;

  if (AMDGPU::isGFX10Plus(*this) &&
      !getFeatureBits().test(AMDGPU::FeatureCuMode))
    LocalMemorySize *= 2;

  // Don't crash on invalid devices.
  if (WavefrontSizeLog2 == 0)
    WavefrontSizeLog2 = 5;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;
  HasSMulHi = getGeneration() >= AMDGPUSubtarget::GFX9;

  TargetID.setTargetIDFromFeaturesString(FS);

  LLVM_DEBUG(dbgs() << "xnack setting for subtarget: "
                    << TargetID.getXnackSetting() << '\n');
  LLVM_DEBUG(dbgs() << "sramecc setting for subtarget: "
                    << TargetID.getSramEccSetting() << '\n');

  return *this;
}

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) : TargetTriple(TT) {}

bool AMDGPUSubtarget::useRealTrue16Insts() const {
  return hasTrue16BitInsts() && EnableRealTrue16Insts;
}

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM)
    : // clang-format off
    AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    TargetID(*this),
    InstrItins(getInstrItineraryForCPU(GPU)),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  // clang-format on
  MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
  EUsPerCU = AMDGPU::IsaInfo::getEUsPerCU(this);
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*this));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}

unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
  if (getGeneration() < GFX10)
    return 1;

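  // The 64-bit shift opcodes listed below are the exception on GFX10+: for
  // them the constant bus limit stays at 1, i.e. only a single SGPR or
  // literal operand is allowed.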
  switch (Opcode) {
  case AMDGPU::V_LSHLREV_B64_e64:
  case AMDGPU::V_LSHLREV_B64_gfx10:
  case AMDGPU::V_LSHLREV_B64_e64_gfx11:
  case AMDGPU::V_LSHLREV_B64_e32_gfx12:
  case AMDGPU::V_LSHLREV_B64_e64_gfx12:
  case AMDGPU::V_LSHL_B64_e64:
  case AMDGPU::V_LSHRREV_B64_e64:
  case AMDGPU::V_LSHRREV_B64_gfx10:
  case AMDGPU::V_LSHRREV_B64_e64_gfx11:
  case AMDGPU::V_LSHRREV_B64_e64_gfx12:
  case AMDGPU::V_LSHR_B64_e64:
  case AMDGPU::V_ASHRREV_I64_e64:
  case AMDGPU::V_ASHRREV_I64_gfx10:
  case AMDGPU::V_ASHRREV_I64_e64_gfx11:
  case AMDGPU::V_ASHRREV_I64_e64_gfx12:
  case AMDGPU::V_ASHR_I64_e64:
    return 1;
  }

  return 2;
}

/// This list was mostly derived from experimentation.
bool GCNSubtarget::zeroesHigh16BitsOfDest(unsigned Opcode) const {
  switch (Opcode) {
  case AMDGPU::V_CVT_F16_F32_e32:
  case AMDGPU::V_CVT_F16_F32_e64:
  case AMDGPU::V_CVT_F16_U16_e32:
  case AMDGPU::V_CVT_F16_U16_e64:
  case AMDGPU::V_CVT_F16_I16_e32:
  case AMDGPU::V_CVT_F16_I16_e64:
  case AMDGPU::V_RCP_F16_e64:
  case AMDGPU::V_RCP_F16_e32:
  case AMDGPU::V_RSQ_F16_e64:
  case AMDGPU::V_RSQ_F16_e32:
  case AMDGPU::V_SQRT_F16_e64:
  case AMDGPU::V_SQRT_F16_e32:
  case AMDGPU::V_LOG_F16_e64:
  case AMDGPU::V_LOG_F16_e32:
  case AMDGPU::V_EXP_F16_e64:
  case AMDGPU::V_EXP_F16_e32:
  case AMDGPU::V_SIN_F16_e64:
  case AMDGPU::V_SIN_F16_e32:
  case AMDGPU::V_COS_F16_e64:
  case AMDGPU::V_COS_F16_e32:
  case AMDGPU::V_FLOOR_F16_e64:
  case AMDGPU::V_FLOOR_F16_e32:
  case AMDGPU::V_CEIL_F16_e64:
  case AMDGPU::V_CEIL_F16_e32:
  case AMDGPU::V_TRUNC_F16_e64:
  case AMDGPU::V_TRUNC_F16_e32:
  case AMDGPU::V_RNDNE_F16_e64:
  case AMDGPU::V_RNDNE_F16_e32:
  case AMDGPU::V_FRACT_F16_e64:
  case AMDGPU::V_FRACT_F16_e32:
  case AMDGPU::V_FREXP_MANT_F16_e64:
  case AMDGPU::V_FREXP_MANT_F16_e32:
  case AMDGPU::V_FREXP_EXP_I16_F16_e64:
  case AMDGPU::V_FREXP_EXP_I16_F16_e32:
  case AMDGPU::V_LDEXP_F16_e64:
  case AMDGPU::V_LDEXP_F16_e32:
  case AMDGPU::V_LSHLREV_B16_e64:
  case AMDGPU::V_LSHLREV_B16_e32:
  case AMDGPU::V_LSHRREV_B16_e64:
  case AMDGPU::V_LSHRREV_B16_e32:
  case AMDGPU::V_ASHRREV_I16_e64:
  case AMDGPU::V_ASHRREV_I16_e32:
  case AMDGPU::V_ADD_U16_e64:
  case AMDGPU::V_ADD_U16_e32:
  case AMDGPU::V_SUB_U16_e64:
  case AMDGPU::V_SUB_U16_e32:
  case AMDGPU::V_SUBREV_U16_e64:
  case AMDGPU::V_SUBREV_U16_e32:
  case AMDGPU::V_MUL_LO_U16_e64:
  case AMDGPU::V_MUL_LO_U16_e32:
  case AMDGPU::V_ADD_F16_e64:
  case AMDGPU::V_ADD_F16_e32:
  case AMDGPU::V_SUB_F16_e64:
  case AMDGPU::V_SUB_F16_e32:
  case AMDGPU::V_SUBREV_F16_e64:
  case AMDGPU::V_SUBREV_F16_e32:
  case AMDGPU::V_MUL_F16_e64:
  case AMDGPU::V_MUL_F16_e32:
  case AMDGPU::V_MAX_F16_e64:
  case AMDGPU::V_MAX_F16_e32:
  case AMDGPU::V_MIN_F16_e64:
  case AMDGPU::V_MIN_F16_e32:
  case AMDGPU::V_MAX_U16_e64:
  case AMDGPU::V_MAX_U16_e32:
  case AMDGPU::V_MIN_U16_e64:
  case AMDGPU::V_MIN_U16_e32:
  case AMDGPU::V_MAX_I16_e64:
  case AMDGPU::V_MAX_I16_e32:
  case AMDGPU::V_MIN_I16_e64:
  case AMDGPU::V_MIN_I16_e32:
  case AMDGPU::V_MAD_F16_e64:
  case AMDGPU::V_MAD_U16_e64:
  case AMDGPU::V_MAD_I16_e64:
  case AMDGPU::V_FMA_F16_e64:
  case AMDGPU::V_DIV_FIXUP_F16_e64:
    // On gfx10, all 16-bit instructions preserve the high bits.
    return getGeneration() <= AMDGPUSubtarget::GFX9;
  case AMDGPU::V_MADAK_F16:
  case AMDGPU::V_MADMK_F16:
  case AMDGPU::V_MAC_F16_e64:
  case AMDGPU::V_MAC_F16_e32:
  case AMDGPU::V_FMAMK_F16:
  case AMDGPU::V_FMAAK_F16:
  case AMDGPU::V_FMAC_F16_e64:
  case AMDGPU::V_FMAC_F16_e32:
    // In gfx9, the preferred handling of the unused high 16-bits changed. Most
    // instructions maintain the legacy behavior of zeroing them. Some
    // instructions changed to preserving the high bits.
    return getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
  case AMDGPU::V_MAD_MIXLO_F16:
  case AMDGPU::V_MAD_MIXHI_F16:
  default:
    return false;
  }
}

// Returns the maximum per-workgroup LDS allocation size (in bytes) that still
// allows the given function to achieve an occupancy of NWaves waves per
// SIMD / EU, taking into account only the function's *maximum* workgroup size.
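//
// A rough worked example (illustrative numbers only, not tied to a particular
// subtarget): with wave64, a maximum workgroup size of 256 and 4 EUs per CU,
// asking for NWaves = 8 gives WavesPerWorkgroup = 4 and
// WorkGroupsPerCU = (8 * 4) / 4 = 8, so each workgroup may use at most
// LocalMemorySize / 8 bytes of LDS.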
unsigned
AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
                                                 const Function &F) const {
  const unsigned WaveSize = getWavefrontSize();
  const unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  const unsigned WavesPerWorkgroup =
      std::max(1u, (WorkGroupSize + WaveSize - 1) / WaveSize);

  const unsigned WorkGroupsPerCU =
      std::max(1u, (NWaves * getEUsPerCU()) / WavesPerWorkgroup);

  return getLocalMemorySize() / WorkGroupsPerCU;
}

// FIXME: Should return min,max range.
//
// Returns the maximum occupancy, in number of waves per SIMD / EU, that can
// be achieved when only the given function is running on the machine; and
// taking into account the overall number of wave slots, the (maximum) workgroup
// size, and the per-workgroup LDS allocation size.
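//
// Illustrative example (hypothetical numbers): with 65536 bytes of LDS per CU,
// a request of 16384 bytes per workgroup limits NumGroups to 4; with a
// 256-lane workgroup and wave64 that is 4 * 4 = 16 waves per CU, and with
// 4 EUs per CU divideCeil(16, 4) = 4 waves per EU, before clamping to
// getMaxWavesPerEU().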
unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second;
  const unsigned MaxWorkGroupsPerCu = getMaxWorkGroupsPerCU(MaxWorkGroupSize);
  if (!MaxWorkGroupsPerCu)
    return 0;

  const unsigned WaveSize = getWavefrontSize();

  // FIXME: Do we need to account for alignment requirement of LDS rounding the
  // size up?
  // Compute restriction based on LDS usage
  unsigned NumGroups = getLocalMemorySize() / (Bytes ? Bytes : 1u);

  // This can be queried with more LDS than is possible, so just assume the
  // worst.
  if (NumGroups == 0)
    return 1;

  NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);

  // Round to the number of waves per CU.
  const unsigned MaxGroupNumWaves = divideCeil(MaxWorkGroupSize, WaveSize);
  unsigned MaxWaves = NumGroups * MaxGroupNumWaves;

  // Number of waves per EU (SIMD).
  MaxWaves = divideCeil(MaxWaves, getEUsPerCU());

  // Clamp to the maximum possible number of waves.
  MaxWaves = std::min(MaxWaves, getMaxWavesPerEU());

  // FIXME: Needs to be a multiple of the group size?
  //MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves);

  assert(MaxWaves > 0 && MaxWaves <= getMaxWavesPerEU() &&
         "computed invalid occupancy");
  return MaxWaves;
}

unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::pair(1, getWavefrontSize());
  default:
    return std::pair(1u, getMaxFlatWorkGroupSize());
  }
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
    const Function &F) const {
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
      getDefaultFlatWorkGroupSize(F.getCallingConv());

  // Requested minimum/maximum flat work group sizes.
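  // For example, a kernel carrying the IR attribute
  //   "amdgpu-flat-work-group-size"="128,256"
  // requests a minimum of 128 and a maximum of 256 lanes per workgroup;
  // values that violate the checks below fall back to the defaults.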
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
      F, "amdgpu-flat-work-group-size", Default);

  // Make sure requested minimum is less than requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getEffectiveWavesPerEU(
    std::pair<unsigned, unsigned> Requested,
    std::pair<unsigned, unsigned> FlatWorkGroupSizes) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // If minimum/maximum flat work group sizes were explicitly requested using
  // the "amdgpu-flat-work-group-size" attribute, then set default
  // minimum/maximum number of waves per execution unit to values implied by
  // the requested minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
      getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
  Default.first = MinImpliedByFlatWorkGroupSize;

  // Make sure requested minimum is less than requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
    const Function &F, std::pair<unsigned, unsigned> FlatWorkGroupSizes) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Requested minimum/maximum number of waves per execution unit.
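  // The attribute takes either "min" or "min,max", e.g.
  //   "amdgpu-waves-per-eu"="2,4"
  // requests between 2 and 4 waves per EU; the maximum is optional, hence
  // /*OnlyFirstRequired=*/true below.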
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
      F, "amdgpu-waves-per-eu", Default, /*OnlyFirstRequired=*/true);
  return getEffectiveWavesPerEU(Requested, FlatWorkGroupSizes);
}

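// The !reqd_work_group_size metadata, when present, is expected to carry one
// i32 per dimension, e.g. !{i32 64, i32 1, i32 1}; any other shape is treated
// as unknown.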
static unsigned getReqdWorkGroupSize(const Function &Kernel, unsigned Dim) {
  auto Node = Kernel.getMetadata("reqd_work_group_size");
  if (Node && Node->getNumOperands() == 3)
    return mdconst::extract<ConstantInt>(Node->getOperand(Dim))->getZExtValue();
  return std::numeric_limits<unsigned>::max();
}

bool AMDGPUSubtarget::isMesaKernel(const Function &F) const {
  return isMesa3DOS() && !AMDGPU::isShader(F.getCallingConv());
}

unsigned AMDGPUSubtarget::getMaxWorkitemID(const Function &Kernel,
                                           unsigned Dimension) const {
  unsigned ReqdSize = getReqdWorkGroupSize(Kernel, Dimension);
  if (ReqdSize != std::numeric_limits<unsigned>::max())
    return ReqdSize - 1;
  return getFlatWorkGroupSizes(Kernel).second - 1;
}

bool AMDGPUSubtarget::isSingleLaneExecution(const Function &Func) const {
  for (int I = 0; I < 3; ++I) {
    if (getMaxWorkitemID(Func, I) > 0)
      return false;
  }

  return true;
}

bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        [[fallthrough]];
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        [[fallthrough]];
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        [[fallthrough]];
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }

      if (Dim <= 3) {
        unsigned ReqdSize = getReqdWorkGroupSize(*Kernel, Dim);
        if (ReqdSize != std::numeric_limits<unsigned>::max())
          MinSize = MaxSize = ReqdSize;
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
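  // For example, with a required size of 64 in this dimension, a workitem-id
  // query gets !range !{i32 0, i32 64} while a local-size query gets
  // !range !{i32 64, i32 65}.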
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

unsigned AMDGPUSubtarget::getImplicitArgNumBytes(const Function &F) const {
  assert(AMDGPU::isKernel(F.getCallingConv()));

  // We don't allocate the segment if we know the implicit arguments weren't
  // used, even if the ABI implies we need them.
  if (F.hasFnAttribute("amdgpu-no-implicitarg-ptr"))
    return 0;

  if (isMesaKernel(F))
    return 16;

  // Assume all implicit inputs are used by default
  const Module *M = F.getParent();
  unsigned NBytes =
      AMDGPU::getAMDHSACodeObjectVersion(*M) >= AMDGPU::AMDHSA_COV5 ? 256 : 56;
  return F.getFnAttributeAsParsedInteger("amdgpu-implicitarg-num-bytes",
                                         NBytes);
}

uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 Align &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = Align(1);

  for (const Argument &Arg : F.args()) {
    const bool IsByRef = Arg.hasByRefAttr();
    Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
    Align Alignment = DL.getValueOrABITypeAlignment(
        IsByRef ? Arg.getParamAlign() : std::nullopt, ArgTy);
    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
    MaxAlign = std::max(MaxAlign, Alignment);
  }

  return ExplicitArgBytes;
}

unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                Align &MaxAlign) const {
  if (F.getCallingConv() != CallingConv::AMDGPU_KERNEL &&
      F.getCallingConv() != CallingConv::SPIR_KERNEL)
    return 0;

  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset();

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    const Align Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
    MaxAlign = std::max(MaxAlign, Alignment);
  }
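
  // Illustrative example (numbers assumed, not target-specific): with 36 bytes
  // of explicit args and an 8-byte implicit-arg alignment, the implicit block
  // starts at alignTo(36, 8) = 40 bytes, and the final size below is rounded
  // up to a multiple of 4.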

  // Being able to dereference past the end is useful for emitting scalar loads.
  return alignTo(TotalSize, 4);
}

AMDGPUDwarfFlavour AMDGPUSubtarget::getAMDGPUDwarfFlavour() const {
  return getWavefrontSize() == 32 ? AMDGPUDwarfFlavour::Wave32
                                  : AMDGPUDwarfFlavour::Wave64;
}

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit()
  Policy.ShouldTrackPressure = true;

  // Enabling both top-down and bottom-up scheduling seems to give us fewer
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

void GCNSubtarget::mirFileLoaded(MachineFunction &MF) const {
  if (isWave32()) {
    // Fix implicit $vcc operands after MIParser has verified that they match
    // the instruction definitions.
    for (auto &MBB : MF) {
      for (auto &MI : MBB)
        InstrInfo.fixImplicitOperands(MI);
    }
  }
}

bool GCNSubtarget::hasMadF16() const {
  return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16_e64) != -1;
}

bool GCNSubtarget::useVGPRIndexMode() const {
  return !hasMovrel() || (EnableVGPRIndexMode && hasVGPRIndexMode());
}

bool GCNSubtarget::useAA() const { return UseAA; }

unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return getMaxWavesPerEU();

  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned NumVGPRs) const {
  return AMDGPU::IsaInfo::getNumWavesPerEUWithNumVGPRs(this, NumVGPRs);
}

unsigned
GCNSubtarget::getBaseReservedNumSGPRs(const bool HasFlatScratch) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.

  if (HasFlatScratch || HasArchitectedFlatScratch) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseReservedNumSGPRs(MFI.getUserSGPRInfo().hasFlatScratchInit());
}

unsigned GCNSubtarget::getReservedNumSGPRs(const Function &F) const {
  // In principle we do not need to reserve SGPR pair used for flat_scratch if
  // we know flat instructions do not access the stack anywhere in the
  // program. For now assume it's needed if we have flat instructions.
  const bool KernelUsesFlatScratch = hasFlatAddressSpace();
  return getBaseReservedNumSGPRs(KernelUsesFlatScratch);
}

unsigned GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
                                        unsigned NumSGPRs,
                                        unsigned NumVGPRs) const {
  unsigned Occupancy =
      std::min(getMaxWavesPerEU(),
               getOccupancyWithLocalMemSize(LDSSize, F));
  if (NumSGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
  if (NumVGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
  return Occupancy;
}

unsigned GCNSubtarget::getBaseMaxNumSGPRs(
    const Function &F, std::pair<unsigned, unsigned> WavesPerEU,
    unsigned PreloadedSGPRs, unsigned ReservedNumSGPRs) const {
  // Compute maximum number of SGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  unsigned MaxNumSGPRs =
      getMaxNumSGPRs(WavesPerEU.first, /*Addressable=*/false);
  unsigned MaxAddressableNumSGPRs =
      getMaxNumSGPRs(WavesPerEU.first, /*Addressable=*/true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested =
        F.getFnAttributeAsParsedInteger("amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && (Requested <= ReservedNumSGPRs))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = PreloadedSGPRs;
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - ReservedNumSGPRs, MaxAddressableNumSGPRs);
}

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseMaxNumSGPRs(F, MFI.getWavesPerEU(), MFI.getNumPreloadedSGPRs(),
                            getReservedNumSGPRs(MF));
}

static unsigned getMaxNumPreloadedSGPRs() {
  using USI = GCNUserSGPRUsageInfo;
  // Max number of user SGPRs
  const unsigned MaxUserSGPRs =
      USI::getNumUserSGPRForField(USI::PrivateSegmentBufferID) +
      USI::getNumUserSGPRForField(USI::DispatchPtrID) +
      USI::getNumUserSGPRForField(USI::QueuePtrID) +
      USI::getNumUserSGPRForField(USI::KernargSegmentPtrID) +
      USI::getNumUserSGPRForField(USI::DispatchIdID) +
      USI::getNumUserSGPRForField(USI::FlatScratchInitID) +
      USI::getNumUserSGPRForField(USI::ImplicitBufferPtrID);

  // Max number of system SGPRs
  const unsigned MaxSystemSGPRs = 1 + // WorkGroupIDX
                                  1 + // WorkGroupIDY
                                  1 + // WorkGroupIDZ
                                  1 + // WorkGroupInfo
                                  1;  // private segment wave byte offset

  // Max number of synthetic SGPRs
  const unsigned SyntheticSGPRs = 1; // LDSKernelId

  return MaxUserSGPRs + MaxSystemSGPRs + SyntheticSGPRs;
}

unsigned GCNSubtarget::getMaxNumSGPRs(const Function &F) const {
  return getBaseMaxNumSGPRs(F, getWavesPerEU(F), getMaxNumPreloadedSGPRs(),
                            getReservedNumSGPRs(F));
}

unsigned GCNSubtarget::getBaseMaxNumVGPRs(
    const Function &F, std::pair<unsigned, unsigned> WavesPerEU) const {
  // Compute maximum number of VGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if maximum number of VGPRs was explicitly requested using
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested =
        F.getFnAttributeAsParsedInteger("amdgpu-num-vgpr", MaxNumVGPRs);

    if (hasGFX90AInsts())
      Requested *= 2;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}

unsigned GCNSubtarget::getMaxNumVGPRs(const Function &F) const {
  return getBaseMaxNumVGPRs(F, getWavesPerEU(F));
}

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseMaxNumVGPRs(F, MFI.getWavesPerEU());
}

void GCNSubtarget::adjustSchedDependency(
    SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx, SDep &Dep,
    const TargetSchedModel *SchedModel) const {
  if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() ||
      !Def->isInstr() || !Use->isInstr())
    return;

  MachineInstr *DefI = Def->getInstr();
  MachineInstr *UseI = Use->getInstr();

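  // When either end of the dependence is a bundle, approximate the latency by
  // walking the bundled instructions: start from the latency of the defining
  // instruction and discount one cycle for each bundled instruction issued
  // between the def and the use of the register.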
  if (DefI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(DefI->getIterator());
    MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end());
    unsigned Lat = 0;
    for (++I; I != E && I->isBundledWithPred(); ++I) {
      if (I->modifiesRegister(Reg, TRI))
        Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
      else if (Lat)
        --Lat;
    }
    Dep.setLatency(Lat);
  } else if (UseI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(UseI->getIterator());
    MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end());
    unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI);
    for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
      if (I->readsRegister(Reg, TRI))
        break;
      --Lat;
    }
    Dep.setLatency(Lat);
  } else if (Dep.getLatency() == 0 && Dep.getReg() == AMDGPU::VCC_LO) {
    // Work around the fact that SIInstrInfo::fixImplicitOperands modifies
    // implicit operands which come from the MCInstrDesc, which can fool
    // ScheduleDAGInstrs::addPhysRegDataDeps into treating them as implicit
    // pseudo operands.
    Dep.setLatency(InstrInfo.getSchedModel().computeOperandLatency(
        DefI, DefOpIdx, UseI, UseOpIdx));
  }
}

namespace {
struct FillMFMAShadowMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  ScheduleDAGMI *DAG;

  FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}

  bool isSALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isSALU(*MI) && !MI->isTerminator();
  }

  bool isVALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isVALU(*MI);
  }

  // Link as many SALU instructions in chain as possible. Return the size
  // of the chain. Links up to MaxChain instructions.
  unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
                         SmallPtrSetImpl<SUnit *> &Visited) const {
    SmallVector<SUnit *, 8> Worklist({To});
    unsigned Linked = 0;

    while (!Worklist.empty() && MaxChain-- > 0) {
      SUnit *SU = Worklist.pop_back_val();
      if (!Visited.insert(SU).second)
        continue;

      LLVM_DEBUG(dbgs() << "Inserting edge from\n"; DAG->dumpNode(*From);
                 dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');

      if (SU != From && From != &DAG->ExitSU && DAG->canAddEdge(SU, From))
        if (DAG->addEdge(SU, SDep(From, SDep::Artificial)))
          ++Linked;

      for (SDep &SI : From->Succs) {
        SUnit *SUv = SI.getSUnit();
        if (SUv != From && SU != &DAG->ExitSU && isVALU(SUv) &&
            DAG->canAddEdge(SUv, SU))
          DAG->addEdge(SUv, SDep(SU, SDep::Artificial));
      }

      for (SDep &SI : SU->Succs) {
        SUnit *Succ = SI.getSUnit();
        if (Succ != SU && isSALU(Succ))
          Worklist.push_back(Succ);
      }
    }

    return Linked;
  }

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
    if (!ST.hasMAIInsts())
      return;
    DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
    const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
    if (!TSchedModel || DAG->SUnits.empty())
      return;

    // Scan for MFMA long latency instructions and try to add a dependency
    // of available SALU instructions to give them a chance to fill MFMA
    // shadow. That is desirable to fill MFMA shadow with SALU instructions
    // rather than VALU to prevent power consumption bursts and throttle.
    auto LastSALU = DAG->SUnits.begin();
    auto E = DAG->SUnits.end();
    SmallPtrSet<SUnit*, 32> Visited;
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MAI = *SU.getInstr();
      if (!TII->isMAI(MAI) ||
          MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
          MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64)
        continue;

      unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;

      LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
                 dbgs() << "Need " << Lat
                        << " instructions to cover latency.\n");

      // Find up to Lat independent scalar instructions as early as
      // possible such that they can be scheduled after this MFMA.
      for ( ; Lat && LastSALU != E; ++LastSALU) {
        if (Visited.count(&*LastSALU))
          continue;

        if (&SU == &DAG->ExitSU || &SU == &*LastSALU || !isSALU(&*LastSALU) ||
            !DAG->canAddEdge(&*LastSALU, &SU))
          continue;

        Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
      }
    }
  }
};
} // namespace

void GCNSubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
}

std::unique_ptr<ScheduleDAGMutation>
GCNSubtarget::createFillMFMAShadowMutation(const TargetInstrInfo *TII) const {
  return EnablePowerSched ? std::make_unique<FillMFMAShadowMutation>(&InstrInfo)
                          : nullptr;
}

unsigned GCNSubtarget::getNSAThreshold(const MachineFunction &MF) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX12)
    return 0; // Not MIMG encoding.

  if (NSAThreshold.getNumOccurrences() > 0)
    return std::max(NSAThreshold.getValue(), 2u);

  int Value = MF.getFunction().getFnAttributeAsParsedInteger(
      "amdgpu-nsa-threshold", -1);
  if (Value > 0)
    return std::max(Value, 2);

  return 3;
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM,
                                            const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}

GCNUserSGPRUsageInfo::GCNUserSGPRUsageInfo(const Function &F,
                                           const GCNSubtarget &ST)
    : ST(ST) {
  const CallingConv::ID CC = F.getCallingConv();
  const bool IsKernel =
      CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL;
  // FIXME: Should have analysis or something rather than attribute to detect
  // calls.
  const bool HasCalls = F.hasFnAttribute("amdgpu-calls");
  // FIXME: This attribute is a hack, we just need an analysis on the function
  // to look for allocas.
  const bool HasStackObjects = F.hasFnAttribute("amdgpu-stack-objects");

  if (IsKernel && (!F.arg_empty() || ST.getImplicitArgNumBytes(F) != 0))
    KernargSegmentPtr = true;

  bool IsAmdHsaOrMesa = ST.isAmdHsaOrMesa(F);
  if (IsAmdHsaOrMesa && !ST.enableFlatScratch())
    PrivateSegmentBuffer = true;
  else if (ST.isMesaGfxShader(F))
    ImplicitBufferPtr = true;

  if (!AMDGPU::isGraphics(CC)) {
    if (!F.hasFnAttribute("amdgpu-no-dispatch-ptr"))
      DispatchPtr = true;

    // FIXME: Can this always be disabled with < COv5?
    if (!F.hasFnAttribute("amdgpu-no-queue-ptr"))
      QueuePtr = true;

    if (!F.hasFnAttribute("amdgpu-no-dispatch-id"))
      DispatchID = true;
  }

  // TODO: This could be refined a lot. The attribute is a poor way of
  // detecting calls or stack objects that may require it before argument
  // lowering.
  if (ST.hasFlatAddressSpace() && AMDGPU::isEntryFunctionCC(CC) &&
      (IsAmdHsaOrMesa || ST.enableFlatScratch()) &&
      (HasCalls || HasStackObjects || ST.enableFlatScratch()) &&
      !ST.flatScratchIsArchitected()) {
    FlatScratchInit = true;
  }

  if (hasImplicitBufferPtr())
    NumUsedUserSGPRs += getNumUserSGPRForField(ImplicitBufferPtrID);

  if (hasPrivateSegmentBuffer())
    NumUsedUserSGPRs += getNumUserSGPRForField(PrivateSegmentBufferID);

  if (hasDispatchPtr())
    NumUsedUserSGPRs += getNumUserSGPRForField(DispatchPtrID);

  if (hasQueuePtr())
    NumUsedUserSGPRs += getNumUserSGPRForField(QueuePtrID);

  if (hasKernargSegmentPtr())
    NumUsedUserSGPRs += getNumUserSGPRForField(KernargSegmentPtrID);

  if (hasDispatchID())
    NumUsedUserSGPRs += getNumUserSGPRForField(DispatchIdID);

  if (hasFlatScratchInit())
    NumUsedUserSGPRs += getNumUserSGPRForField(FlatScratchInitID);
}

void GCNUserSGPRUsageInfo::allocKernargPreloadSGPRs(unsigned NumSGPRs) {
  assert(NumKernargPreloadSGPRs + NumSGPRs <= AMDGPU::getMaxNumUserSGPRs(ST));
  NumKernargPreloadSGPRs += NumSGPRs;
  NumUsedUserSGPRs += NumSGPRs;
}

unsigned GCNUserSGPRUsageInfo::getNumFreeUserSGPRs() {
  return AMDGPU::getMaxNumUserSGPRs(ST) - NumUsedUserSGPRs;
}

SmallVector<unsigned>
AMDGPUSubtarget::getMaxNumWorkGroups(const Function &F) const {
  return AMDGPU::getIntegerVecAttribute(F, "amdgpu-max-num-workgroups", 3);
}