1//===-- AMDGPUTargetMachine.cpp - TargetMachine for hw codegen targets-----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// The AMDGPU target machine contains all of the hardware specific
11/// information needed to emit code for SI+ GPUs.
12//
13//===----------------------------------------------------------------------===//
14
15#include "AMDGPUTargetMachine.h"
16#include "AMDGPU.h"
17#include "AMDGPUAliasAnalysis.h"
18#include "AMDGPUCtorDtorLowering.h"
19#include "AMDGPUExportClustering.h"
20#include "AMDGPUIGroupLP.h"
21#include "AMDGPUMacroFusion.h"
22#include "AMDGPURegBankSelect.h"
23#include "AMDGPUTargetObjectFile.h"
24#include "AMDGPUTargetTransformInfo.h"
25#include "AMDGPUUnifyDivergentExitNodes.h"
26#include "GCNIterativeScheduler.h"
27#include "GCNSchedStrategy.h"
28#include "GCNVOPDUtils.h"
29#include "R600.h"
30#include "R600MachineFunctionInfo.h"
31#include "R600TargetMachine.h"
32#include "SIMachineFunctionInfo.h"
33#include "SIMachineScheduler.h"
34#include "TargetInfo/AMDGPUTargetInfo.h"
35#include "Utils/AMDGPUBaseInfo.h"
36#include "llvm/Analysis/CGSCCPassManager.h"
37#include "llvm/Analysis/CallGraphSCCPass.h"
38#include "llvm/CodeGen/GlobalISel/CSEInfo.h"
39#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
40#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
41#include "llvm/CodeGen/GlobalISel/Legalizer.h"
42#include "llvm/CodeGen/GlobalISel/Localizer.h"
43#include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
44#include "llvm/CodeGen/MIRParser/MIParser.h"
45#include "llvm/CodeGen/Passes.h"
46#include "llvm/CodeGen/RegAllocRegistry.h"
47#include "llvm/CodeGen/TargetPassConfig.h"
48#include "llvm/IR/IntrinsicsAMDGPU.h"
49#include "llvm/IR/PassManager.h"
50#include "llvm/IR/PatternMatch.h"
51#include "llvm/InitializePasses.h"
52#include "llvm/MC/TargetRegistry.h"
53#include "llvm/Passes/PassBuilder.h"
54#include "llvm/Transforms/HipStdPar/HipStdPar.h"
55#include "llvm/Transforms/IPO.h"
56#include "llvm/Transforms/IPO/AlwaysInliner.h"
57#include "llvm/Transforms/IPO/GlobalDCE.h"
58#include "llvm/Transforms/IPO/Internalize.h"
59#include "llvm/Transforms/Scalar.h"
60#include "llvm/Transforms/Scalar/GVN.h"
61#include "llvm/Transforms/Scalar/InferAddressSpaces.h"
62#include "llvm/Transforms/Utils.h"
63#include "llvm/Transforms/Utils/SimplifyLibCalls.h"
64#include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h"
65#include <optional>
66
67using namespace llvm;
68using namespace llvm::PatternMatch;
69
70namespace {
71class SGPRRegisterRegAlloc : public RegisterRegAllocBase<SGPRRegisterRegAlloc> {
72public:
73 SGPRRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C)
74 : RegisterRegAllocBase(N, D, C) {}
75};
76
77class VGPRRegisterRegAlloc : public RegisterRegAllocBase<VGPRRegisterRegAlloc> {
78public:
79 VGPRRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C)
80 : RegisterRegAllocBase(N, D, C) {}
81};
82
83static bool onlyAllocateSGPRs(const TargetRegisterInfo &TRI,
84 const TargetRegisterClass &RC) {
85 return static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(&RC);
86}
87
88static bool onlyAllocateVGPRs(const TargetRegisterInfo &TRI,
89 const TargetRegisterClass &RC) {
90 return !static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(&RC);
91}
92
93
94/// -{sgpr|vgpr}-regalloc=... command line option.
95static FunctionPass *useDefaultRegisterAllocator() { return nullptr; }
96
/// A dummy default pass factory used to indicate whether the register
/// allocator has been overridden on the command line.
99static llvm::once_flag InitializeDefaultSGPRRegisterAllocatorFlag;
100static llvm::once_flag InitializeDefaultVGPRRegisterAllocatorFlag;
101
102static SGPRRegisterRegAlloc
103defaultSGPRRegAlloc("default",
104 "pick SGPR register allocator based on -O option",
105 useDefaultRegisterAllocator);
106
107static cl::opt<SGPRRegisterRegAlloc::FunctionPassCtor, false,
108 RegisterPassParser<SGPRRegisterRegAlloc>>
109SGPRRegAlloc("sgpr-regalloc", cl::Hidden, cl::init(Val: &useDefaultRegisterAllocator),
110 cl::desc("Register allocator to use for SGPRs"));
111
112static cl::opt<VGPRRegisterRegAlloc::FunctionPassCtor, false,
113 RegisterPassParser<VGPRRegisterRegAlloc>>
114VGPRRegAlloc("vgpr-regalloc", cl::Hidden, cl::init(Val: &useDefaultRegisterAllocator),
115 cl::desc("Register allocator to use for VGPRs"));
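// The two options above are parsed independently, so the SGPR and VGPR
// allocators can be mixed and matched, e.g. (illustrative invocation):
//   llc -mtriple=amdgcn -sgpr-regalloc=greedy -vgpr-regalloc=fast foo.ll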
116
117
118static void initializeDefaultSGPRRegisterAllocatorOnce() {
119 RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault();
120
121 if (!Ctor) {
122 Ctor = SGPRRegAlloc;
123 SGPRRegisterRegAlloc::setDefault(SGPRRegAlloc);
124 }
125}
126
127static void initializeDefaultVGPRRegisterAllocatorOnce() {
128 RegisterRegAlloc::FunctionPassCtor Ctor = VGPRRegisterRegAlloc::getDefault();
129
130 if (!Ctor) {
131 Ctor = VGPRRegAlloc;
132 VGPRRegisterRegAlloc::setDefault(VGPRRegAlloc);
133 }
134}
135
136static FunctionPass *createBasicSGPRRegisterAllocator() {
137 return createBasicRegisterAllocator(F: onlyAllocateSGPRs);
138}
139
140static FunctionPass *createGreedySGPRRegisterAllocator() {
141 return createGreedyRegisterAllocator(F: onlyAllocateSGPRs);
142}
143
144static FunctionPass *createFastSGPRRegisterAllocator() {
145 return createFastRegisterAllocator(F: onlyAllocateSGPRs, ClearVirtRegs: false);
146}
147
148static FunctionPass *createBasicVGPRRegisterAllocator() {
149 return createBasicRegisterAllocator(F: onlyAllocateVGPRs);
150}
151
152static FunctionPass *createGreedyVGPRRegisterAllocator() {
153 return createGreedyRegisterAllocator(F: onlyAllocateVGPRs);
154}
155
156static FunctionPass *createFastVGPRRegisterAllocator() {
157 return createFastRegisterAllocator(F: onlyAllocateVGPRs, ClearVirtRegs: true);
158}
159
160static SGPRRegisterRegAlloc basicRegAllocSGPR(
161 "basic", "basic register allocator", createBasicSGPRRegisterAllocator);
162static SGPRRegisterRegAlloc greedyRegAllocSGPR(
163 "greedy", "greedy register allocator", createGreedySGPRRegisterAllocator);
164
165static SGPRRegisterRegAlloc fastRegAllocSGPR(
166 "fast", "fast register allocator", createFastSGPRRegisterAllocator);
167
168
169static VGPRRegisterRegAlloc basicRegAllocVGPR(
170 "basic", "basic register allocator", createBasicVGPRRegisterAllocator);
171static VGPRRegisterRegAlloc greedyRegAllocVGPR(
172 "greedy", "greedy register allocator", createGreedyVGPRRegisterAllocator);
173
174static VGPRRegisterRegAlloc fastRegAllocVGPR(
175 "fast", "fast register allocator", createFastVGPRRegisterAllocator);
176}
177
178static cl::opt<bool>
179EnableEarlyIfConversion("amdgpu-early-ifcvt", cl::Hidden,
180 cl::desc("Run early if-conversion"),
181 cl::init(Val: false));
182
183static cl::opt<bool>
184OptExecMaskPreRA("amdgpu-opt-exec-mask-pre-ra", cl::Hidden,
185 cl::desc("Run pre-RA exec mask optimizations"),
186 cl::init(Val: true));
187
188static cl::opt<bool>
189 LowerCtorDtor("amdgpu-lower-global-ctor-dtor",
190 cl::desc("Lower GPU ctor / dtors to globals on the device."),
191 cl::init(Val: true), cl::Hidden);
192
193// Option to disable vectorizer for tests.
194static cl::opt<bool> EnableLoadStoreVectorizer(
195 "amdgpu-load-store-vectorizer",
196 cl::desc("Enable load store vectorizer"),
197 cl::init(Val: true),
198 cl::Hidden);
199
// Option to control scalarization of global loads
201static cl::opt<bool> ScalarizeGlobal(
202 "amdgpu-scalarize-global-loads",
203 cl::desc("Enable global load scalarization"),
204 cl::init(Val: true),
205 cl::Hidden);
206
207// Option to run internalize pass.
208static cl::opt<bool> InternalizeSymbols(
209 "amdgpu-internalize-symbols",
210 cl::desc("Enable elimination of non-kernel functions and unused globals"),
211 cl::init(Val: false),
212 cl::Hidden);
213
214// Option to inline all early.
215static cl::opt<bool> EarlyInlineAll(
216 "amdgpu-early-inline-all",
217 cl::desc("Inline all functions early"),
218 cl::init(Val: false),
219 cl::Hidden);
220
221static cl::opt<bool> RemoveIncompatibleFunctions(
222 "amdgpu-enable-remove-incompatible-functions", cl::Hidden,
    cl::desc("Enable removal of functions when they "
             "use features not supported by the target GPU"),
225 cl::init(Val: true));
226
227static cl::opt<bool> EnableSDWAPeephole(
228 "amdgpu-sdwa-peephole",
229 cl::desc("Enable SDWA peepholer"),
230 cl::init(Val: true));
231
232static cl::opt<bool> EnableDPPCombine(
233 "amdgpu-dpp-combine",
234 cl::desc("Enable DPP combiner"),
235 cl::init(Val: true));
236
237// Enable address space based alias analysis
238static cl::opt<bool> EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden,
239 cl::desc("Enable AMDGPU Alias Analysis"),
240 cl::init(Val: true));
241
242// Option to run late CFG structurizer
243static cl::opt<bool, true> LateCFGStructurize(
244 "amdgpu-late-structurize",
245 cl::desc("Enable late CFG structurization"),
246 cl::location(L&: AMDGPUTargetMachine::EnableLateStructurizeCFG),
247 cl::Hidden);
248
249// Disable structurizer-based control-flow lowering in order to test convergence
250// control tokens. This should eventually be replaced by the wave-transform.
251static cl::opt<bool, true> DisableStructurizer(
252 "amdgpu-disable-structurizer",
253 cl::desc("Disable structurizer for experiments; produces unusable code"),
254 cl::location(L&: AMDGPUTargetMachine::DisableStructurizer), cl::ReallyHidden);
255
// Enable library call simplifications
257static cl::opt<bool> EnableLibCallSimplify(
258 "amdgpu-simplify-libcall",
259 cl::desc("Enable amdgpu library simplifications"),
260 cl::init(Val: true),
261 cl::Hidden);
262
263static cl::opt<bool> EnableLowerKernelArguments(
264 "amdgpu-ir-lower-kernel-arguments",
265 cl::desc("Lower kernel argument loads in IR pass"),
266 cl::init(Val: true),
267 cl::Hidden);
268
269static cl::opt<bool> EnableRegReassign(
270 "amdgpu-reassign-regs",
271 cl::desc("Enable register reassign optimizations on gfx10+"),
272 cl::init(Val: true),
273 cl::Hidden);
274
275static cl::opt<bool> OptVGPRLiveRange(
276 "amdgpu-opt-vgpr-liverange",
277 cl::desc("Enable VGPR liverange optimizations for if-else structure"),
278 cl::init(Val: true), cl::Hidden);
279
280static cl::opt<ScanOptions> AMDGPUAtomicOptimizerStrategy(
281 "amdgpu-atomic-optimizer-strategy",
282 cl::desc("Select DPP or Iterative strategy for scan"),
283 cl::init(Val: ScanOptions::Iterative),
284 cl::values(
285 clEnumValN(ScanOptions::DPP, "DPP", "Use DPP operations for scan"),
286 clEnumValN(ScanOptions::Iterative, "Iterative",
287 "Use Iterative approach for scan"),
288 clEnumValN(ScanOptions::None, "None", "Disable atomic optimizer")));
289
290// Enable Mode register optimization
291static cl::opt<bool> EnableSIModeRegisterPass(
292 "amdgpu-mode-register",
293 cl::desc("Enable mode register pass"),
294 cl::init(Val: true),
295 cl::Hidden);
296
297// Enable GFX11.5+ s_singleuse_vdst insertion
298static cl::opt<bool>
299 EnableInsertSingleUseVDST("amdgpu-enable-single-use-vdst",
300 cl::desc("Enable s_singleuse_vdst insertion"),
301 cl::init(Val: false), cl::Hidden);
302
303// Enable GFX11+ s_delay_alu insertion
304static cl::opt<bool>
305 EnableInsertDelayAlu("amdgpu-enable-delay-alu",
306 cl::desc("Enable s_delay_alu insertion"),
307 cl::init(Val: true), cl::Hidden);
308
309// Enable GFX11+ VOPD
310static cl::opt<bool>
311 EnableVOPD("amdgpu-enable-vopd",
312 cl::desc("Enable VOPD, dual issue of VALU in wave32"),
313 cl::init(Val: true), cl::Hidden);
314
// This option is used in lit tests to prevent the patterns under inspection
// from being dead-code eliminated.
316static cl::opt<bool>
317EnableDCEInRA("amdgpu-dce-in-ra",
318 cl::init(Val: true), cl::Hidden,
319 cl::desc("Enable machine DCE inside regalloc"));
320
321static cl::opt<bool> EnableSetWavePriority("amdgpu-set-wave-priority",
322 cl::desc("Adjust wave priority"),
323 cl::init(Val: false), cl::Hidden);
324
325static cl::opt<bool> EnableScalarIRPasses(
326 "amdgpu-scalar-ir-passes",
327 cl::desc("Enable scalar IR passes"),
328 cl::init(Val: true),
329 cl::Hidden);
330
331static cl::opt<bool> EnableStructurizerWorkarounds(
332 "amdgpu-enable-structurizer-workarounds",
333 cl::desc("Enable workarounds for the StructurizeCFG pass"), cl::init(Val: true),
334 cl::Hidden);
335
336static cl::opt<bool, true> EnableLowerModuleLDS(
337 "amdgpu-enable-lower-module-lds", cl::desc("Enable lower module lds pass"),
338 cl::location(L&: AMDGPUTargetMachine::EnableLowerModuleLDS), cl::init(Val: true),
339 cl::Hidden);
340
341static cl::opt<bool> EnablePreRAOptimizations(
342 "amdgpu-enable-pre-ra-optimizations",
343 cl::desc("Enable Pre-RA optimizations pass"), cl::init(Val: true),
344 cl::Hidden);
345
346static cl::opt<bool> EnablePromoteKernelArguments(
347 "amdgpu-enable-promote-kernel-arguments",
348 cl::desc("Enable promotion of flat kernel pointer arguments to global"),
349 cl::Hidden, cl::init(Val: true));
350
351static cl::opt<bool> EnableImageIntrinsicOptimizer(
352 "amdgpu-enable-image-intrinsic-optimizer",
353 cl::desc("Enable image intrinsic optimizer pass"), cl::init(Val: true),
354 cl::Hidden);
355
356static cl::opt<bool>
357 EnableLoopPrefetch("amdgpu-loop-prefetch",
358 cl::desc("Enable loop data prefetch on AMDGPU"),
359 cl::Hidden, cl::init(Val: false));
360
361static cl::opt<bool> EnableMaxIlpSchedStrategy(
362 "amdgpu-enable-max-ilp-scheduling-strategy",
363 cl::desc("Enable scheduling strategy to maximize ILP for a single wave."),
364 cl::Hidden, cl::init(Val: false));
365
366static cl::opt<bool> EnableRewritePartialRegUses(
367 "amdgpu-enable-rewrite-partial-reg-uses",
368 cl::desc("Enable rewrite partial reg uses pass"), cl::init(Val: true),
369 cl::Hidden);
370
371static cl::opt<bool> EnableHipStdPar(
372 "amdgpu-enable-hipstdpar",
373 cl::desc("Enable HIP Standard Parallelism Offload support"), cl::init(Val: false),
374 cl::Hidden);
375
376extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
377 // Register the target
378 RegisterTargetMachine<R600TargetMachine> X(getTheR600Target());
379 RegisterTargetMachine<GCNTargetMachine> Y(getTheGCNTarget());
380
381 PassRegistry *PR = PassRegistry::getPassRegistry();
382 initializeR600ClauseMergePassPass(*PR);
383 initializeR600ControlFlowFinalizerPass(*PR);
384 initializeR600PacketizerPass(*PR);
385 initializeR600ExpandSpecialInstrsPassPass(*PR);
386 initializeR600VectorRegMergerPass(*PR);
387 initializeGlobalISel(*PR);
388 initializeAMDGPUDAGToDAGISelPass(*PR);
389 initializeGCNDPPCombinePass(*PR);
390 initializeSILowerI1CopiesPass(*PR);
391 initializeAMDGPUGlobalISelDivergenceLoweringPass(*PR);
392 initializeSILowerWWMCopiesPass(*PR);
393 initializeAMDGPUMarkLastScratchLoadPass(*PR);
394 initializeSILowerSGPRSpillsPass(*PR);
395 initializeSIFixSGPRCopiesPass(*PR);
396 initializeSIFixVGPRCopiesPass(*PR);
397 initializeSIFoldOperandsPass(*PR);
398 initializeSIPeepholeSDWAPass(*PR);
399 initializeSIShrinkInstructionsPass(*PR);
400 initializeSIOptimizeExecMaskingPreRAPass(*PR);
401 initializeSIOptimizeVGPRLiveRangePass(*PR);
402 initializeSILoadStoreOptimizerPass(*PR);
403 initializeAMDGPUCtorDtorLoweringLegacyPass(*PR);
404 initializeAMDGPUAlwaysInlinePass(*PR);
405 initializeAMDGPUAttributorLegacyPass(*PR);
406 initializeAMDGPUAnnotateKernelFeaturesPass(*PR);
407 initializeAMDGPUAnnotateUniformValuesPass(*PR);
408 initializeAMDGPUArgumentUsageInfoPass(*PR);
409 initializeAMDGPUAtomicOptimizerPass(*PR);
410 initializeAMDGPULowerKernelArgumentsPass(*PR);
411 initializeAMDGPUPromoteKernelArgumentsPass(*PR);
412 initializeAMDGPULowerKernelAttributesPass(*PR);
413 initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(*PR);
414 initializeAMDGPUPostLegalizerCombinerPass(*PR);
415 initializeAMDGPUPreLegalizerCombinerPass(*PR);
416 initializeAMDGPURegBankCombinerPass(*PR);
417 initializeAMDGPURegBankSelectPass(*PR);
418 initializeAMDGPUPromoteAllocaPass(*PR);
419 initializeAMDGPUPromoteAllocaToVectorPass(*PR);
420 initializeAMDGPUCodeGenPreparePass(*PR);
421 initializeAMDGPULateCodeGenPreparePass(*PR);
422 initializeAMDGPURemoveIncompatibleFunctionsPass(*PR);
423 initializeAMDGPULowerModuleLDSLegacyPass(*PR);
424 initializeAMDGPULowerBufferFatPointersPass(*PR);
425 initializeAMDGPURewriteOutArgumentsPass(*PR);
426 initializeAMDGPURewriteUndefForPHILegacyPass(*PR);
427 initializeAMDGPUUnifyMetadataPass(*PR);
428 initializeSIAnnotateControlFlowPass(*PR);
429 initializeAMDGPUInsertSingleUseVDSTPass(*PR);
430 initializeAMDGPUInsertDelayAluPass(*PR);
431 initializeSIInsertHardClausesPass(*PR);
432 initializeSIInsertWaitcntsPass(*PR);
433 initializeSIModeRegisterPass(*PR);
434 initializeSIWholeQuadModePass(*PR);
435 initializeSILowerControlFlowPass(*PR);
436 initializeSIPreEmitPeepholePass(*PR);
437 initializeSILateBranchLoweringPass(*PR);
438 initializeSIMemoryLegalizerPass(*PR);
439 initializeSIOptimizeExecMaskingPass(*PR);
440 initializeSIPreAllocateWWMRegsPass(*PR);
441 initializeSIFormMemoryClausesPass(*PR);
442 initializeSIPostRABundlerPass(*PR);
443 initializeGCNCreateVOPDPass(*PR);
444 initializeAMDGPUUnifyDivergentExitNodesPass(*PR);
445 initializeAMDGPUAAWrapperPassPass(*PR);
446 initializeAMDGPUExternalAAWrapperPass(*PR);
447 initializeAMDGPUImageIntrinsicOptimizerPass(*PR);
448 initializeAMDGPUPrintfRuntimeBindingPass(*PR);
449 initializeAMDGPUResourceUsageAnalysisPass(*PR);
450 initializeGCNNSAReassignPass(*PR);
451 initializeGCNPreRAOptimizationsPass(*PR);
452 initializeGCNPreRALongBranchRegPass(*PR);
453 initializeGCNRewritePartialRegUsesPass(*PR);
454 initializeGCNRegPressurePrinterPass(*PR);
455}
456
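// All AMDGPU targets share the same TargetLoweringObjectFile implementation;
// the triple parameter is currently unused here.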
457static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
458 return std::make_unique<AMDGPUTargetObjectFile>();
459}
460
461static ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C) {
462 return new SIScheduleDAGMI(C);
463}
464
465static ScheduleDAGInstrs *
466createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
467 const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
468 ScheduleDAGMILive *DAG =
469 new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxOccupancySchedStrategy>(args&: C));
470 DAG->addMutation(Mutation: createLoadClusterDAGMutation(TII: DAG->TII, TRI: DAG->TRI));
471 if (ST.shouldClusterStores())
472 DAG->addMutation(Mutation: createStoreClusterDAGMutation(TII: DAG->TII, TRI: DAG->TRI));
473 DAG->addMutation(Mutation: createIGroupLPDAGMutation(Phase: AMDGPU::SchedulingPhase::Initial));
474 DAG->addMutation(Mutation: createAMDGPUMacroFusionDAGMutation());
475 DAG->addMutation(Mutation: createAMDGPUExportClusteringDAGMutation());
476 return DAG;
477}
478
479static ScheduleDAGInstrs *
480createGCNMaxILPMachineScheduler(MachineSchedContext *C) {
481 ScheduleDAGMILive *DAG =
482 new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxILPSchedStrategy>(args&: C));
483 DAG->addMutation(Mutation: createIGroupLPDAGMutation(Phase: AMDGPU::SchedulingPhase::Initial));
484 return DAG;
485}
486
487static ScheduleDAGInstrs *
488createIterativeGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
489 const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
490 auto DAG = new GCNIterativeScheduler(C,
491 GCNIterativeScheduler::SCHEDULE_LEGACYMAXOCCUPANCY);
492 DAG->addMutation(Mutation: createLoadClusterDAGMutation(TII: DAG->TII, TRI: DAG->TRI));
493 if (ST.shouldClusterStores())
494 DAG->addMutation(Mutation: createStoreClusterDAGMutation(TII: DAG->TII, TRI: DAG->TRI));
495 return DAG;
496}
497
498static ScheduleDAGInstrs *createMinRegScheduler(MachineSchedContext *C) {
499 return new GCNIterativeScheduler(C,
500 GCNIterativeScheduler::SCHEDULE_MINREGFORCED);
501}
502
503static ScheduleDAGInstrs *
504createIterativeILPMachineScheduler(MachineSchedContext *C) {
505 const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
506 auto DAG = new GCNIterativeScheduler(C,
507 GCNIterativeScheduler::SCHEDULE_ILP);
508 DAG->addMutation(Mutation: createLoadClusterDAGMutation(TII: DAG->TII, TRI: DAG->TRI));
509 if (ST.shouldClusterStores())
510 DAG->addMutation(Mutation: createStoreClusterDAGMutation(TII: DAG->TII, TRI: DAG->TRI));
511 DAG->addMutation(Mutation: createAMDGPUMacroFusionDAGMutation());
512 return DAG;
513}
514
515static MachineSchedRegistry
516SISchedRegistry("si", "Run SI's custom scheduler",
517 createSIMachineScheduler);
518
519static MachineSchedRegistry
520GCNMaxOccupancySchedRegistry("gcn-max-occupancy",
521 "Run GCN scheduler to maximize occupancy",
522 createGCNMaxOccupancyMachineScheduler);
523
524static MachineSchedRegistry
525 GCNMaxILPSchedRegistry("gcn-max-ilp", "Run GCN scheduler to maximize ilp",
526 createGCNMaxILPMachineScheduler);
527
528static MachineSchedRegistry IterativeGCNMaxOccupancySchedRegistry(
529 "gcn-iterative-max-occupancy-experimental",
530 "Run GCN scheduler to maximize occupancy (experimental)",
531 createIterativeGCNMaxOccupancyMachineScheduler);
532
533static MachineSchedRegistry GCNMinRegSchedRegistry(
534 "gcn-iterative-minreg",
535 "Run GCN iterative scheduler for minimal register usage (experimental)",
536 createMinRegScheduler);
537
538static MachineSchedRegistry GCNILPSchedRegistry(
539 "gcn-iterative-ilp",
540 "Run GCN iterative scheduler for ILP scheduling (experimental)",
541 createIterativeILPMachineScheduler);
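// Any of the schedulers registered above can be selected explicitly through
// llc's machine-scheduler option, e.g. (illustrative): -misched=gcn-max-ilp.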
542
543static StringRef computeDataLayout(const Triple &TT) {
544 if (TT.getArch() == Triple::r600) {
545 // 32-bit pointers.
546 return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
547 "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1";
548 }
549
  // 32-bit private, local, and region pointers. 64-bit global, constant and
  // flat. 160-bit non-integral fat buffer pointers that include a 128-bit
  // buffer descriptor and a 32-bit offset, which are indexed by 32-bit values
  // (address space 7), and 128-bit non-integral buffer resources (address
  // space 8) which cannot be non-trivially accessed by LLVM memory operations
  // like getelementptr.
556 return "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32"
557 "-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-"
558 "v32:32-v48:64-v96:"
559 "128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-"
560 "G1-ni:7:8:9";
561}
562
563LLVM_READNONE
564static StringRef getGPUOrDefault(const Triple &TT, StringRef GPU) {
565 if (!GPU.empty())
566 return GPU;
567
568 // Need to default to a target with flat support for HSA.
569 if (TT.getArch() == Triple::amdgcn)
570 return TT.getOS() == Triple::AMDHSA ? "generic-hsa" : "generic";
571
572 return "r600";
573}
574
575static Reloc::Model getEffectiveRelocModel(std::optional<Reloc::Model> RM) {
576 // The AMDGPU toolchain only supports generating shared objects, so we
577 // must always use PIC.
578 return Reloc::PIC_;
579}
580
581AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT,
582 StringRef CPU, StringRef FS,
583 const TargetOptions &Options,
584 std::optional<Reloc::Model> RM,
585 std::optional<CodeModel::Model> CM,
586 CodeGenOptLevel OptLevel)
587 : LLVMTargetMachine(T, computeDataLayout(TT), TT, getGPUOrDefault(TT, GPU: CPU),
588 FS, Options, getEffectiveRelocModel(RM),
589 getEffectiveCodeModel(CM, Default: CodeModel::Small), OptLevel),
590 TLOF(createTLOF(TT: getTargetTriple())) {
591 initAsmInfo();
592 if (TT.getArch() == Triple::amdgcn) {
593 if (getMCSubtargetInfo()->checkFeatures(FS: "+wavefrontsize64"))
594 MRI.reset(p: llvm::createGCNMCRegisterInfo(DwarfFlavour: AMDGPUDwarfFlavour::Wave64));
595 else if (getMCSubtargetInfo()->checkFeatures(FS: "+wavefrontsize32"))
596 MRI.reset(p: llvm::createGCNMCRegisterInfo(DwarfFlavour: AMDGPUDwarfFlavour::Wave32));
597 }
598}
599
600bool AMDGPUTargetMachine::EnableLateStructurizeCFG = false;
601bool AMDGPUTargetMachine::EnableFunctionCalls = false;
602bool AMDGPUTargetMachine::EnableLowerModuleLDS = true;
603bool AMDGPUTargetMachine::DisableStructurizer = false;
604
605AMDGPUTargetMachine::~AMDGPUTargetMachine() = default;
606
607StringRef AMDGPUTargetMachine::getGPUName(const Function &F) const {
608 Attribute GPUAttr = F.getFnAttribute(Kind: "target-cpu");
609 return GPUAttr.isValid() ? GPUAttr.getValueAsString() : getTargetCPU();
610}
611
612StringRef AMDGPUTargetMachine::getFeatureString(const Function &F) const {
613 Attribute FSAttr = F.getFnAttribute(Kind: "target-features");
614
615 return FSAttr.isValid() ? FSAttr.getValueAsString()
616 : getTargetFeatureString();
617}
618
619/// Predicate for Internalize pass.
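/// Keeps function declarations, sanitizer entry points, entry functions
/// (kernels), and any other global that still has live uses.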
620static bool mustPreserveGV(const GlobalValue &GV) {
621 if (const Function *F = dyn_cast<Function>(Val: &GV))
622 return F->isDeclaration() || F->getName().starts_with(Prefix: "__asan_") ||
623 F->getName().starts_with(Prefix: "__sanitizer_") ||
624 AMDGPU::isEntryFunctionCC(CC: F->getCallingConv());
625
626 GV.removeDeadConstantUsers();
627 return !GV.use_empty();
628}
629
630void AMDGPUTargetMachine::registerDefaultAliasAnalyses(AAManager &AAM) {
631 AAM.registerFunctionAnalysis<AMDGPUAA>();
632}
633
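// Parse the optional "strategy=<dpp|iterative|none>" parameter accepted when
// the atomic optimizer is named in a new-PM -passes pipeline; an empty or
// missing value defaults to the iterative strategy.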
634static Expected<ScanOptions>
635parseAMDGPUAtomicOptimizerStrategy(StringRef Params) {
636 if (Params.empty())
637 return ScanOptions::Iterative;
638 Params.consume_front(Prefix: "strategy=");
639 auto Result = StringSwitch<std::optional<ScanOptions>>(Params)
640 .Case(S: "dpp", Value: ScanOptions::DPP)
641 .Cases(S0: "iterative", S1: "", Value: ScanOptions::Iterative)
642 .Case(S: "none", Value: ScanOptions::None)
643 .Default(Value: std::nullopt);
644 if (Result)
645 return *Result;
646 return make_error<StringError>(Args: "invalid parameter", Args: inconvertibleErrorCode());
647}
648
649void AMDGPUTargetMachine::registerPassBuilderCallbacks(
650 PassBuilder &PB, bool PopulateClassToPassNames) {
651
652#define GET_PASS_REGISTRY "AMDGPUPassRegistry.def"
653#include "llvm/Passes/TargetPassRegistry.inc"
654
655 PB.registerPipelineStartEPCallback(
656 C: [](ModulePassManager &PM, OptimizationLevel Level) {
657 FunctionPassManager FPM;
658 PM.addPass(Pass: createModuleToFunctionPassAdaptor(Pass: std::move(FPM)));
659 if (EnableHipStdPar)
660 PM.addPass(Pass: HipStdParAcceleratorCodeSelectionPass());
661 });
662
663 PB.registerPipelineEarlySimplificationEPCallback(
664 C: [](ModulePassManager &PM, OptimizationLevel Level) {
665 PM.addPass(Pass: AMDGPUPrintfRuntimeBindingPass());
666
667 if (Level == OptimizationLevel::O0)
668 return;
669
670 PM.addPass(Pass: AMDGPUUnifyMetadataPass());
671
672 if (InternalizeSymbols) {
673 PM.addPass(Pass: InternalizePass(mustPreserveGV));
674 PM.addPass(Pass: GlobalDCEPass());
675 }
676
677 if (EarlyInlineAll && !EnableFunctionCalls)
678 PM.addPass(Pass: AMDGPUAlwaysInlinePass());
679 });
680
681 PB.registerPeepholeEPCallback(
682 C: [](FunctionPassManager &FPM, OptimizationLevel Level) {
683 if (Level == OptimizationLevel::O0)
684 return;
685
686 FPM.addPass(Pass: AMDGPUUseNativeCallsPass());
687 if (EnableLibCallSimplify)
688 FPM.addPass(Pass: AMDGPUSimplifyLibCallsPass());
689 });
690
691 PB.registerCGSCCOptimizerLateEPCallback(
692 C: [this](CGSCCPassManager &PM, OptimizationLevel Level) {
693 if (Level == OptimizationLevel::O0)
694 return;
695
696 FunctionPassManager FPM;
697
698 // Add promote kernel arguments pass to the opt pipeline right before
699 // infer address spaces which is needed to do actual address space
700 // rewriting.
701 if (Level.getSpeedupLevel() > OptimizationLevel::O1.getSpeedupLevel() &&
702 EnablePromoteKernelArguments)
703 FPM.addPass(Pass: AMDGPUPromoteKernelArgumentsPass());
704
705 // Add infer address spaces pass to the opt pipeline after inlining
706 // but before SROA to increase SROA opportunities.
707 FPM.addPass(Pass: InferAddressSpacesPass());
708
709 // This should run after inlining to have any chance of doing
710 // anything, and before other cleanup optimizations.
711 FPM.addPass(Pass: AMDGPULowerKernelAttributesPass());
712
713 if (Level != OptimizationLevel::O0) {
714 // Promote alloca to vector before SROA and loop unroll. If we
715 // manage to eliminate allocas before unroll we may choose to unroll
716 // less.
717 FPM.addPass(Pass: AMDGPUPromoteAllocaToVectorPass(*this));
718 }
719
720 PM.addPass(Pass: createCGSCCToFunctionPassAdaptor(Pass: std::move(FPM)));
721 });
722
723 PB.registerFullLinkTimeOptimizationLastEPCallback(
724 C: [this](ModulePassManager &PM, OptimizationLevel Level) {
725 // We want to support the -lto-partitions=N option as "best effort".
726 // For that, we need to lower LDS earlier in the pipeline before the
727 // module is partitioned for codegen.
728 if (EnableLowerModuleLDS)
729 PM.addPass(Pass: AMDGPULowerModuleLDSPass(*this));
730 });
731}
732
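// Private (scratch), local (LDS) and region memory treat address 0 as a valid
// address, so those address spaces use all-ones as the null pointer value;
// every other address space uses 0.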
733int64_t AMDGPUTargetMachine::getNullPointerValue(unsigned AddrSpace) {
734 return (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
735 AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
736 AddrSpace == AMDGPUAS::REGION_ADDRESS)
737 ? -1
738 : 0;
739}
740
741bool AMDGPUTargetMachine::isNoopAddrSpaceCast(unsigned SrcAS,
742 unsigned DestAS) const {
743 return AMDGPU::isFlatGlobalAddrSpace(AS: SrcAS) &&
744 AMDGPU::isFlatGlobalAddrSpace(AS: DestAS);
745}
746
747unsigned AMDGPUTargetMachine::getAssumedAddrSpace(const Value *V) const {
748 const auto *LD = dyn_cast<LoadInst>(Val: V);
749 if (!LD)
750 return AMDGPUAS::UNKNOWN_ADDRESS_SPACE;
751
752 // It must be a generic pointer loaded.
753 assert(V->getType()->isPointerTy() &&
754 V->getType()->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS);
755
756 const auto *Ptr = LD->getPointerOperand();
757 if (Ptr->getType()->getPointerAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS)
758 return AMDGPUAS::UNKNOWN_ADDRESS_SPACE;
  // A generic pointer loaded from constant memory can be assumed to be a
  // global pointer, since constant memory is only populated on the host side.
  // As implied by the offload programming model, only global pointers can be
  // referenced on the host side.
763 return AMDGPUAS::GLOBAL_ADDRESS;
764}
765
766std::pair<const Value *, unsigned>
767AMDGPUTargetMachine::getPredicatedAddrSpace(const Value *V) const {
768 if (auto *II = dyn_cast<IntrinsicInst>(Val: V)) {
769 switch (II->getIntrinsicID()) {
770 case Intrinsic::amdgcn_is_shared:
771 return std::pair(II->getArgOperand(i: 0), AMDGPUAS::LOCAL_ADDRESS);
772 case Intrinsic::amdgcn_is_private:
773 return std::pair(II->getArgOperand(i: 0), AMDGPUAS::PRIVATE_ADDRESS);
774 default:
775 break;
776 }
777 return std::pair(nullptr, -1);
778 }
  // Check the global pointer predication based on
  // (!is_shared(p) && !is_private(p)). Note that logical 'and' is commutative
  // and the order of 'is_shared' and 'is_private' is not significant.
782 Value *Ptr;
783 if (match(
784 const_cast<Value *>(V),
785 m_c_And(m_Not(m_Intrinsic<Intrinsic::amdgcn_is_shared>(m_Value(Ptr))),
786 m_Not(m_Intrinsic<Intrinsic::amdgcn_is_private>(
787 m_Deferred(Ptr))))))
788 return std::pair(Ptr, AMDGPUAS::GLOBAL_ADDRESS);
789
790 return std::pair(nullptr, -1);
791}
792
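// Stack-like pseudo sources live in the private (scratch) address space,
// constant-like ones (constant pool, GOT, jump tables, call entries) live in
// the constant address space, and anything else is treated as flat.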
793unsigned
794AMDGPUTargetMachine::getAddressSpaceForPseudoSourceKind(unsigned Kind) const {
795 switch (Kind) {
796 case PseudoSourceValue::Stack:
797 case PseudoSourceValue::FixedStack:
798 return AMDGPUAS::PRIVATE_ADDRESS;
799 case PseudoSourceValue::ConstantPool:
800 case PseudoSourceValue::GOT:
801 case PseudoSourceValue::JumpTable:
802 case PseudoSourceValue::GlobalValueCallEntry:
803 case PseudoSourceValue::ExternalSymbolCallEntry:
804 return AMDGPUAS::CONSTANT_ADDRESS;
805 }
806 return AMDGPUAS::FLAT_ADDRESS;
807}
808
809//===----------------------------------------------------------------------===//
810// GCN Target Machine (SI+)
811//===----------------------------------------------------------------------===//
812
813GCNTargetMachine::GCNTargetMachine(const Target &T, const Triple &TT,
814 StringRef CPU, StringRef FS,
815 const TargetOptions &Options,
816 std::optional<Reloc::Model> RM,
817 std::optional<CodeModel::Model> CM,
818 CodeGenOptLevel OL, bool JIT)
819 : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {}
820
821const TargetSubtargetInfo *
822GCNTargetMachine::getSubtargetImpl(const Function &F) const {
823 StringRef GPU = getGPUName(F);
824 StringRef FS = getFeatureString(F);
825
826 SmallString<128> SubtargetKey(GPU);
827 SubtargetKey.append(RHS: FS);
828
829 auto &I = SubtargetMap[SubtargetKey];
830 if (!I) {
831 // This needs to be done before we create a new subtarget since any
832 // creation will depend on the TM and the code generation flags on the
833 // function that reside in TargetOptions.
834 resetTargetOptions(F);
835 I = std::make_unique<GCNSubtarget>(args: TargetTriple, args&: GPU, args&: FS, args: *this);
836 }
837
838 I->setScalarizeGlobalBehavior(ScalarizeGlobal);
839
840 return I.get();
841}
842
843TargetTransformInfo
844GCNTargetMachine::getTargetTransformInfo(const Function &F) const {
845 return TargetTransformInfo(GCNTTIImpl(this, F));
846}
847
848//===----------------------------------------------------------------------===//
849// AMDGPU Pass Setup
850//===----------------------------------------------------------------------===//
851
852std::unique_ptr<CSEConfigBase> llvm::AMDGPUPassConfig::getCSEConfig() const {
853 return getStandardCSEConfigForOpt(Level: TM->getOptLevel());
854}
855
856namespace {
857
858class GCNPassConfig final : public AMDGPUPassConfig {
859public:
860 GCNPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
861 : AMDGPUPassConfig(TM, PM) {
862 // It is necessary to know the register usage of the entire call graph. We
863 // allow calls without EnableAMDGPUFunctionCalls if they are marked
864 // noinline, so this is always required.
865 setRequiresCodeGenSCCOrder(true);
866 substitutePass(StandardID: &PostRASchedulerID, TargetID: &PostMachineSchedulerID);
867 }
868
869 GCNTargetMachine &getGCNTargetMachine() const {
870 return getTM<GCNTargetMachine>();
871 }
872
873 ScheduleDAGInstrs *
874 createMachineScheduler(MachineSchedContext *C) const override;
875
876 ScheduleDAGInstrs *
877 createPostMachineScheduler(MachineSchedContext *C) const override {
878 ScheduleDAGMI *DAG = new GCNPostScheduleDAGMILive(
879 C, std::make_unique<PostGenericScheduler>(args&: C),
880 /*RemoveKillFlags=*/true);
881 const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
882 DAG->addMutation(Mutation: createLoadClusterDAGMutation(TII: DAG->TII, TRI: DAG->TRI));
883 if (ST.shouldClusterStores())
884 DAG->addMutation(Mutation: createStoreClusterDAGMutation(TII: DAG->TII, TRI: DAG->TRI));
885 DAG->addMutation(Mutation: ST.createFillMFMAShadowMutation(DAG->TII));
886 DAG->addMutation(
887 Mutation: createIGroupLPDAGMutation(Phase: AMDGPU::SchedulingPhase::PostRA));
888 if (isPassEnabled(Opt: EnableVOPD, Level: CodeGenOptLevel::Less))
889 DAG->addMutation(Mutation: createVOPDPairingMutation());
890 return DAG;
891 }
892
893 bool addPreISel() override;
894 void addMachineSSAOptimization() override;
895 bool addILPOpts() override;
896 bool addInstSelector() override;
897 bool addIRTranslator() override;
898 void addPreLegalizeMachineIR() override;
899 bool addLegalizeMachineIR() override;
900 void addPreRegBankSelect() override;
901 bool addRegBankSelect() override;
902 void addPreGlobalInstructionSelect() override;
903 bool addGlobalInstructionSelect() override;
904 void addFastRegAlloc() override;
905 void addOptimizedRegAlloc() override;
906
907 FunctionPass *createSGPRAllocPass(bool Optimized);
908 FunctionPass *createVGPRAllocPass(bool Optimized);
909 FunctionPass *createRegAllocPass(bool Optimized) override;
910
911 bool addRegAssignAndRewriteFast() override;
912 bool addRegAssignAndRewriteOptimized() override;
913
914 void addPreRegAlloc() override;
915 bool addPreRewrite() override;
916 void addPostRegAlloc() override;
917 void addPreSched2() override;
918 void addPreEmitPass() override;
919};
920
921} // end anonymous namespace
922
923AMDGPUPassConfig::AMDGPUPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
924 : TargetPassConfig(TM, PM) {
925 // Exceptions and StackMaps are not supported, so these passes will never do
926 // anything.
927 disablePass(PassID: &StackMapLivenessID);
928 disablePass(PassID: &FuncletLayoutID);
929 // Garbage collection is not supported.
930 disablePass(PassID: &GCLoweringID);
931 disablePass(PassID: &ShadowStackGCLoweringID);
932}
933
934void AMDGPUPassConfig::addEarlyCSEOrGVNPass() {
935 if (getOptLevel() == CodeGenOptLevel::Aggressive)
936 addPass(P: createGVNPass());
937 else
938 addPass(P: createEarlyCSEPass());
939}
940
941void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() {
942 if (isPassEnabled(Opt: EnableLoopPrefetch, Level: CodeGenOptLevel::Aggressive))
943 addPass(P: createLoopDataPrefetchPass());
944 addPass(P: createSeparateConstOffsetFromGEPPass());
945 // ReassociateGEPs exposes more opportunities for SLSR. See
946 // the example in reassociate-geps-and-slsr.ll.
947 addPass(P: createStraightLineStrengthReducePass());
  // SeparateConstOffsetFromGEP and SLSR create common expressions that GVN or
  // EarlyCSE can reuse.
950 addEarlyCSEOrGVNPass();
951 // Run NaryReassociate after EarlyCSE/GVN to be more effective.
952 addPass(P: createNaryReassociatePass());
953 // NaryReassociate on GEPs creates redundant common expressions, so run
954 // EarlyCSE after it.
955 addPass(P: createEarlyCSEPass());
956}
957
958void AMDGPUPassConfig::addIRPasses() {
959 const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine();
960
961 Triple::ArchType Arch = TM.getTargetTriple().getArch();
962 if (RemoveIncompatibleFunctions && Arch == Triple::amdgcn)
963 addPass(P: createAMDGPURemoveIncompatibleFunctionsPass(&TM));
964
965 // There is no reason to run these.
966 disablePass(PassID: &StackMapLivenessID);
967 disablePass(PassID: &FuncletLayoutID);
968 disablePass(PassID: &PatchableFunctionID);
969
970 addPass(P: createAMDGPUPrintfRuntimeBinding());
971 if (LowerCtorDtor)
972 addPass(P: createAMDGPUCtorDtorLoweringLegacyPass());
973
974 if (isPassEnabled(Opt: EnableImageIntrinsicOptimizer))
975 addPass(P: createAMDGPUImageIntrinsicOptimizerPass(&TM));
976
977 // Function calls are not supported, so make sure we inline everything.
978 addPass(P: createAMDGPUAlwaysInlinePass());
979 addPass(P: createAlwaysInlinerLegacyPass());
980
981 // Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments.
982 if (Arch == Triple::r600)
983 addPass(P: createR600OpenCLImageTypeLoweringPass());
984
985 // Replace OpenCL enqueued block function pointers with global variables.
986 addPass(P: createAMDGPUOpenCLEnqueuedBlockLoweringPass());
987
988 // Runs before PromoteAlloca so the latter can account for function uses
989 if (EnableLowerModuleLDS) {
990 addPass(P: createAMDGPULowerModuleLDSLegacyPass(TM: &TM));
991 }
992
993 // AMDGPUAttributor infers lack of llvm.amdgcn.lds.kernel.id calls, so run
994 // after their introduction
995 if (TM.getOptLevel() > CodeGenOptLevel::None)
996 addPass(P: createAMDGPUAttributorLegacyPass());
997
998 if (TM.getOptLevel() > CodeGenOptLevel::None)
999 addPass(P: createInferAddressSpacesPass());
1000
1001 // Run atomic optimizer before Atomic Expand
1002 if ((TM.getTargetTriple().getArch() == Triple::amdgcn) &&
1003 (TM.getOptLevel() >= CodeGenOptLevel::Less) &&
1004 (AMDGPUAtomicOptimizerStrategy != ScanOptions::None)) {
1005 addPass(P: createAMDGPUAtomicOptimizerPass(ScanStrategy: AMDGPUAtomicOptimizerStrategy));
1006 }
1007
1008 addPass(P: createAtomicExpandLegacyPass());
1009
1010 if (TM.getOptLevel() > CodeGenOptLevel::None) {
1011 addPass(P: createAMDGPUPromoteAlloca());
1012
1013 if (isPassEnabled(Opt: EnableScalarIRPasses))
1014 addStraightLineScalarOptimizationPasses();
1015
1016 if (EnableAMDGPUAliasAnalysis) {
1017 addPass(P: createAMDGPUAAWrapperPass());
1018 addPass(P: createExternalAAWrapperPass(Callback: [](Pass &P, Function &,
1019 AAResults &AAR) {
1020 if (auto *WrapperPass = P.getAnalysisIfAvailable<AMDGPUAAWrapperPass>())
1021 AAR.addAAResult(AAResult&: WrapperPass->getResult());
1022 }));
1023 }
1024
1025 if (TM.getTargetTriple().getArch() == Triple::amdgcn) {
1026 // TODO: May want to move later or split into an early and late one.
1027 addPass(P: createAMDGPUCodeGenPreparePass());
1028 }
1029
1030 // Try to hoist loop invariant parts of divisions AMDGPUCodeGenPrepare may
1031 // have expanded.
1032 if (TM.getOptLevel() > CodeGenOptLevel::Less)
1033 addPass(P: createLICMPass());
1034 }
1035
1036 TargetPassConfig::addIRPasses();
1037
1038 // EarlyCSE is not always strong enough to clean up what LSR produces. For
1039 // example, GVN can combine
1040 //
1041 // %0 = add %a, %b
1042 // %1 = add %b, %a
1043 //
1044 // and
1045 //
1046 // %0 = shl nsw %a, 2
1047 // %1 = shl %a, 2
1048 //
1049 // but EarlyCSE can do neither of them.
1050 if (isPassEnabled(Opt: EnableScalarIRPasses))
1051 addEarlyCSEOrGVNPass();
1052}
1053
1054void AMDGPUPassConfig::addCodeGenPrepare() {
1055 if (TM->getTargetTriple().getArch() == Triple::amdgcn) {
1056 // FIXME: This pass adds 2 hacky attributes that can be replaced with an
1057 // analysis, and should be removed.
1058 addPass(P: createAMDGPUAnnotateKernelFeaturesPass());
1059 }
1060
1061 if (TM->getTargetTriple().getArch() == Triple::amdgcn &&
1062 EnableLowerKernelArguments)
1063 addPass(P: createAMDGPULowerKernelArgumentsPass());
1064
1065 if (TM->getTargetTriple().getArch() == Triple::amdgcn) {
1066 // This lowering has been placed after codegenprepare to take advantage of
1067 // address mode matching (which is why it isn't put with the LDS lowerings).
1068 // It could be placed anywhere before uniformity annotations (an analysis
1069 // that it changes by splitting up fat pointers into their components)
1070 // but has been put before switch lowering and CFG flattening so that those
1071 // passes can run on the more optimized control flow this pass creates in
1072 // many cases.
1073 //
1074 // FIXME: This should ideally be put after the LoadStoreVectorizer.
1075 // However, due to some annoying facts about ResourceUsageAnalysis,
1076 // (especially as exercised in the resource-usage-dead-function test),
1077 // we need all the function passes codegenprepare all the way through
1078 // said resource usage analysis to run on the call graph produced
1079 // before codegenprepare runs (because codegenprepare will knock some
1080 // nodes out of the graph, which leads to function-level passes not
1081 // being run on them, which causes crashes in the resource usage analysis).
1082 addPass(P: createAMDGPULowerBufferFatPointersPass());
1083 // In accordance with the above FIXME, manually force all the
1084 // function-level passes into a CGSCCPassManager.
1085 addPass(P: new DummyCGSCCPass());
1086 }
1087
1088 TargetPassConfig::addCodeGenPrepare();
1089
1090 if (isPassEnabled(Opt: EnableLoadStoreVectorizer))
1091 addPass(P: createLoadStoreVectorizerPass());
1092
  // The LowerSwitch pass may introduce unreachable blocks that can cause
  // unexpected behavior for subsequent passes. Placing it here ensures that
  // those blocks are cleaned up by the UnreachableBlockElim pass inserted
  // next in the pass flow.
1097 addPass(P: createLowerSwitchPass());
1098}
1099
1100bool AMDGPUPassConfig::addPreISel() {
1101 if (TM->getOptLevel() > CodeGenOptLevel::None)
1102 addPass(P: createFlattenCFGPass());
1103 return false;
1104}
1105
1106bool AMDGPUPassConfig::addInstSelector() {
1107 addPass(P: createAMDGPUISelDag(TM&: getAMDGPUTargetMachine(), OptLevel: getOptLevel()));
1108 return false;
1109}
1110
1111bool AMDGPUPassConfig::addGCPasses() {
1112 // Do nothing. GC is not supported.
1113 return false;
1114}
1115
1116llvm::ScheduleDAGInstrs *
1117AMDGPUPassConfig::createMachineScheduler(MachineSchedContext *C) const {
1118 const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
1119 ScheduleDAGMILive *DAG = createGenericSchedLive(C);
1120 DAG->addMutation(Mutation: createLoadClusterDAGMutation(TII: DAG->TII, TRI: DAG->TRI));
1121 if (ST.shouldClusterStores())
1122 DAG->addMutation(Mutation: createStoreClusterDAGMutation(TII: DAG->TII, TRI: DAG->TRI));
1123 return DAG;
1124}
1125
1126MachineFunctionInfo *R600TargetMachine::createMachineFunctionInfo(
1127 BumpPtrAllocator &Allocator, const Function &F,
1128 const TargetSubtargetInfo *STI) const {
1129 return R600MachineFunctionInfo::create<R600MachineFunctionInfo>(
1130 Allocator, F, static_cast<const R600Subtarget *>(STI));
1131}
1132
1133//===----------------------------------------------------------------------===//
1134// GCN Pass Setup
1135//===----------------------------------------------------------------------===//
1136
1137ScheduleDAGInstrs *GCNPassConfig::createMachineScheduler(
1138 MachineSchedContext *C) const {
1139 const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
1140 if (ST.enableSIScheduler())
1141 return createSIMachineScheduler(C);
1142
1143 if (EnableMaxIlpSchedStrategy)
1144 return createGCNMaxILPMachineScheduler(C);
1145
1146 return createGCNMaxOccupancyMachineScheduler(C);
1147}
1148
1149bool GCNPassConfig::addPreISel() {
1150 AMDGPUPassConfig::addPreISel();
1151
1152 if (TM->getOptLevel() > CodeGenOptLevel::None)
1153 addPass(P: createAMDGPULateCodeGenPreparePass());
1154
1155 if (TM->getOptLevel() > CodeGenOptLevel::None)
1156 addPass(P: createSinkingPass());
1157
1158 // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit
1159 // regions formed by them.
1160 addPass(PassID: &AMDGPUUnifyDivergentExitNodesID);
1161 if (!LateCFGStructurize && !DisableStructurizer) {
1162 if (EnableStructurizerWorkarounds) {
1163 addPass(P: createFixIrreduciblePass());
1164 addPass(P: createUnifyLoopExitsPass());
1165 }
1166 addPass(P: createStructurizeCFGPass(SkipUniformRegions: false)); // true -> SkipUniformRegions
1167 }
1168 addPass(P: createAMDGPUAnnotateUniformValues());
1169 if (!LateCFGStructurize && !DisableStructurizer) {
1170 addPass(P: createSIAnnotateControlFlowPass());
1171 // TODO: Move this right after structurizeCFG to avoid extra divergence
1172 // analysis. This depends on stopping SIAnnotateControlFlow from making
1173 // control flow modifications.
1174 addPass(P: createAMDGPURewriteUndefForPHILegacyPass());
1175 }
1176 addPass(P: createLCSSAPass());
1177
1178 if (TM->getOptLevel() > CodeGenOptLevel::Less)
1179 addPass(PassID: &AMDGPUPerfHintAnalysisID);
1180
1181 return false;
1182}
1183
1184void GCNPassConfig::addMachineSSAOptimization() {
1185 TargetPassConfig::addMachineSSAOptimization();
1186
1187 // We want to fold operands after PeepholeOptimizer has run (or as part of
1188 // it), because it will eliminate extra copies making it easier to fold the
1189 // real source operand. We want to eliminate dead instructions after, so that
1190 // we see fewer uses of the copies. We then need to clean up the dead
1191 // instructions leftover after the operands are folded as well.
1192 //
1193 // XXX - Can we get away without running DeadMachineInstructionElim again?
1194 addPass(PassID: &SIFoldOperandsID);
1195 if (EnableDPPCombine)
1196 addPass(PassID: &GCNDPPCombineID);
1197 addPass(PassID: &SILoadStoreOptimizerID);
1198 if (isPassEnabled(Opt: EnableSDWAPeephole)) {
1199 addPass(PassID: &SIPeepholeSDWAID);
1200 addPass(PassID: &EarlyMachineLICMID);
1201 addPass(PassID: &MachineCSEID);
1202 addPass(PassID: &SIFoldOperandsID);
1203 }
1204 addPass(PassID: &DeadMachineInstructionElimID);
1205 addPass(P: createSIShrinkInstructionsPass());
1206}
1207
1208bool GCNPassConfig::addILPOpts() {
1209 if (EnableEarlyIfConversion)
1210 addPass(PassID: &EarlyIfConverterID);
1211
1212 TargetPassConfig::addILPOpts();
1213 return false;
1214}
1215
1216bool GCNPassConfig::addInstSelector() {
1217 AMDGPUPassConfig::addInstSelector();
1218 addPass(PassID: &SIFixSGPRCopiesID);
1219 addPass(P: createSILowerI1CopiesPass());
1220 return false;
1221}
1222
1223bool GCNPassConfig::addIRTranslator() {
1224 addPass(P: new IRTranslator(getOptLevel()));
1225 return false;
1226}
1227
1228void GCNPassConfig::addPreLegalizeMachineIR() {
1229 bool IsOptNone = getOptLevel() == CodeGenOptLevel::None;
1230 addPass(P: createAMDGPUPreLegalizeCombiner(IsOptNone));
1231 addPass(P: new Localizer());
1232}
1233
1234bool GCNPassConfig::addLegalizeMachineIR() {
1235 addPass(P: new Legalizer());
1236 return false;
1237}
1238
1239void GCNPassConfig::addPreRegBankSelect() {
1240 bool IsOptNone = getOptLevel() == CodeGenOptLevel::None;
1241 addPass(P: createAMDGPUPostLegalizeCombiner(IsOptNone));
1242 addPass(P: createAMDGPUGlobalISelDivergenceLoweringPass());
1243}
1244
1245bool GCNPassConfig::addRegBankSelect() {
1246 addPass(P: new AMDGPURegBankSelect());
1247 return false;
1248}
1249
1250void GCNPassConfig::addPreGlobalInstructionSelect() {
1251 bool IsOptNone = getOptLevel() == CodeGenOptLevel::None;
1252 addPass(P: createAMDGPURegBankCombiner(IsOptNone));
1253}
1254
1255bool GCNPassConfig::addGlobalInstructionSelect() {
1256 addPass(P: new InstructionSelect(getOptLevel()));
1257 return false;
1258}
1259
1260void GCNPassConfig::addPreRegAlloc() {
1261 if (LateCFGStructurize) {
1262 addPass(P: createAMDGPUMachineCFGStructurizerPass());
1263 }
1264}
1265
1266void GCNPassConfig::addFastRegAlloc() {
1267 // FIXME: We have to disable the verifier here because of PHIElimination +
1268 // TwoAddressInstructions disabling it.
1269
1270 // This must be run immediately after phi elimination and before
1271 // TwoAddressInstructions, otherwise the processing of the tied operand of
1272 // SI_ELSE will introduce a copy of the tied operand source after the else.
1273 insertPass(TargetPassID: &PHIEliminationID, InsertedPassID: &SILowerControlFlowID);
1274
1275 insertPass(TargetPassID: &TwoAddressInstructionPassID, InsertedPassID: &SIWholeQuadModeID);
1276
1277 TargetPassConfig::addFastRegAlloc();
1278}
1279
1280void GCNPassConfig::addOptimizedRegAlloc() {
1281 // Allow the scheduler to run before SIWholeQuadMode inserts exec manipulation
1282 // instructions that cause scheduling barriers.
1283 insertPass(TargetPassID: &MachineSchedulerID, InsertedPassID: &SIWholeQuadModeID);
1284
1285 if (OptExecMaskPreRA)
1286 insertPass(TargetPassID: &MachineSchedulerID, InsertedPassID: &SIOptimizeExecMaskingPreRAID);
1287
1288 if (EnableRewritePartialRegUses)
1289 insertPass(TargetPassID: &RenameIndependentSubregsID, InsertedPassID: &GCNRewritePartialRegUsesID);
1290
1291 if (isPassEnabled(Opt: EnablePreRAOptimizations))
1292 insertPass(TargetPassID: &RenameIndependentSubregsID, InsertedPassID: &GCNPreRAOptimizationsID);
1293
1294 // This is not an essential optimization and it has a noticeable impact on
1295 // compilation time, so we only enable it from O2.
1296 if (TM->getOptLevel() > CodeGenOptLevel::Less)
1297 insertPass(TargetPassID: &MachineSchedulerID, InsertedPassID: &SIFormMemoryClausesID);
1298
  // FIXME: When an instruction has a killed operand and the instruction is
  // inside a bundle, it seems that only the BUNDLE instruction appears as the
  // kill of the register in LiveVariables. This triggers a verifier failure;
  // we should fix it and enable the verifier.
1303 if (OptVGPRLiveRange)
1304 insertPass(TargetPassID: &LiveVariablesID, InsertedPassID: &SIOptimizeVGPRLiveRangeID);
1305 // This must be run immediately after phi elimination and before
1306 // TwoAddressInstructions, otherwise the processing of the tied operand of
1307 // SI_ELSE will introduce a copy of the tied operand source after the else.
1308 insertPass(TargetPassID: &PHIEliminationID, InsertedPassID: &SILowerControlFlowID);
1309
1310 if (EnableDCEInRA)
1311 insertPass(TargetPassID: &DetectDeadLanesID, InsertedPassID: &DeadMachineInstructionElimID);
1312
1313 TargetPassConfig::addOptimizedRegAlloc();
1314}
1315
1316bool GCNPassConfig::addPreRewrite() {
1317 addPass(PassID: &SILowerWWMCopiesID);
1318 if (EnableRegReassign)
1319 addPass(PassID: &GCNNSAReassignID);
1320 return true;
1321}
1322
1323FunctionPass *GCNPassConfig::createSGPRAllocPass(bool Optimized) {
1324 // Initialize the global default.
1325 llvm::call_once(flag&: InitializeDefaultSGPRRegisterAllocatorFlag,
1326 F&: initializeDefaultSGPRRegisterAllocatorOnce);
1327
1328 RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault();
1329 if (Ctor != useDefaultRegisterAllocator)
1330 return Ctor();
1331
1332 if (Optimized)
1333 return createGreedyRegisterAllocator(F: onlyAllocateSGPRs);
1334
1335 return createFastRegisterAllocator(F: onlyAllocateSGPRs, ClearVirtRegs: false);
1336}
1337
1338FunctionPass *GCNPassConfig::createVGPRAllocPass(bool Optimized) {
1339 // Initialize the global default.
1340 llvm::call_once(flag&: InitializeDefaultVGPRRegisterAllocatorFlag,
1341 F&: initializeDefaultVGPRRegisterAllocatorOnce);
1342
1343 RegisterRegAlloc::FunctionPassCtor Ctor = VGPRRegisterRegAlloc::getDefault();
1344 if (Ctor != useDefaultRegisterAllocator)
1345 return Ctor();
1346
1347 if (Optimized)
1348 return createGreedyVGPRRegisterAllocator();
1349
1350 return createFastVGPRRegisterAllocator();
1351}
1352
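// Register allocation is split into separate SGPR and VGPR phases (see
// addRegAssignAndRewriteFast/addRegAssignAndRewriteOptimized), so the generic
// single-pass hook must never be reached.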
1353FunctionPass *GCNPassConfig::createRegAllocPass(bool Optimized) {
1354 llvm_unreachable("should not be used");
1355}
1356
1357static const char RegAllocOptNotSupportedMessage[] =
1358 "-regalloc not supported with amdgcn. Use -sgpr-regalloc and -vgpr-regalloc";
1359
1360bool GCNPassConfig::addRegAssignAndRewriteFast() {
1361 if (!usingDefaultRegAlloc())
1362 report_fatal_error(reason: RegAllocOptNotSupportedMessage);
1363
1364 addPass(PassID: &GCNPreRALongBranchRegID);
1365
1366 addPass(P: createSGPRAllocPass(Optimized: false));
1367
1368 // Equivalent of PEI for SGPRs.
1369 addPass(PassID: &SILowerSGPRSpillsID);
1370 addPass(PassID: &SIPreAllocateWWMRegsID);
1371
1372 addPass(P: createVGPRAllocPass(Optimized: false));
1373
1374 addPass(PassID: &SILowerWWMCopiesID);
1375 return true;
1376}
1377
1378bool GCNPassConfig::addRegAssignAndRewriteOptimized() {
1379 if (!usingDefaultRegAlloc())
1380 report_fatal_error(reason: RegAllocOptNotSupportedMessage);
1381
1382 addPass(PassID: &GCNPreRALongBranchRegID);
1383
1384 addPass(P: createSGPRAllocPass(Optimized: true));
1385
1386 // Commit allocated register changes. This is mostly necessary because too
1387 // many things rely on the use lists of the physical registers, such as the
1388 // verifier. This is only necessary with allocators which use LiveIntervals,
1389 // since FastRegAlloc does the replacements itself.
1390 addPass(P: createVirtRegRewriter(ClearVirtRegs: false));
1391
1392 // Equivalent of PEI for SGPRs.
1393 addPass(PassID: &SILowerSGPRSpillsID);
1394 addPass(PassID: &SIPreAllocateWWMRegsID);
1395
1396 addPass(P: createVGPRAllocPass(Optimized: true));
1397
1398 addPreRewrite();
1399 addPass(PassID: &VirtRegRewriterID);
1400
1401 addPass(PassID: &AMDGPUMarkLastScratchLoadID);
1402
1403 return true;
1404}
1405
1406void GCNPassConfig::addPostRegAlloc() {
1407 addPass(PassID: &SIFixVGPRCopiesID);
1408 if (getOptLevel() > CodeGenOptLevel::None)
1409 addPass(PassID: &SIOptimizeExecMaskingID);
1410 TargetPassConfig::addPostRegAlloc();
1411}
1412
1413void GCNPassConfig::addPreSched2() {
1414 if (TM->getOptLevel() > CodeGenOptLevel::None)
1415 addPass(P: createSIShrinkInstructionsPass());
1416 addPass(PassID: &SIPostRABundlerID);
1417}
1418
1419void GCNPassConfig::addPreEmitPass() {
1420 if (isPassEnabled(Opt: EnableVOPD, Level: CodeGenOptLevel::Less))
1421 addPass(PassID: &GCNCreateVOPDID);
1422 addPass(P: createSIMemoryLegalizerPass());
1423 addPass(P: createSIInsertWaitcntsPass());
1424
1425 addPass(P: createSIModeRegisterPass());
1426
1427 if (getOptLevel() > CodeGenOptLevel::None)
1428 addPass(PassID: &SIInsertHardClausesID);
1429
1430 addPass(PassID: &SILateBranchLoweringPassID);
1431 if (isPassEnabled(Opt: EnableSetWavePriority, Level: CodeGenOptLevel::Less))
1432 addPass(P: createAMDGPUSetWavePriorityPass());
1433 if (getOptLevel() > CodeGenOptLevel::None)
1434 addPass(PassID: &SIPreEmitPeepholeID);
1435 // The hazard recognizer that runs as part of the post-ra scheduler does not
  // guarantee to be able to handle all hazards correctly. This is because if
  // there are multiple scheduling regions in a basic block, the regions are
  // scheduled bottom up, so when we begin to schedule a region we don't know
  // what instructions were emitted directly before it.
1440 //
1441 // Here we add a stand-alone hazard recognizer pass which can handle all
1442 // cases.
1443 addPass(PassID: &PostRAHazardRecognizerID);
1444
1445 if (isPassEnabled(Opt: EnableInsertSingleUseVDST, Level: CodeGenOptLevel::Less))
1446 addPass(PassID: &AMDGPUInsertSingleUseVDSTID);
1447
1448 if (isPassEnabled(Opt: EnableInsertDelayAlu, Level: CodeGenOptLevel::Less))
1449 addPass(PassID: &AMDGPUInsertDelayAluID);
1450
1451 addPass(PassID: &BranchRelaxationPassID);
1452}
1453
TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) {
  return new GCNPassConfig(*this, PM);
}

void GCNTargetMachine::registerMachineRegisterInfoCallback(
    MachineFunction &MF) const {
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  MF.getRegInfo().addDelegate(MFI);
}

MachineFunctionInfo *GCNTargetMachine::createMachineFunctionInfo(
    BumpPtrAllocator &Allocator, const Function &F,
    const TargetSubtargetInfo *STI) const {
  return SIMachineFunctionInfo::create<SIMachineFunctionInfo>(
      Allocator, F, static_cast<const GCNSubtarget *>(STI));
}

yaml::MachineFunctionInfo *GCNTargetMachine::createDefaultFuncInfoYAML() const {
  return new yaml::SIMachineFunctionInfo();
}

yaml::MachineFunctionInfo *
GCNTargetMachine::convertFuncInfoToYAML(const MachineFunction &MF) const {
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return new yaml::SIMachineFunctionInfo(
      *MFI, *MF.getSubtarget<GCNSubtarget>().getRegisterInfo(), MF);
}

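// Deserialize the SI-specific fields of parsed MIR YAML back into
// SIMachineFunctionInfo, returning true (with Error/SourceRange set) on the
// first failure. For example, a machineFunctionInfo block along these lines
// (field names shown for illustration, not an exhaustive list):
//
//   machineFunctionInfo:
//     scratchRSrcReg:    '$sgpr0_sgpr1_sgpr2_sgpr3'
//     frameOffsetReg:    '$sgpr33'
//     stackPtrOffsetReg: '$sgpr32'
//
// is validated and applied field by field below.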
bool GCNTargetMachine::parseMachineFunctionInfo(
    const yaml::MachineFunctionInfo &MFI_, PerFunctionMIParsingState &PFS,
    SMDiagnostic &Error, SMRange &SourceRange) const {
  const yaml::SIMachineFunctionInfo &YamlMFI =
      static_cast<const yaml::SIMachineFunctionInfo &>(MFI_);
  MachineFunction &MF = PFS.MF;
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();

  if (MFI->initializeBaseYamlFields(YamlMFI, MF, PFS, Error, SourceRange))
    return true;

  if (MFI->Occupancy == 0) {
    // Fix up the subtarget-dependent default value.
    MFI->Occupancy = ST.computeOccupancy(MF.getFunction(), MFI->getLDSSize());
  }

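  // Resolve a named register reference from the YAML string; on failure, point
  // the diagnostic at the offending value.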
  auto parseRegister = [&](const yaml::StringValue &RegName, Register &RegVal) {
    Register TempReg;
    if (parseNamedRegisterReference(PFS, TempReg, RegName.Value, Error)) {
      SourceRange = RegName.SourceRange;
      return true;
    }
    RegVal = TempReg;

    return false;
  };

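  // An empty string means the optional field was not specified.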
  auto parseOptionalRegister = [&](const yaml::StringValue &RegName,
                                   Register &RegVal) {
    return !RegName.Value.empty() && parseRegister(RegName, RegVal);
  };

  if (parseOptionalRegister(YamlMFI.VGPRForAGPRCopy, MFI->VGPRForAGPRCopy))
    return true;

  if (parseOptionalRegister(YamlMFI.SGPRForEXECCopy, MFI->SGPRForEXECCopy))
    return true;

  if (parseOptionalRegister(YamlMFI.LongBranchReservedReg,
                            MFI->LongBranchReservedReg))
    return true;

  auto diagnoseRegisterClass = [&](const yaml::StringValue &RegName) {
    // Create a diagnostic for the register string literal.
    const MemoryBuffer &Buffer =
        *PFS.SM->getMemoryBuffer(PFS.SM->getMainFileID());
    Error = SMDiagnostic(*PFS.SM, SMLoc(), Buffer.getBufferIdentifier(), 1,
                         RegName.Value.size(), SourceMgr::DK_Error,
                         "incorrect register class for field", RegName.Value,
                         std::nullopt, std::nullopt);
    SourceRange = RegName.SourceRange;
    return true;
  };

  if (parseRegister(YamlMFI.ScratchRSrcReg, MFI->ScratchRSrcReg) ||
      parseRegister(YamlMFI.FrameOffsetReg, MFI->FrameOffsetReg) ||
      parseRegister(YamlMFI.StackPtrOffsetReg, MFI->StackPtrOffsetReg))
    return true;

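  // Check that each parsed register is in the register class its field
  // requires; the sentinel default registers are exempt from the check.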
  if (MFI->ScratchRSrcReg != AMDGPU::PRIVATE_RSRC_REG &&
      !AMDGPU::SGPR_128RegClass.contains(MFI->ScratchRSrcReg)) {
    return diagnoseRegisterClass(YamlMFI.ScratchRSrcReg);
  }

  if (MFI->FrameOffsetReg != AMDGPU::FP_REG &&
      !AMDGPU::SGPR_32RegClass.contains(MFI->FrameOffsetReg)) {
    return diagnoseRegisterClass(YamlMFI.FrameOffsetReg);
  }

  if (MFI->StackPtrOffsetReg != AMDGPU::SP_REG &&
      !AMDGPU::SGPR_32RegClass.contains(MFI->StackPtrOffsetReg)) {
    return diagnoseRegisterClass(YamlMFI.StackPtrOffsetReg);
  }

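  // Reserve every WWM register listed in the YAML.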
  for (const auto &YamlReg : YamlMFI.WWMReservedRegs) {
    Register ParsedReg;
    if (parseRegister(YamlReg, ParsedReg))
      return true;

    MFI->reserveWWMRegister(ParsedReg);
  }

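  // Parse one argument descriptor: either a named register (validated against
  // the expected register class) or a stack offset, apply the optional mask,
  // and accumulate the user/system SGPR counts.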
  auto parseAndCheckArgument = [&](const std::optional<yaml::SIArgument> &A,
                                   const TargetRegisterClass &RC,
                                   ArgDescriptor &Arg, unsigned UserSGPRs,
                                   unsigned SystemSGPRs) {
    // Skip parsing if it's not present.
    if (!A)
      return false;

    if (A->IsRegister) {
      Register Reg;
      if (parseNamedRegisterReference(PFS, Reg, A->RegisterName.Value, Error)) {
        SourceRange = A->RegisterName.SourceRange;
        return true;
      }
      if (!RC.contains(Reg))
        return diagnoseRegisterClass(A->RegisterName);
      Arg = ArgDescriptor::createRegister(Reg);
    } else
      Arg = ArgDescriptor::createStack(A->StackOffset);
    // Check and apply the optional mask.
    if (A->Mask)
      Arg = ArgDescriptor::createArg(Arg, *A->Mask);

    MFI->NumUserSGPRs += UserSGPRs;
    MFI->NumSystemSGPRs += SystemSGPRs;
    return false;
  };

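  // Parse each explicitly provided argument, bailing out on the first failure.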
  if (YamlMFI.ArgInfo &&
      (parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentBuffer,
                             AMDGPU::SGPR_128RegClass,
                             MFI->ArgInfo.PrivateSegmentBuffer, 4, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->DispatchPtr,
                             AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchPtr,
                             2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->QueuePtr, AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.QueuePtr, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->KernargSegmentPtr,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.KernargSegmentPtr, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->DispatchID,
                             AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchID,
                             2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->FlatScratchInit,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.FlatScratchInit, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentSize,
                             AMDGPU::SGPR_32RegClass,
                             MFI->ArgInfo.PrivateSegmentSize, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->LDSKernelId,
                             AMDGPU::SGPR_32RegClass,
                             MFI->ArgInfo.LDSKernelId, 0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDX,
                             AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDX,
                             0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDY,
                             AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDY,
                             0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDZ,
                             AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDZ,
                             0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupInfo,
                             AMDGPU::SGPR_32RegClass,
                             MFI->ArgInfo.WorkGroupInfo, 0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentWaveByteOffset,
                             AMDGPU::SGPR_32RegClass,
                             MFI->ArgInfo.PrivateSegmentWaveByteOffset, 0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->ImplicitArgPtr,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.ImplicitArgPtr, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->ImplicitBufferPtr,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.ImplicitBufferPtr, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDX,
                             AMDGPU::VGPR_32RegClass,
                             MFI->ArgInfo.WorkItemIDX, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDY,
                             AMDGPU::VGPR_32RegClass,
                             MFI->ArgInfo.WorkItemIDY, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDZ,
                             AMDGPU::VGPR_32RegClass,
                             MFI->ArgInfo.WorkItemIDZ, 0, 0)))
    return true;

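  // Only apply the IEEE and DX10-clamp settings on subtargets that have the
  // corresponding mode bits.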
  if (ST.hasIEEEMode())
    MFI->Mode.IEEE = YamlMFI.Mode.IEEE;
  if (ST.hasDX10ClampMode())
    MFI->Mode.DX10Clamp = YamlMFI.Mode.DX10Clamp;

  // FIXME: Move proper support for denormal-fp-math into base MachineFunction
  MFI->Mode.FP32Denormals.Input = YamlMFI.Mode.FP32InputDenormals
                                      ? DenormalMode::IEEE
                                      : DenormalMode::PreserveSign;
  MFI->Mode.FP32Denormals.Output = YamlMFI.Mode.FP32OutputDenormals
                                       ? DenormalMode::IEEE
                                       : DenormalMode::PreserveSign;

  MFI->Mode.FP64FP16Denormals.Input = YamlMFI.Mode.FP64FP16InputDenormals
                                          ? DenormalMode::IEEE
                                          : DenormalMode::PreserveSign;
  MFI->Mode.FP64FP16Denormals.Output = YamlMFI.Mode.FP64FP16OutputDenormals
                                           ? DenormalMode::IEEE
                                           : DenormalMode::PreserveSign;

  return false;
}