1 | //===-- AMDGPUTargetMachine.cpp - TargetMachine for hw codegen targets-----===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | /// \file |
10 | /// The AMDGPU target machine contains all of the hardware specific |
11 | /// information needed to emit code for SI+ GPUs. |
12 | // |
13 | //===----------------------------------------------------------------------===// |
14 | |
15 | #include "AMDGPUTargetMachine.h" |
16 | #include "AMDGPU.h" |
17 | #include "AMDGPUAliasAnalysis.h" |
18 | #include "AMDGPUCtorDtorLowering.h" |
19 | #include "AMDGPUExportClustering.h" |
20 | #include "AMDGPUIGroupLP.h" |
21 | #include "AMDGPUMacroFusion.h" |
22 | #include "AMDGPURegBankSelect.h" |
23 | #include "AMDGPUTargetObjectFile.h" |
24 | #include "AMDGPUTargetTransformInfo.h" |
25 | #include "AMDGPUUnifyDivergentExitNodes.h" |
26 | #include "GCNIterativeScheduler.h" |
27 | #include "GCNSchedStrategy.h" |
28 | #include "GCNVOPDUtils.h" |
29 | #include "R600.h" |
30 | #include "R600MachineFunctionInfo.h" |
31 | #include "R600TargetMachine.h" |
32 | #include "SIMachineFunctionInfo.h" |
33 | #include "SIMachineScheduler.h" |
34 | #include "TargetInfo/AMDGPUTargetInfo.h" |
35 | #include "Utils/AMDGPUBaseInfo.h" |
36 | #include "llvm/Analysis/CGSCCPassManager.h" |
37 | #include "llvm/Analysis/CallGraphSCCPass.h" |
38 | #include "llvm/CodeGen/GlobalISel/CSEInfo.h" |
39 | #include "llvm/CodeGen/GlobalISel/IRTranslator.h" |
40 | #include "llvm/CodeGen/GlobalISel/InstructionSelect.h" |
41 | #include "llvm/CodeGen/GlobalISel/Legalizer.h" |
42 | #include "llvm/CodeGen/GlobalISel/Localizer.h" |
43 | #include "llvm/CodeGen/GlobalISel/RegBankSelect.h" |
44 | #include "llvm/CodeGen/MIRParser/MIParser.h" |
45 | #include "llvm/CodeGen/Passes.h" |
46 | #include "llvm/CodeGen/RegAllocRegistry.h" |
47 | #include "llvm/CodeGen/TargetPassConfig.h" |
48 | #include "llvm/IR/IntrinsicsAMDGPU.h" |
49 | #include "llvm/IR/PassManager.h" |
50 | #include "llvm/IR/PatternMatch.h" |
51 | #include "llvm/InitializePasses.h" |
52 | #include "llvm/MC/TargetRegistry.h" |
53 | #include "llvm/Passes/PassBuilder.h" |
54 | #include "llvm/Transforms/HipStdPar/HipStdPar.h" |
55 | #include "llvm/Transforms/IPO.h" |
56 | #include "llvm/Transforms/IPO/AlwaysInliner.h" |
57 | #include "llvm/Transforms/IPO/GlobalDCE.h" |
58 | #include "llvm/Transforms/IPO/Internalize.h" |
59 | #include "llvm/Transforms/Scalar.h" |
60 | #include "llvm/Transforms/Scalar/GVN.h" |
61 | #include "llvm/Transforms/Scalar/InferAddressSpaces.h" |
62 | #include "llvm/Transforms/Utils.h" |
63 | #include "llvm/Transforms/Utils/SimplifyLibCalls.h" |
64 | #include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h" |
65 | #include <optional> |
66 | |
67 | using namespace llvm; |
68 | using namespace llvm::PatternMatch; |
69 | |
70 | namespace { |
71 | class SGPRRegisterRegAlloc : public RegisterRegAllocBase<SGPRRegisterRegAlloc> { |
72 | public: |
73 | SGPRRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C) |
74 | : RegisterRegAllocBase(N, D, C) {} |
75 | }; |
76 | |
77 | class VGPRRegisterRegAlloc : public RegisterRegAllocBase<VGPRRegisterRegAlloc> { |
78 | public: |
79 | VGPRRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C) |
80 | : RegisterRegAllocBase(N, D, C) {} |
81 | }; |
82 | |
83 | static bool onlyAllocateSGPRs(const TargetRegisterInfo &TRI, |
84 | const TargetRegisterClass &RC) { |
85 | return static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(&RC); |
86 | } |
87 | |
88 | static bool onlyAllocateVGPRs(const TargetRegisterInfo &TRI, |
89 | const TargetRegisterClass &RC) { |
90 | return !static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(&RC); |
91 | } |
92 | |
93 | |
94 | /// -{sgpr|vgpr}-regalloc=... command line option. |
95 | static FunctionPass *useDefaultRegisterAllocator() { return nullptr; } |
96 | |
97 | /// A dummy default pass factory indicates whether the register allocator is |
98 | /// overridden on the command line. |
99 | static llvm::once_flag InitializeDefaultSGPRRegisterAllocatorFlag; |
100 | static llvm::once_flag InitializeDefaultVGPRRegisterAllocatorFlag; |
101 | |
102 | static SGPRRegisterRegAlloc |
103 | defaultSGPRRegAlloc("default" , |
104 | "pick SGPR register allocator based on -O option" , |
105 | useDefaultRegisterAllocator); |
106 | |
107 | static cl::opt<SGPRRegisterRegAlloc::FunctionPassCtor, false, |
108 | RegisterPassParser<SGPRRegisterRegAlloc>> |
109 | SGPRRegAlloc("sgpr-regalloc" , cl::Hidden, cl::init(Val: &useDefaultRegisterAllocator), |
110 | cl::desc("Register allocator to use for SGPRs" )); |
111 | |
112 | static cl::opt<VGPRRegisterRegAlloc::FunctionPassCtor, false, |
113 | RegisterPassParser<VGPRRegisterRegAlloc>> |
114 | VGPRRegAlloc("vgpr-regalloc" , cl::Hidden, cl::init(Val: &useDefaultRegisterAllocator), |
115 | cl::desc("Register allocator to use for VGPRs" )); |
116 | |
117 | |
118 | static void initializeDefaultSGPRRegisterAllocatorOnce() { |
119 | RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault(); |
120 | |
121 | if (!Ctor) { |
122 | Ctor = SGPRRegAlloc; |
123 | SGPRRegisterRegAlloc::setDefault(SGPRRegAlloc); |
124 | } |
125 | } |
126 | |
127 | static void initializeDefaultVGPRRegisterAllocatorOnce() { |
128 | RegisterRegAlloc::FunctionPassCtor Ctor = VGPRRegisterRegAlloc::getDefault(); |
129 | |
130 | if (!Ctor) { |
131 | Ctor = VGPRRegAlloc; |
132 | VGPRRegisterRegAlloc::setDefault(VGPRRegAlloc); |
133 | } |
134 | } |
135 | |
136 | static FunctionPass *createBasicSGPRRegisterAllocator() { |
137 | return createBasicRegisterAllocator(F: onlyAllocateSGPRs); |
138 | } |
139 | |
140 | static FunctionPass *createGreedySGPRRegisterAllocator() { |
141 | return createGreedyRegisterAllocator(F: onlyAllocateSGPRs); |
142 | } |
143 | |
144 | static FunctionPass *createFastSGPRRegisterAllocator() { |
145 | return createFastRegisterAllocator(F: onlyAllocateSGPRs, ClearVirtRegs: false); |
146 | } |
147 | |
148 | static FunctionPass *createBasicVGPRRegisterAllocator() { |
149 | return createBasicRegisterAllocator(F: onlyAllocateVGPRs); |
150 | } |
151 | |
152 | static FunctionPass *createGreedyVGPRRegisterAllocator() { |
153 | return createGreedyRegisterAllocator(F: onlyAllocateVGPRs); |
154 | } |
155 | |
156 | static FunctionPass *createFastVGPRRegisterAllocator() { |
157 | return createFastRegisterAllocator(F: onlyAllocateVGPRs, ClearVirtRegs: true); |
158 | } |
159 | |
160 | static SGPRRegisterRegAlloc basicRegAllocSGPR( |
161 | "basic" , "basic register allocator" , createBasicSGPRRegisterAllocator); |
162 | static SGPRRegisterRegAlloc greedyRegAllocSGPR( |
163 | "greedy" , "greedy register allocator" , createGreedySGPRRegisterAllocator); |
164 | |
165 | static SGPRRegisterRegAlloc fastRegAllocSGPR( |
166 | "fast" , "fast register allocator" , createFastSGPRRegisterAllocator); |
167 | |
168 | |
169 | static VGPRRegisterRegAlloc basicRegAllocVGPR( |
170 | "basic" , "basic register allocator" , createBasicVGPRRegisterAllocator); |
171 | static VGPRRegisterRegAlloc greedyRegAllocVGPR( |
172 | "greedy" , "greedy register allocator" , createGreedyVGPRRegisterAllocator); |
173 | |
174 | static VGPRRegisterRegAlloc fastRegAllocVGPR( |
175 | "fast" , "fast register allocator" , createFastVGPRRegisterAllocator); |
176 | } |
177 | |
178 | static cl::opt<bool> |
179 | EnableEarlyIfConversion("amdgpu-early-ifcvt" , cl::Hidden, |
180 | cl::desc("Run early if-conversion" ), |
181 | cl::init(Val: false)); |
182 | |
183 | static cl::opt<bool> |
184 | OptExecMaskPreRA("amdgpu-opt-exec-mask-pre-ra" , cl::Hidden, |
185 | cl::desc("Run pre-RA exec mask optimizations" ), |
186 | cl::init(Val: true)); |
187 | |
188 | static cl::opt<bool> |
189 | LowerCtorDtor("amdgpu-lower-global-ctor-dtor" , |
190 | cl::desc("Lower GPU ctor / dtors to globals on the device." ), |
191 | cl::init(Val: true), cl::Hidden); |
192 | |
193 | // Option to disable vectorizer for tests. |
194 | static cl::opt<bool> EnableLoadStoreVectorizer( |
195 | "amdgpu-load-store-vectorizer" , |
196 | cl::desc("Enable load store vectorizer" ), |
197 | cl::init(Val: true), |
198 | cl::Hidden); |
199 | |
200 | // Option to control global loads scalarization |
201 | static cl::opt<bool> ScalarizeGlobal( |
202 | "amdgpu-scalarize-global-loads" , |
203 | cl::desc("Enable global load scalarization" ), |
204 | cl::init(Val: true), |
205 | cl::Hidden); |
206 | |
207 | // Option to run internalize pass. |
208 | static cl::opt<bool> InternalizeSymbols( |
209 | "amdgpu-internalize-symbols" , |
210 | cl::desc("Enable elimination of non-kernel functions and unused globals" ), |
211 | cl::init(Val: false), |
212 | cl::Hidden); |
213 | |
214 | // Option to inline all early. |
215 | static cl::opt<bool> EarlyInlineAll( |
216 | "amdgpu-early-inline-all" , |
217 | cl::desc("Inline all functions early" ), |
218 | cl::init(Val: false), |
219 | cl::Hidden); |
220 | |
221 | static cl::opt<bool> RemoveIncompatibleFunctions( |
222 | "amdgpu-enable-remove-incompatible-functions" , cl::Hidden, |
223 | cl::desc("Enable removal of functions when they" |
224 | "use features not supported by the target GPU" ), |
225 | cl::init(Val: true)); |
226 | |
227 | static cl::opt<bool> EnableSDWAPeephole( |
228 | "amdgpu-sdwa-peephole" , |
229 | cl::desc("Enable SDWA peepholer" ), |
230 | cl::init(Val: true)); |
231 | |
232 | static cl::opt<bool> EnableDPPCombine( |
233 | "amdgpu-dpp-combine" , |
234 | cl::desc("Enable DPP combiner" ), |
235 | cl::init(Val: true)); |
236 | |
237 | // Enable address space based alias analysis |
238 | static cl::opt<bool> EnableAMDGPUAliasAnalysis("enable-amdgpu-aa" , cl::Hidden, |
239 | cl::desc("Enable AMDGPU Alias Analysis" ), |
240 | cl::init(Val: true)); |
241 | |
242 | // Option to run late CFG structurizer |
243 | static cl::opt<bool, true> LateCFGStructurize( |
244 | "amdgpu-late-structurize" , |
245 | cl::desc("Enable late CFG structurization" ), |
246 | cl::location(L&: AMDGPUTargetMachine::EnableLateStructurizeCFG), |
247 | cl::Hidden); |
248 | |
249 | // Disable structurizer-based control-flow lowering in order to test convergence |
250 | // control tokens. This should eventually be replaced by the wave-transform. |
251 | static cl::opt<bool, true> DisableStructurizer( |
252 | "amdgpu-disable-structurizer" , |
253 | cl::desc("Disable structurizer for experiments; produces unusable code" ), |
254 | cl::location(L&: AMDGPUTargetMachine::DisableStructurizer), cl::ReallyHidden); |
255 | |
256 | // Enable lib calls simplifications |
257 | static cl::opt<bool> EnableLibCallSimplify( |
258 | "amdgpu-simplify-libcall" , |
259 | cl::desc("Enable amdgpu library simplifications" ), |
260 | cl::init(Val: true), |
261 | cl::Hidden); |
262 | |
263 | static cl::opt<bool> EnableLowerKernelArguments( |
264 | "amdgpu-ir-lower-kernel-arguments" , |
265 | cl::desc("Lower kernel argument loads in IR pass" ), |
266 | cl::init(Val: true), |
267 | cl::Hidden); |
268 | |
269 | static cl::opt<bool> EnableRegReassign( |
270 | "amdgpu-reassign-regs" , |
271 | cl::desc("Enable register reassign optimizations on gfx10+" ), |
272 | cl::init(Val: true), |
273 | cl::Hidden); |
274 | |
275 | static cl::opt<bool> OptVGPRLiveRange( |
276 | "amdgpu-opt-vgpr-liverange" , |
277 | cl::desc("Enable VGPR liverange optimizations for if-else structure" ), |
278 | cl::init(Val: true), cl::Hidden); |
279 | |
280 | static cl::opt<ScanOptions> AMDGPUAtomicOptimizerStrategy( |
281 | "amdgpu-atomic-optimizer-strategy" , |
282 | cl::desc("Select DPP or Iterative strategy for scan" ), |
283 | cl::init(Val: ScanOptions::Iterative), |
284 | cl::values( |
285 | clEnumValN(ScanOptions::DPP, "DPP" , "Use DPP operations for scan" ), |
286 | clEnumValN(ScanOptions::Iterative, "Iterative" , |
287 | "Use Iterative approach for scan" ), |
288 | clEnumValN(ScanOptions::None, "None" , "Disable atomic optimizer" ))); |
289 | |
290 | // Enable Mode register optimization |
291 | static cl::opt<bool> EnableSIModeRegisterPass( |
292 | "amdgpu-mode-register" , |
293 | cl::desc("Enable mode register pass" ), |
294 | cl::init(Val: true), |
295 | cl::Hidden); |
296 | |
297 | // Enable GFX11.5+ s_singleuse_vdst insertion |
298 | static cl::opt<bool> |
299 | EnableInsertSingleUseVDST("amdgpu-enable-single-use-vdst" , |
300 | cl::desc("Enable s_singleuse_vdst insertion" ), |
301 | cl::init(Val: false), cl::Hidden); |
302 | |
303 | // Enable GFX11+ s_delay_alu insertion |
304 | static cl::opt<bool> |
305 | EnableInsertDelayAlu("amdgpu-enable-delay-alu" , |
306 | cl::desc("Enable s_delay_alu insertion" ), |
307 | cl::init(Val: true), cl::Hidden); |
308 | |
309 | // Enable GFX11+ VOPD |
310 | static cl::opt<bool> |
311 | EnableVOPD("amdgpu-enable-vopd" , |
312 | cl::desc("Enable VOPD, dual issue of VALU in wave32" ), |
313 | cl::init(Val: true), cl::Hidden); |
314 | |
315 | // Option is used in lit tests to prevent deadcoding of patterns inspected. |
316 | static cl::opt<bool> |
317 | EnableDCEInRA("amdgpu-dce-in-ra" , |
318 | cl::init(Val: true), cl::Hidden, |
319 | cl::desc("Enable machine DCE inside regalloc" )); |
320 | |
321 | static cl::opt<bool> EnableSetWavePriority("amdgpu-set-wave-priority" , |
322 | cl::desc("Adjust wave priority" ), |
323 | cl::init(Val: false), cl::Hidden); |
324 | |
325 | static cl::opt<bool> EnableScalarIRPasses( |
326 | "amdgpu-scalar-ir-passes" , |
327 | cl::desc("Enable scalar IR passes" ), |
328 | cl::init(Val: true), |
329 | cl::Hidden); |
330 | |
331 | static cl::opt<bool> EnableStructurizerWorkarounds( |
332 | "amdgpu-enable-structurizer-workarounds" , |
333 | cl::desc("Enable workarounds for the StructurizeCFG pass" ), cl::init(Val: true), |
334 | cl::Hidden); |
335 | |
336 | static cl::opt<bool, true> EnableLowerModuleLDS( |
337 | "amdgpu-enable-lower-module-lds" , cl::desc("Enable lower module lds pass" ), |
338 | cl::location(L&: AMDGPUTargetMachine::EnableLowerModuleLDS), cl::init(Val: true), |
339 | cl::Hidden); |
340 | |
341 | static cl::opt<bool> EnablePreRAOptimizations( |
342 | "amdgpu-enable-pre-ra-optimizations" , |
343 | cl::desc("Enable Pre-RA optimizations pass" ), cl::init(Val: true), |
344 | cl::Hidden); |
345 | |
346 | static cl::opt<bool> EnablePromoteKernelArguments( |
347 | "amdgpu-enable-promote-kernel-arguments" , |
348 | cl::desc("Enable promotion of flat kernel pointer arguments to global" ), |
349 | cl::Hidden, cl::init(Val: true)); |
350 | |
351 | static cl::opt<bool> EnableImageIntrinsicOptimizer( |
352 | "amdgpu-enable-image-intrinsic-optimizer" , |
353 | cl::desc("Enable image intrinsic optimizer pass" ), cl::init(Val: true), |
354 | cl::Hidden); |
355 | |
356 | static cl::opt<bool> |
357 | EnableLoopPrefetch("amdgpu-loop-prefetch" , |
358 | cl::desc("Enable loop data prefetch on AMDGPU" ), |
359 | cl::Hidden, cl::init(Val: false)); |
360 | |
361 | static cl::opt<bool> EnableMaxIlpSchedStrategy( |
362 | "amdgpu-enable-max-ilp-scheduling-strategy" , |
363 | cl::desc("Enable scheduling strategy to maximize ILP for a single wave." ), |
364 | cl::Hidden, cl::init(Val: false)); |
365 | |
366 | static cl::opt<bool> EnableRewritePartialRegUses( |
367 | "amdgpu-enable-rewrite-partial-reg-uses" , |
368 | cl::desc("Enable rewrite partial reg uses pass" ), cl::init(Val: true), |
369 | cl::Hidden); |
370 | |
371 | static cl::opt<bool> EnableHipStdPar( |
372 | "amdgpu-enable-hipstdpar" , |
373 | cl::desc("Enable HIP Standard Parallelism Offload support" ), cl::init(Val: false), |
374 | cl::Hidden); |
375 | |
376 | extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { |
377 | // Register the target |
378 | RegisterTargetMachine<R600TargetMachine> X(getTheR600Target()); |
379 | RegisterTargetMachine<GCNTargetMachine> Y(getTheGCNTarget()); |
380 | |
381 | PassRegistry *PR = PassRegistry::getPassRegistry(); |
382 | initializeR600ClauseMergePassPass(*PR); |
383 | initializeR600ControlFlowFinalizerPass(*PR); |
384 | initializeR600PacketizerPass(*PR); |
385 | initializeR600ExpandSpecialInstrsPassPass(*PR); |
386 | initializeR600VectorRegMergerPass(*PR); |
387 | initializeGlobalISel(*PR); |
388 | initializeAMDGPUDAGToDAGISelPass(*PR); |
389 | initializeGCNDPPCombinePass(*PR); |
390 | initializeSILowerI1CopiesPass(*PR); |
391 | initializeAMDGPUGlobalISelDivergenceLoweringPass(*PR); |
392 | initializeSILowerWWMCopiesPass(*PR); |
393 | initializeAMDGPUMarkLastScratchLoadPass(*PR); |
394 | initializeSILowerSGPRSpillsPass(*PR); |
395 | initializeSIFixSGPRCopiesPass(*PR); |
396 | initializeSIFixVGPRCopiesPass(*PR); |
397 | initializeSIFoldOperandsPass(*PR); |
398 | initializeSIPeepholeSDWAPass(*PR); |
399 | initializeSIShrinkInstructionsPass(*PR); |
400 | initializeSIOptimizeExecMaskingPreRAPass(*PR); |
401 | initializeSIOptimizeVGPRLiveRangePass(*PR); |
402 | initializeSILoadStoreOptimizerPass(*PR); |
403 | initializeAMDGPUCtorDtorLoweringLegacyPass(*PR); |
404 | initializeAMDGPUAlwaysInlinePass(*PR); |
405 | initializeAMDGPUAttributorLegacyPass(*PR); |
406 | initializeAMDGPUAnnotateKernelFeaturesPass(*PR); |
407 | initializeAMDGPUAnnotateUniformValuesPass(*PR); |
408 | initializeAMDGPUArgumentUsageInfoPass(*PR); |
409 | initializeAMDGPUAtomicOptimizerPass(*PR); |
410 | initializeAMDGPULowerKernelArgumentsPass(*PR); |
411 | initializeAMDGPUPromoteKernelArgumentsPass(*PR); |
412 | initializeAMDGPULowerKernelAttributesPass(*PR); |
413 | initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(*PR); |
414 | initializeAMDGPUPostLegalizerCombinerPass(*PR); |
415 | initializeAMDGPUPreLegalizerCombinerPass(*PR); |
416 | initializeAMDGPURegBankCombinerPass(*PR); |
417 | initializeAMDGPURegBankSelectPass(*PR); |
418 | initializeAMDGPUPromoteAllocaPass(*PR); |
419 | initializeAMDGPUPromoteAllocaToVectorPass(*PR); |
420 | initializeAMDGPUCodeGenPreparePass(*PR); |
421 | initializeAMDGPULateCodeGenPreparePass(*PR); |
422 | initializeAMDGPURemoveIncompatibleFunctionsPass(*PR); |
423 | initializeAMDGPULowerModuleLDSLegacyPass(*PR); |
424 | initializeAMDGPULowerBufferFatPointersPass(*PR); |
425 | initializeAMDGPURewriteOutArgumentsPass(*PR); |
426 | initializeAMDGPURewriteUndefForPHILegacyPass(*PR); |
427 | initializeAMDGPUUnifyMetadataPass(*PR); |
428 | initializeSIAnnotateControlFlowPass(*PR); |
429 | initializeAMDGPUInsertSingleUseVDSTPass(*PR); |
430 | initializeAMDGPUInsertDelayAluPass(*PR); |
431 | initializeSIInsertHardClausesPass(*PR); |
432 | initializeSIInsertWaitcntsPass(*PR); |
433 | initializeSIModeRegisterPass(*PR); |
434 | initializeSIWholeQuadModePass(*PR); |
435 | initializeSILowerControlFlowPass(*PR); |
436 | initializeSIPreEmitPeepholePass(*PR); |
437 | initializeSILateBranchLoweringPass(*PR); |
438 | initializeSIMemoryLegalizerPass(*PR); |
439 | initializeSIOptimizeExecMaskingPass(*PR); |
440 | initializeSIPreAllocateWWMRegsPass(*PR); |
441 | initializeSIFormMemoryClausesPass(*PR); |
442 | initializeSIPostRABundlerPass(*PR); |
443 | initializeGCNCreateVOPDPass(*PR); |
444 | initializeAMDGPUUnifyDivergentExitNodesPass(*PR); |
445 | initializeAMDGPUAAWrapperPassPass(*PR); |
446 | initializeAMDGPUExternalAAWrapperPass(*PR); |
447 | initializeAMDGPUImageIntrinsicOptimizerPass(*PR); |
448 | initializeAMDGPUPrintfRuntimeBindingPass(*PR); |
449 | initializeAMDGPUResourceUsageAnalysisPass(*PR); |
450 | initializeGCNNSAReassignPass(*PR); |
451 | initializeGCNPreRAOptimizationsPass(*PR); |
452 | initializeGCNPreRALongBranchRegPass(*PR); |
453 | initializeGCNRewritePartialRegUsesPass(*PR); |
454 | initializeGCNRegPressurePrinterPass(*PR); |
455 | } |
456 | |
457 | static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) { |
458 | return std::make_unique<AMDGPUTargetObjectFile>(); |
459 | } |
460 | |
461 | static ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C) { |
462 | return new SIScheduleDAGMI(C); |
463 | } |
464 | |
465 | static ScheduleDAGInstrs * |
466 | createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) { |
467 | const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>(); |
468 | ScheduleDAGMILive *DAG = |
469 | new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxOccupancySchedStrategy>(args&: C)); |
470 | DAG->addMutation(Mutation: createLoadClusterDAGMutation(TII: DAG->TII, TRI: DAG->TRI)); |
471 | if (ST.shouldClusterStores()) |
472 | DAG->addMutation(Mutation: createStoreClusterDAGMutation(TII: DAG->TII, TRI: DAG->TRI)); |
473 | DAG->addMutation(Mutation: createIGroupLPDAGMutation(Phase: AMDGPU::SchedulingPhase::Initial)); |
474 | DAG->addMutation(Mutation: createAMDGPUMacroFusionDAGMutation()); |
475 | DAG->addMutation(Mutation: createAMDGPUExportClusteringDAGMutation()); |
476 | return DAG; |
477 | } |
478 | |
479 | static ScheduleDAGInstrs * |
480 | createGCNMaxILPMachineScheduler(MachineSchedContext *C) { |
481 | ScheduleDAGMILive *DAG = |
482 | new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxILPSchedStrategy>(args&: C)); |
483 | DAG->addMutation(Mutation: createIGroupLPDAGMutation(Phase: AMDGPU::SchedulingPhase::Initial)); |
484 | return DAG; |
485 | } |
486 | |
487 | static ScheduleDAGInstrs * |
488 | createIterativeGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) { |
489 | const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>(); |
490 | auto DAG = new GCNIterativeScheduler(C, |
491 | GCNIterativeScheduler::SCHEDULE_LEGACYMAXOCCUPANCY); |
492 | DAG->addMutation(Mutation: createLoadClusterDAGMutation(TII: DAG->TII, TRI: DAG->TRI)); |
493 | if (ST.shouldClusterStores()) |
494 | DAG->addMutation(Mutation: createStoreClusterDAGMutation(TII: DAG->TII, TRI: DAG->TRI)); |
495 | return DAG; |
496 | } |
497 | |
498 | static ScheduleDAGInstrs *createMinRegScheduler(MachineSchedContext *C) { |
499 | return new GCNIterativeScheduler(C, |
500 | GCNIterativeScheduler::SCHEDULE_MINREGFORCED); |
501 | } |
502 | |
503 | static ScheduleDAGInstrs * |
504 | createIterativeILPMachineScheduler(MachineSchedContext *C) { |
505 | const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>(); |
506 | auto DAG = new GCNIterativeScheduler(C, |
507 | GCNIterativeScheduler::SCHEDULE_ILP); |
508 | DAG->addMutation(Mutation: createLoadClusterDAGMutation(TII: DAG->TII, TRI: DAG->TRI)); |
509 | if (ST.shouldClusterStores()) |
510 | DAG->addMutation(Mutation: createStoreClusterDAGMutation(TII: DAG->TII, TRI: DAG->TRI)); |
511 | DAG->addMutation(Mutation: createAMDGPUMacroFusionDAGMutation()); |
512 | return DAG; |
513 | } |
514 | |
515 | static MachineSchedRegistry |
516 | SISchedRegistry("si" , "Run SI's custom scheduler" , |
517 | createSIMachineScheduler); |
518 | |
519 | static MachineSchedRegistry |
520 | GCNMaxOccupancySchedRegistry("gcn-max-occupancy" , |
521 | "Run GCN scheduler to maximize occupancy" , |
522 | createGCNMaxOccupancyMachineScheduler); |
523 | |
524 | static MachineSchedRegistry |
525 | GCNMaxILPSchedRegistry("gcn-max-ilp" , "Run GCN scheduler to maximize ilp" , |
526 | createGCNMaxILPMachineScheduler); |
527 | |
528 | static MachineSchedRegistry IterativeGCNMaxOccupancySchedRegistry( |
529 | "gcn-iterative-max-occupancy-experimental" , |
530 | "Run GCN scheduler to maximize occupancy (experimental)" , |
531 | createIterativeGCNMaxOccupancyMachineScheduler); |
532 | |
533 | static MachineSchedRegistry GCNMinRegSchedRegistry( |
534 | "gcn-iterative-minreg" , |
535 | "Run GCN iterative scheduler for minimal register usage (experimental)" , |
536 | createMinRegScheduler); |
537 | |
538 | static MachineSchedRegistry GCNILPSchedRegistry( |
539 | "gcn-iterative-ilp" , |
540 | "Run GCN iterative scheduler for ILP scheduling (experimental)" , |
541 | createIterativeILPMachineScheduler); |
542 | |
543 | static StringRef computeDataLayout(const Triple &TT) { |
544 | if (TT.getArch() == Triple::r600) { |
545 | // 32-bit pointers. |
546 | return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128" |
547 | "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1" ; |
548 | } |
549 | |
550 | // 32-bit private, local, and region pointers. 64-bit global, constant and |
551 | // flat. 160-bit non-integral fat buffer pointers that include a 128-bit |
552 | // buffer descriptor and a 32-bit offset, which are indexed by 32-bit values |
553 | // (address space 7), and 128-bit non-integral buffer resourcees (address |
554 | // space 8) which cannot be non-trivilally accessed by LLVM memory operations |
555 | // like getelementptr. |
556 | return "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32" |
557 | "-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-" |
558 | "v32:32-v48:64-v96:" |
559 | "128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-" |
560 | "G1-ni:7:8:9" ; |
561 | } |
562 | |
563 | LLVM_READNONE |
564 | static StringRef getGPUOrDefault(const Triple &TT, StringRef GPU) { |
565 | if (!GPU.empty()) |
566 | return GPU; |
567 | |
568 | // Need to default to a target with flat support for HSA. |
569 | if (TT.getArch() == Triple::amdgcn) |
570 | return TT.getOS() == Triple::AMDHSA ? "generic-hsa" : "generic" ; |
571 | |
572 | return "r600" ; |
573 | } |
574 | |
575 | static Reloc::Model getEffectiveRelocModel(std::optional<Reloc::Model> RM) { |
576 | // The AMDGPU toolchain only supports generating shared objects, so we |
577 | // must always use PIC. |
578 | return Reloc::PIC_; |
579 | } |
580 | |
581 | AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT, |
582 | StringRef CPU, StringRef FS, |
583 | const TargetOptions &Options, |
584 | std::optional<Reloc::Model> RM, |
585 | std::optional<CodeModel::Model> CM, |
586 | CodeGenOptLevel OptLevel) |
587 | : LLVMTargetMachine(T, computeDataLayout(TT), TT, getGPUOrDefault(TT, GPU: CPU), |
588 | FS, Options, getEffectiveRelocModel(RM), |
589 | getEffectiveCodeModel(CM, Default: CodeModel::Small), OptLevel), |
590 | TLOF(createTLOF(TT: getTargetTriple())) { |
591 | initAsmInfo(); |
592 | if (TT.getArch() == Triple::amdgcn) { |
593 | if (getMCSubtargetInfo()->checkFeatures(FS: "+wavefrontsize64" )) |
594 | MRI.reset(p: llvm::createGCNMCRegisterInfo(DwarfFlavour: AMDGPUDwarfFlavour::Wave64)); |
595 | else if (getMCSubtargetInfo()->checkFeatures(FS: "+wavefrontsize32" )) |
596 | MRI.reset(p: llvm::createGCNMCRegisterInfo(DwarfFlavour: AMDGPUDwarfFlavour::Wave32)); |
597 | } |
598 | } |
599 | |
600 | bool AMDGPUTargetMachine::EnableLateStructurizeCFG = false; |
601 | bool AMDGPUTargetMachine::EnableFunctionCalls = false; |
602 | bool AMDGPUTargetMachine::EnableLowerModuleLDS = true; |
603 | bool AMDGPUTargetMachine::DisableStructurizer = false; |
604 | |
605 | AMDGPUTargetMachine::~AMDGPUTargetMachine() = default; |
606 | |
607 | StringRef AMDGPUTargetMachine::getGPUName(const Function &F) const { |
608 | Attribute GPUAttr = F.getFnAttribute(Kind: "target-cpu" ); |
609 | return GPUAttr.isValid() ? GPUAttr.getValueAsString() : getTargetCPU(); |
610 | } |
611 | |
612 | StringRef AMDGPUTargetMachine::getFeatureString(const Function &F) const { |
613 | Attribute FSAttr = F.getFnAttribute(Kind: "target-features" ); |
614 | |
615 | return FSAttr.isValid() ? FSAttr.getValueAsString() |
616 | : getTargetFeatureString(); |
617 | } |
618 | |
619 | /// Predicate for Internalize pass. |
620 | static bool mustPreserveGV(const GlobalValue &GV) { |
621 | if (const Function *F = dyn_cast<Function>(Val: &GV)) |
622 | return F->isDeclaration() || F->getName().starts_with(Prefix: "__asan_" ) || |
623 | F->getName().starts_with(Prefix: "__sanitizer_" ) || |
624 | AMDGPU::isEntryFunctionCC(CC: F->getCallingConv()); |
625 | |
626 | GV.removeDeadConstantUsers(); |
627 | return !GV.use_empty(); |
628 | } |
629 | |
630 | void AMDGPUTargetMachine::registerDefaultAliasAnalyses(AAManager &AAM) { |
631 | AAM.registerFunctionAnalysis<AMDGPUAA>(); |
632 | } |
633 | |
634 | static Expected<ScanOptions> |
635 | parseAMDGPUAtomicOptimizerStrategy(StringRef Params) { |
636 | if (Params.empty()) |
637 | return ScanOptions::Iterative; |
638 | Params.consume_front(Prefix: "strategy=" ); |
639 | auto Result = StringSwitch<std::optional<ScanOptions>>(Params) |
640 | .Case(S: "dpp" , Value: ScanOptions::DPP) |
641 | .Cases(S0: "iterative" , S1: "" , Value: ScanOptions::Iterative) |
642 | .Case(S: "none" , Value: ScanOptions::None) |
643 | .Default(Value: std::nullopt); |
644 | if (Result) |
645 | return *Result; |
646 | return make_error<StringError>(Args: "invalid parameter" , Args: inconvertibleErrorCode()); |
647 | } |
648 | |
649 | void AMDGPUTargetMachine::registerPassBuilderCallbacks( |
650 | PassBuilder &PB, bool PopulateClassToPassNames) { |
651 | |
652 | #define GET_PASS_REGISTRY "AMDGPUPassRegistry.def" |
653 | #include "llvm/Passes/TargetPassRegistry.inc" |
654 | |
655 | PB.registerPipelineStartEPCallback( |
656 | C: [](ModulePassManager &PM, OptimizationLevel Level) { |
657 | FunctionPassManager FPM; |
658 | PM.addPass(Pass: createModuleToFunctionPassAdaptor(Pass: std::move(FPM))); |
659 | if (EnableHipStdPar) |
660 | PM.addPass(Pass: HipStdParAcceleratorCodeSelectionPass()); |
661 | }); |
662 | |
663 | PB.registerPipelineEarlySimplificationEPCallback( |
664 | C: [](ModulePassManager &PM, OptimizationLevel Level) { |
665 | PM.addPass(Pass: AMDGPUPrintfRuntimeBindingPass()); |
666 | |
667 | if (Level == OptimizationLevel::O0) |
668 | return; |
669 | |
670 | PM.addPass(Pass: AMDGPUUnifyMetadataPass()); |
671 | |
672 | if (InternalizeSymbols) { |
673 | PM.addPass(Pass: InternalizePass(mustPreserveGV)); |
674 | PM.addPass(Pass: GlobalDCEPass()); |
675 | } |
676 | |
677 | if (EarlyInlineAll && !EnableFunctionCalls) |
678 | PM.addPass(Pass: AMDGPUAlwaysInlinePass()); |
679 | }); |
680 | |
681 | PB.registerPeepholeEPCallback( |
682 | C: [](FunctionPassManager &FPM, OptimizationLevel Level) { |
683 | if (Level == OptimizationLevel::O0) |
684 | return; |
685 | |
686 | FPM.addPass(Pass: AMDGPUUseNativeCallsPass()); |
687 | if (EnableLibCallSimplify) |
688 | FPM.addPass(Pass: AMDGPUSimplifyLibCallsPass()); |
689 | }); |
690 | |
691 | PB.registerCGSCCOptimizerLateEPCallback( |
692 | C: [this](CGSCCPassManager &PM, OptimizationLevel Level) { |
693 | if (Level == OptimizationLevel::O0) |
694 | return; |
695 | |
696 | FunctionPassManager FPM; |
697 | |
698 | // Add promote kernel arguments pass to the opt pipeline right before |
699 | // infer address spaces which is needed to do actual address space |
700 | // rewriting. |
701 | if (Level.getSpeedupLevel() > OptimizationLevel::O1.getSpeedupLevel() && |
702 | EnablePromoteKernelArguments) |
703 | FPM.addPass(Pass: AMDGPUPromoteKernelArgumentsPass()); |
704 | |
705 | // Add infer address spaces pass to the opt pipeline after inlining |
706 | // but before SROA to increase SROA opportunities. |
707 | FPM.addPass(Pass: InferAddressSpacesPass()); |
708 | |
709 | // This should run after inlining to have any chance of doing |
710 | // anything, and before other cleanup optimizations. |
711 | FPM.addPass(Pass: AMDGPULowerKernelAttributesPass()); |
712 | |
713 | if (Level != OptimizationLevel::O0) { |
714 | // Promote alloca to vector before SROA and loop unroll. If we |
715 | // manage to eliminate allocas before unroll we may choose to unroll |
716 | // less. |
717 | FPM.addPass(Pass: AMDGPUPromoteAllocaToVectorPass(*this)); |
718 | } |
719 | |
720 | PM.addPass(Pass: createCGSCCToFunctionPassAdaptor(Pass: std::move(FPM))); |
721 | }); |
722 | |
723 | PB.registerFullLinkTimeOptimizationLastEPCallback( |
724 | C: [this](ModulePassManager &PM, OptimizationLevel Level) { |
725 | // We want to support the -lto-partitions=N option as "best effort". |
726 | // For that, we need to lower LDS earlier in the pipeline before the |
727 | // module is partitioned for codegen. |
728 | if (EnableLowerModuleLDS) |
729 | PM.addPass(Pass: AMDGPULowerModuleLDSPass(*this)); |
730 | }); |
731 | } |
732 | |
733 | int64_t AMDGPUTargetMachine::getNullPointerValue(unsigned AddrSpace) { |
734 | return (AddrSpace == AMDGPUAS::LOCAL_ADDRESS || |
735 | AddrSpace == AMDGPUAS::PRIVATE_ADDRESS || |
736 | AddrSpace == AMDGPUAS::REGION_ADDRESS) |
737 | ? -1 |
738 | : 0; |
739 | } |
740 | |
741 | bool AMDGPUTargetMachine::isNoopAddrSpaceCast(unsigned SrcAS, |
742 | unsigned DestAS) const { |
743 | return AMDGPU::isFlatGlobalAddrSpace(AS: SrcAS) && |
744 | AMDGPU::isFlatGlobalAddrSpace(AS: DestAS); |
745 | } |
746 | |
747 | unsigned AMDGPUTargetMachine::getAssumedAddrSpace(const Value *V) const { |
748 | const auto *LD = dyn_cast<LoadInst>(Val: V); |
749 | if (!LD) |
750 | return AMDGPUAS::UNKNOWN_ADDRESS_SPACE; |
751 | |
752 | // It must be a generic pointer loaded. |
753 | assert(V->getType()->isPointerTy() && |
754 | V->getType()->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS); |
755 | |
756 | const auto *Ptr = LD->getPointerOperand(); |
757 | if (Ptr->getType()->getPointerAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS) |
758 | return AMDGPUAS::UNKNOWN_ADDRESS_SPACE; |
759 | // For a generic pointer loaded from the constant memory, it could be assumed |
760 | // as a global pointer since the constant memory is only populated on the |
761 | // host side. As implied by the offload programming model, only global |
762 | // pointers could be referenced on the host side. |
763 | return AMDGPUAS::GLOBAL_ADDRESS; |
764 | } |
765 | |
766 | std::pair<const Value *, unsigned> |
767 | AMDGPUTargetMachine::getPredicatedAddrSpace(const Value *V) const { |
768 | if (auto *II = dyn_cast<IntrinsicInst>(Val: V)) { |
769 | switch (II->getIntrinsicID()) { |
770 | case Intrinsic::amdgcn_is_shared: |
771 | return std::pair(II->getArgOperand(i: 0), AMDGPUAS::LOCAL_ADDRESS); |
772 | case Intrinsic::amdgcn_is_private: |
773 | return std::pair(II->getArgOperand(i: 0), AMDGPUAS::PRIVATE_ADDRESS); |
774 | default: |
775 | break; |
776 | } |
777 | return std::pair(nullptr, -1); |
778 | } |
779 | // Check the global pointer predication based on |
780 | // (!is_share(p) && !is_private(p)). Note that logic 'and' is commutative and |
781 | // the order of 'is_shared' and 'is_private' is not significant. |
782 | Value *Ptr; |
783 | if (match( |
784 | const_cast<Value *>(V), |
785 | m_c_And(m_Not(m_Intrinsic<Intrinsic::amdgcn_is_shared>(m_Value(Ptr))), |
786 | m_Not(m_Intrinsic<Intrinsic::amdgcn_is_private>( |
787 | m_Deferred(Ptr)))))) |
788 | return std::pair(Ptr, AMDGPUAS::GLOBAL_ADDRESS); |
789 | |
790 | return std::pair(nullptr, -1); |
791 | } |
792 | |
793 | unsigned |
794 | AMDGPUTargetMachine::getAddressSpaceForPseudoSourceKind(unsigned Kind) const { |
795 | switch (Kind) { |
796 | case PseudoSourceValue::Stack: |
797 | case PseudoSourceValue::FixedStack: |
798 | return AMDGPUAS::PRIVATE_ADDRESS; |
799 | case PseudoSourceValue::ConstantPool: |
800 | case PseudoSourceValue::GOT: |
801 | case PseudoSourceValue::JumpTable: |
802 | case PseudoSourceValue::GlobalValueCallEntry: |
803 | case PseudoSourceValue::ExternalSymbolCallEntry: |
804 | return AMDGPUAS::CONSTANT_ADDRESS; |
805 | } |
806 | return AMDGPUAS::FLAT_ADDRESS; |
807 | } |
808 | |
809 | //===----------------------------------------------------------------------===// |
810 | // GCN Target Machine (SI+) |
811 | //===----------------------------------------------------------------------===// |
812 | |
813 | GCNTargetMachine::GCNTargetMachine(const Target &T, const Triple &TT, |
814 | StringRef CPU, StringRef FS, |
815 | const TargetOptions &Options, |
816 | std::optional<Reloc::Model> RM, |
817 | std::optional<CodeModel::Model> CM, |
818 | CodeGenOptLevel OL, bool JIT) |
819 | : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {} |
820 | |
821 | const TargetSubtargetInfo * |
822 | GCNTargetMachine::getSubtargetImpl(const Function &F) const { |
823 | StringRef GPU = getGPUName(F); |
824 | StringRef FS = getFeatureString(F); |
825 | |
826 | SmallString<128> SubtargetKey(GPU); |
827 | SubtargetKey.append(RHS: FS); |
828 | |
829 | auto &I = SubtargetMap[SubtargetKey]; |
830 | if (!I) { |
831 | // This needs to be done before we create a new subtarget since any |
832 | // creation will depend on the TM and the code generation flags on the |
833 | // function that reside in TargetOptions. |
834 | resetTargetOptions(F); |
835 | I = std::make_unique<GCNSubtarget>(args: TargetTriple, args&: GPU, args&: FS, args: *this); |
836 | } |
837 | |
838 | I->setScalarizeGlobalBehavior(ScalarizeGlobal); |
839 | |
840 | return I.get(); |
841 | } |
842 | |
843 | TargetTransformInfo |
844 | GCNTargetMachine::getTargetTransformInfo(const Function &F) const { |
845 | return TargetTransformInfo(GCNTTIImpl(this, F)); |
846 | } |
847 | |
848 | //===----------------------------------------------------------------------===// |
849 | // AMDGPU Pass Setup |
850 | //===----------------------------------------------------------------------===// |
851 | |
852 | std::unique_ptr<CSEConfigBase> llvm::AMDGPUPassConfig::getCSEConfig() const { |
853 | return getStandardCSEConfigForOpt(Level: TM->getOptLevel()); |
854 | } |
855 | |
856 | namespace { |
857 | |
858 | class GCNPassConfig final : public AMDGPUPassConfig { |
859 | public: |
860 | GCNPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM) |
861 | : AMDGPUPassConfig(TM, PM) { |
862 | // It is necessary to know the register usage of the entire call graph. We |
863 | // allow calls without EnableAMDGPUFunctionCalls if they are marked |
864 | // noinline, so this is always required. |
865 | setRequiresCodeGenSCCOrder(true); |
866 | substitutePass(StandardID: &PostRASchedulerID, TargetID: &PostMachineSchedulerID); |
867 | } |
868 | |
869 | GCNTargetMachine &getGCNTargetMachine() const { |
870 | return getTM<GCNTargetMachine>(); |
871 | } |
872 | |
873 | ScheduleDAGInstrs * |
874 | createMachineScheduler(MachineSchedContext *C) const override; |
875 | |
876 | ScheduleDAGInstrs * |
877 | createPostMachineScheduler(MachineSchedContext *C) const override { |
878 | ScheduleDAGMI *DAG = new GCNPostScheduleDAGMILive( |
879 | C, std::make_unique<PostGenericScheduler>(args&: C), |
880 | /*RemoveKillFlags=*/true); |
881 | const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>(); |
882 | DAG->addMutation(Mutation: createLoadClusterDAGMutation(TII: DAG->TII, TRI: DAG->TRI)); |
883 | if (ST.shouldClusterStores()) |
884 | DAG->addMutation(Mutation: createStoreClusterDAGMutation(TII: DAG->TII, TRI: DAG->TRI)); |
885 | DAG->addMutation(Mutation: ST.createFillMFMAShadowMutation(DAG->TII)); |
886 | DAG->addMutation( |
887 | Mutation: createIGroupLPDAGMutation(Phase: AMDGPU::SchedulingPhase::PostRA)); |
888 | if (isPassEnabled(Opt: EnableVOPD, Level: CodeGenOptLevel::Less)) |
889 | DAG->addMutation(Mutation: createVOPDPairingMutation()); |
890 | return DAG; |
891 | } |
892 | |
893 | bool addPreISel() override; |
894 | void addMachineSSAOptimization() override; |
895 | bool addILPOpts() override; |
896 | bool addInstSelector() override; |
897 | bool addIRTranslator() override; |
898 | void addPreLegalizeMachineIR() override; |
899 | bool addLegalizeMachineIR() override; |
900 | void addPreRegBankSelect() override; |
901 | bool addRegBankSelect() override; |
902 | void addPreGlobalInstructionSelect() override; |
903 | bool addGlobalInstructionSelect() override; |
904 | void addFastRegAlloc() override; |
905 | void addOptimizedRegAlloc() override; |
906 | |
907 | FunctionPass *createSGPRAllocPass(bool Optimized); |
908 | FunctionPass *createVGPRAllocPass(bool Optimized); |
909 | FunctionPass *createRegAllocPass(bool Optimized) override; |
910 | |
911 | bool addRegAssignAndRewriteFast() override; |
912 | bool addRegAssignAndRewriteOptimized() override; |
913 | |
914 | void addPreRegAlloc() override; |
915 | bool addPreRewrite() override; |
916 | void addPostRegAlloc() override; |
917 | void addPreSched2() override; |
918 | void addPreEmitPass() override; |
919 | }; |
920 | |
921 | } // end anonymous namespace |
922 | |
923 | AMDGPUPassConfig::AMDGPUPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM) |
924 | : TargetPassConfig(TM, PM) { |
925 | // Exceptions and StackMaps are not supported, so these passes will never do |
926 | // anything. |
927 | disablePass(PassID: &StackMapLivenessID); |
928 | disablePass(PassID: &FuncletLayoutID); |
929 | // Garbage collection is not supported. |
930 | disablePass(PassID: &GCLoweringID); |
931 | disablePass(PassID: &ShadowStackGCLoweringID); |
932 | } |
933 | |
934 | void AMDGPUPassConfig::addEarlyCSEOrGVNPass() { |
935 | if (getOptLevel() == CodeGenOptLevel::Aggressive) |
936 | addPass(P: createGVNPass()); |
937 | else |
938 | addPass(P: createEarlyCSEPass()); |
939 | } |
940 | |
941 | void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() { |
942 | if (isPassEnabled(Opt: EnableLoopPrefetch, Level: CodeGenOptLevel::Aggressive)) |
943 | addPass(P: createLoopDataPrefetchPass()); |
944 | addPass(P: createSeparateConstOffsetFromGEPPass()); |
945 | // ReassociateGEPs exposes more opportunities for SLSR. See |
946 | // the example in reassociate-geps-and-slsr.ll. |
947 | addPass(P: createStraightLineStrengthReducePass()); |
948 | // SeparateConstOffsetFromGEP and SLSR creates common expressions which GVN or |
949 | // EarlyCSE can reuse. |
950 | addEarlyCSEOrGVNPass(); |
951 | // Run NaryReassociate after EarlyCSE/GVN to be more effective. |
952 | addPass(P: createNaryReassociatePass()); |
953 | // NaryReassociate on GEPs creates redundant common expressions, so run |
954 | // EarlyCSE after it. |
955 | addPass(P: createEarlyCSEPass()); |
956 | } |
957 | |
958 | void AMDGPUPassConfig::addIRPasses() { |
959 | const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine(); |
960 | |
961 | Triple::ArchType Arch = TM.getTargetTriple().getArch(); |
962 | if (RemoveIncompatibleFunctions && Arch == Triple::amdgcn) |
963 | addPass(P: createAMDGPURemoveIncompatibleFunctionsPass(&TM)); |
964 | |
965 | // There is no reason to run these. |
966 | disablePass(PassID: &StackMapLivenessID); |
967 | disablePass(PassID: &FuncletLayoutID); |
968 | disablePass(PassID: &PatchableFunctionID); |
969 | |
970 | addPass(P: createAMDGPUPrintfRuntimeBinding()); |
971 | if (LowerCtorDtor) |
972 | addPass(P: createAMDGPUCtorDtorLoweringLegacyPass()); |
973 | |
974 | if (isPassEnabled(Opt: EnableImageIntrinsicOptimizer)) |
975 | addPass(P: createAMDGPUImageIntrinsicOptimizerPass(&TM)); |
976 | |
977 | // Function calls are not supported, so make sure we inline everything. |
978 | addPass(P: createAMDGPUAlwaysInlinePass()); |
979 | addPass(P: createAlwaysInlinerLegacyPass()); |
980 | |
981 | // Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments. |
982 | if (Arch == Triple::r600) |
983 | addPass(P: createR600OpenCLImageTypeLoweringPass()); |
984 | |
985 | // Replace OpenCL enqueued block function pointers with global variables. |
986 | addPass(P: createAMDGPUOpenCLEnqueuedBlockLoweringPass()); |
987 | |
988 | // Runs before PromoteAlloca so the latter can account for function uses |
989 | if (EnableLowerModuleLDS) { |
990 | addPass(P: createAMDGPULowerModuleLDSLegacyPass(TM: &TM)); |
991 | } |
992 | |
993 | // AMDGPUAttributor infers lack of llvm.amdgcn.lds.kernel.id calls, so run |
994 | // after their introduction |
995 | if (TM.getOptLevel() > CodeGenOptLevel::None) |
996 | addPass(P: createAMDGPUAttributorLegacyPass()); |
997 | |
998 | if (TM.getOptLevel() > CodeGenOptLevel::None) |
999 | addPass(P: createInferAddressSpacesPass()); |
1000 | |
1001 | // Run atomic optimizer before Atomic Expand |
1002 | if ((TM.getTargetTriple().getArch() == Triple::amdgcn) && |
1003 | (TM.getOptLevel() >= CodeGenOptLevel::Less) && |
1004 | (AMDGPUAtomicOptimizerStrategy != ScanOptions::None)) { |
1005 | addPass(P: createAMDGPUAtomicOptimizerPass(ScanStrategy: AMDGPUAtomicOptimizerStrategy)); |
1006 | } |
1007 | |
1008 | addPass(P: createAtomicExpandLegacyPass()); |
1009 | |
1010 | if (TM.getOptLevel() > CodeGenOptLevel::None) { |
1011 | addPass(P: createAMDGPUPromoteAlloca()); |
1012 | |
1013 | if (isPassEnabled(Opt: EnableScalarIRPasses)) |
1014 | addStraightLineScalarOptimizationPasses(); |
1015 | |
1016 | if (EnableAMDGPUAliasAnalysis) { |
1017 | addPass(P: createAMDGPUAAWrapperPass()); |
1018 | addPass(P: createExternalAAWrapperPass(Callback: [](Pass &P, Function &, |
1019 | AAResults &AAR) { |
1020 | if (auto *WrapperPass = P.getAnalysisIfAvailable<AMDGPUAAWrapperPass>()) |
1021 | AAR.addAAResult(AAResult&: WrapperPass->getResult()); |
1022 | })); |
1023 | } |
1024 | |
1025 | if (TM.getTargetTriple().getArch() == Triple::amdgcn) { |
1026 | // TODO: May want to move later or split into an early and late one. |
1027 | addPass(P: createAMDGPUCodeGenPreparePass()); |
1028 | } |
1029 | |
1030 | // Try to hoist loop invariant parts of divisions AMDGPUCodeGenPrepare may |
1031 | // have expanded. |
1032 | if (TM.getOptLevel() > CodeGenOptLevel::Less) |
1033 | addPass(P: createLICMPass()); |
1034 | } |
1035 | |
1036 | TargetPassConfig::addIRPasses(); |
1037 | |
1038 | // EarlyCSE is not always strong enough to clean up what LSR produces. For |
1039 | // example, GVN can combine |
1040 | // |
1041 | // %0 = add %a, %b |
1042 | // %1 = add %b, %a |
1043 | // |
1044 | // and |
1045 | // |
1046 | // %0 = shl nsw %a, 2 |
1047 | // %1 = shl %a, 2 |
1048 | // |
1049 | // but EarlyCSE can do neither of them. |
1050 | if (isPassEnabled(Opt: EnableScalarIRPasses)) |
1051 | addEarlyCSEOrGVNPass(); |
1052 | } |
1053 | |
1054 | void AMDGPUPassConfig::addCodeGenPrepare() { |
1055 | if (TM->getTargetTriple().getArch() == Triple::amdgcn) { |
1056 | // FIXME: This pass adds 2 hacky attributes that can be replaced with an |
1057 | // analysis, and should be removed. |
1058 | addPass(P: createAMDGPUAnnotateKernelFeaturesPass()); |
1059 | } |
1060 | |
1061 | if (TM->getTargetTriple().getArch() == Triple::amdgcn && |
1062 | EnableLowerKernelArguments) |
1063 | addPass(P: createAMDGPULowerKernelArgumentsPass()); |
1064 | |
1065 | if (TM->getTargetTriple().getArch() == Triple::amdgcn) { |
1066 | // This lowering has been placed after codegenprepare to take advantage of |
1067 | // address mode matching (which is why it isn't put with the LDS lowerings). |
1068 | // It could be placed anywhere before uniformity annotations (an analysis |
1069 | // that it changes by splitting up fat pointers into their components) |
1070 | // but has been put before switch lowering and CFG flattening so that those |
1071 | // passes can run on the more optimized control flow this pass creates in |
1072 | // many cases. |
1073 | // |
1074 | // FIXME: This should ideally be put after the LoadStoreVectorizer. |
1075 | // However, due to some annoying facts about ResourceUsageAnalysis, |
1076 | // (especially as exercised in the resource-usage-dead-function test), |
1077 | // we need all the function passes codegenprepare all the way through |
1078 | // said resource usage analysis to run on the call graph produced |
1079 | // before codegenprepare runs (because codegenprepare will knock some |
1080 | // nodes out of the graph, which leads to function-level passes not |
1081 | // being run on them, which causes crashes in the resource usage analysis). |
1082 | addPass(P: createAMDGPULowerBufferFatPointersPass()); |
1083 | // In accordance with the above FIXME, manually force all the |
1084 | // function-level passes into a CGSCCPassManager. |
1085 | addPass(P: new DummyCGSCCPass()); |
1086 | } |
1087 | |
1088 | TargetPassConfig::addCodeGenPrepare(); |
1089 | |
1090 | if (isPassEnabled(Opt: EnableLoadStoreVectorizer)) |
1091 | addPass(P: createLoadStoreVectorizerPass()); |
1092 | |
1093 | // LowerSwitch pass may introduce unreachable blocks that can |
1094 | // cause unexpected behavior for subsequent passes. Placing it |
1095 | // here seems better that these blocks would get cleaned up by |
1096 | // UnreachableBlockElim inserted next in the pass flow. |
1097 | addPass(P: createLowerSwitchPass()); |
1098 | } |
1099 | |
1100 | bool AMDGPUPassConfig::addPreISel() { |
1101 | if (TM->getOptLevel() > CodeGenOptLevel::None) |
1102 | addPass(P: createFlattenCFGPass()); |
1103 | return false; |
1104 | } |
1105 | |
1106 | bool AMDGPUPassConfig::addInstSelector() { |
1107 | addPass(P: createAMDGPUISelDag(TM&: getAMDGPUTargetMachine(), OptLevel: getOptLevel())); |
1108 | return false; |
1109 | } |
1110 | |
1111 | bool AMDGPUPassConfig::addGCPasses() { |
1112 | // Do nothing. GC is not supported. |
1113 | return false; |
1114 | } |
1115 | |
1116 | llvm::ScheduleDAGInstrs * |
1117 | AMDGPUPassConfig::createMachineScheduler(MachineSchedContext *C) const { |
1118 | const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>(); |
1119 | ScheduleDAGMILive *DAG = createGenericSchedLive(C); |
1120 | DAG->addMutation(Mutation: createLoadClusterDAGMutation(TII: DAG->TII, TRI: DAG->TRI)); |
1121 | if (ST.shouldClusterStores()) |
1122 | DAG->addMutation(Mutation: createStoreClusterDAGMutation(TII: DAG->TII, TRI: DAG->TRI)); |
1123 | return DAG; |
1124 | } |
1125 | |
1126 | MachineFunctionInfo *R600TargetMachine::createMachineFunctionInfo( |
1127 | BumpPtrAllocator &Allocator, const Function &F, |
1128 | const TargetSubtargetInfo *STI) const { |
1129 | return R600MachineFunctionInfo::create<R600MachineFunctionInfo>( |
1130 | Allocator, F, static_cast<const R600Subtarget *>(STI)); |
1131 | } |
1132 | |
1133 | //===----------------------------------------------------------------------===// |
1134 | // GCN Pass Setup |
1135 | //===----------------------------------------------------------------------===// |
1136 | |
1137 | ScheduleDAGInstrs *GCNPassConfig::createMachineScheduler( |
1138 | MachineSchedContext *C) const { |
1139 | const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>(); |
1140 | if (ST.enableSIScheduler()) |
1141 | return createSIMachineScheduler(C); |
1142 | |
1143 | if (EnableMaxIlpSchedStrategy) |
1144 | return createGCNMaxILPMachineScheduler(C); |
1145 | |
1146 | return createGCNMaxOccupancyMachineScheduler(C); |
1147 | } |
1148 | |
1149 | bool GCNPassConfig::addPreISel() { |
1150 | AMDGPUPassConfig::addPreISel(); |
1151 | |
1152 | if (TM->getOptLevel() > CodeGenOptLevel::None) |
1153 | addPass(P: createAMDGPULateCodeGenPreparePass()); |
1154 | |
1155 | if (TM->getOptLevel() > CodeGenOptLevel::None) |
1156 | addPass(P: createSinkingPass()); |
1157 | |
1158 | // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit |
1159 | // regions formed by them. |
1160 | addPass(PassID: &AMDGPUUnifyDivergentExitNodesID); |
1161 | if (!LateCFGStructurize && !DisableStructurizer) { |
1162 | if (EnableStructurizerWorkarounds) { |
1163 | addPass(P: createFixIrreduciblePass()); |
1164 | addPass(P: createUnifyLoopExitsPass()); |
1165 | } |
1166 | addPass(P: createStructurizeCFGPass(SkipUniformRegions: false)); // true -> SkipUniformRegions |
1167 | } |
1168 | addPass(P: createAMDGPUAnnotateUniformValues()); |
1169 | if (!LateCFGStructurize && !DisableStructurizer) { |
1170 | addPass(P: createSIAnnotateControlFlowPass()); |
1171 | // TODO: Move this right after structurizeCFG to avoid extra divergence |
1172 | // analysis. This depends on stopping SIAnnotateControlFlow from making |
1173 | // control flow modifications. |
1174 | addPass(P: createAMDGPURewriteUndefForPHILegacyPass()); |
1175 | } |
1176 | addPass(P: createLCSSAPass()); |
1177 | |
1178 | if (TM->getOptLevel() > CodeGenOptLevel::Less) |
1179 | addPass(PassID: &AMDGPUPerfHintAnalysisID); |
1180 | |
1181 | return false; |
1182 | } |
1183 | |
1184 | void GCNPassConfig::addMachineSSAOptimization() { |
1185 | TargetPassConfig::addMachineSSAOptimization(); |
1186 | |
1187 | // We want to fold operands after PeepholeOptimizer has run (or as part of |
1188 | // it), because it will eliminate extra copies making it easier to fold the |
1189 | // real source operand. We want to eliminate dead instructions after, so that |
1190 | // we see fewer uses of the copies. We then need to clean up the dead |
1191 | // instructions leftover after the operands are folded as well. |
1192 | // |
1193 | // XXX - Can we get away without running DeadMachineInstructionElim again? |
1194 | addPass(PassID: &SIFoldOperandsID); |
1195 | if (EnableDPPCombine) |
1196 | addPass(PassID: &GCNDPPCombineID); |
1197 | addPass(PassID: &SILoadStoreOptimizerID); |
1198 | if (isPassEnabled(Opt: EnableSDWAPeephole)) { |
1199 | addPass(PassID: &SIPeepholeSDWAID); |
1200 | addPass(PassID: &EarlyMachineLICMID); |
1201 | addPass(PassID: &MachineCSEID); |
1202 | addPass(PassID: &SIFoldOperandsID); |
1203 | } |
1204 | addPass(PassID: &DeadMachineInstructionElimID); |
1205 | addPass(P: createSIShrinkInstructionsPass()); |
1206 | } |
1207 | |
1208 | bool GCNPassConfig::addILPOpts() { |
1209 | if (EnableEarlyIfConversion) |
1210 | addPass(PassID: &EarlyIfConverterID); |
1211 | |
1212 | TargetPassConfig::addILPOpts(); |
1213 | return false; |
1214 | } |
1215 | |
1216 | bool GCNPassConfig::addInstSelector() { |
1217 | AMDGPUPassConfig::addInstSelector(); |
1218 | addPass(PassID: &SIFixSGPRCopiesID); |
1219 | addPass(P: createSILowerI1CopiesPass()); |
1220 | return false; |
1221 | } |
1222 | |
1223 | bool GCNPassConfig::addIRTranslator() { |
1224 | addPass(P: new IRTranslator(getOptLevel())); |
1225 | return false; |
1226 | } |
1227 | |
1228 | void GCNPassConfig::addPreLegalizeMachineIR() { |
1229 | bool IsOptNone = getOptLevel() == CodeGenOptLevel::None; |
1230 | addPass(P: createAMDGPUPreLegalizeCombiner(IsOptNone)); |
1231 | addPass(P: new Localizer()); |
1232 | } |
1233 | |
1234 | bool GCNPassConfig::addLegalizeMachineIR() { |
1235 | addPass(P: new Legalizer()); |
1236 | return false; |
1237 | } |
1238 | |
1239 | void GCNPassConfig::addPreRegBankSelect() { |
1240 | bool IsOptNone = getOptLevel() == CodeGenOptLevel::None; |
1241 | addPass(P: createAMDGPUPostLegalizeCombiner(IsOptNone)); |
1242 | addPass(P: createAMDGPUGlobalISelDivergenceLoweringPass()); |
1243 | } |
1244 | |
1245 | bool GCNPassConfig::addRegBankSelect() { |
1246 | addPass(P: new AMDGPURegBankSelect()); |
1247 | return false; |
1248 | } |
1249 | |
1250 | void GCNPassConfig::addPreGlobalInstructionSelect() { |
1251 | bool IsOptNone = getOptLevel() == CodeGenOptLevel::None; |
1252 | addPass(P: createAMDGPURegBankCombiner(IsOptNone)); |
1253 | } |
1254 | |
1255 | bool GCNPassConfig::addGlobalInstructionSelect() { |
1256 | addPass(P: new InstructionSelect(getOptLevel())); |
1257 | return false; |
1258 | } |
1259 | |
1260 | void GCNPassConfig::addPreRegAlloc() { |
1261 | if (LateCFGStructurize) { |
1262 | addPass(P: createAMDGPUMachineCFGStructurizerPass()); |
1263 | } |
1264 | } |
1265 | |
1266 | void GCNPassConfig::addFastRegAlloc() { |
1267 | // FIXME: We have to disable the verifier here because of PHIElimination + |
1268 | // TwoAddressInstructions disabling it. |
1269 | |
1270 | // This must be run immediately after phi elimination and before |
1271 | // TwoAddressInstructions, otherwise the processing of the tied operand of |
1272 | // SI_ELSE will introduce a copy of the tied operand source after the else. |
1273 | insertPass(TargetPassID: &PHIEliminationID, InsertedPassID: &SILowerControlFlowID); |
1274 | |
1275 | insertPass(TargetPassID: &TwoAddressInstructionPassID, InsertedPassID: &SIWholeQuadModeID); |
1276 | |
1277 | TargetPassConfig::addFastRegAlloc(); |
1278 | } |
1279 | |
1280 | void GCNPassConfig::addOptimizedRegAlloc() { |
1281 | // Allow the scheduler to run before SIWholeQuadMode inserts exec manipulation |
1282 | // instructions that cause scheduling barriers. |
1283 | insertPass(TargetPassID: &MachineSchedulerID, InsertedPassID: &SIWholeQuadModeID); |
1284 | |
1285 | if (OptExecMaskPreRA) |
1286 | insertPass(TargetPassID: &MachineSchedulerID, InsertedPassID: &SIOptimizeExecMaskingPreRAID); |
1287 | |
1288 | if (EnableRewritePartialRegUses) |
1289 | insertPass(TargetPassID: &RenameIndependentSubregsID, InsertedPassID: &GCNRewritePartialRegUsesID); |
1290 | |
1291 | if (isPassEnabled(Opt: EnablePreRAOptimizations)) |
1292 | insertPass(TargetPassID: &RenameIndependentSubregsID, InsertedPassID: &GCNPreRAOptimizationsID); |
1293 | |
1294 | // This is not an essential optimization and it has a noticeable impact on |
1295 | // compilation time, so we only enable it from O2. |
1296 | if (TM->getOptLevel() > CodeGenOptLevel::Less) |
1297 | insertPass(TargetPassID: &MachineSchedulerID, InsertedPassID: &SIFormMemoryClausesID); |
1298 | |
1299 | // FIXME: when an instruction has a Killed operand, and the instruction is |
1300 | // inside a bundle, seems only the BUNDLE instruction appears as the Kills of |
1301 | // the register in LiveVariables, this would trigger a failure in verifier, |
1302 | // we should fix it and enable the verifier. |
1303 | if (OptVGPRLiveRange) |
1304 | insertPass(TargetPassID: &LiveVariablesID, InsertedPassID: &SIOptimizeVGPRLiveRangeID); |
1305 | // This must be run immediately after phi elimination and before |
1306 | // TwoAddressInstructions, otherwise the processing of the tied operand of |
1307 | // SI_ELSE will introduce a copy of the tied operand source after the else. |
1308 | insertPass(TargetPassID: &PHIEliminationID, InsertedPassID: &SILowerControlFlowID); |
1309 | |
1310 | if (EnableDCEInRA) |
1311 | insertPass(TargetPassID: &DetectDeadLanesID, InsertedPassID: &DeadMachineInstructionElimID); |
1312 | |
1313 | TargetPassConfig::addOptimizedRegAlloc(); |
1314 | } |
1315 | |
1316 | bool GCNPassConfig::addPreRewrite() { |
1317 | addPass(PassID: &SILowerWWMCopiesID); |
1318 | if (EnableRegReassign) |
1319 | addPass(PassID: &GCNNSAReassignID); |
1320 | return true; |
1321 | } |
1322 | |
1323 | FunctionPass *GCNPassConfig::createSGPRAllocPass(bool Optimized) { |
1324 | // Initialize the global default. |
1325 | llvm::call_once(flag&: InitializeDefaultSGPRRegisterAllocatorFlag, |
1326 | F&: initializeDefaultSGPRRegisterAllocatorOnce); |
1327 | |
1328 | RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault(); |
1329 | if (Ctor != useDefaultRegisterAllocator) |
1330 | return Ctor(); |
1331 | |
1332 | if (Optimized) |
1333 | return createGreedyRegisterAllocator(F: onlyAllocateSGPRs); |
1334 | |
1335 | return createFastRegisterAllocator(F: onlyAllocateSGPRs, ClearVirtRegs: false); |
1336 | } |
1337 | |
1338 | FunctionPass *GCNPassConfig::createVGPRAllocPass(bool Optimized) { |
1339 | // Initialize the global default. |
1340 | llvm::call_once(flag&: InitializeDefaultVGPRRegisterAllocatorFlag, |
1341 | F&: initializeDefaultVGPRRegisterAllocatorOnce); |
1342 | |
1343 | RegisterRegAlloc::FunctionPassCtor Ctor = VGPRRegisterRegAlloc::getDefault(); |
1344 | if (Ctor != useDefaultRegisterAllocator) |
1345 | return Ctor(); |
1346 | |
1347 | if (Optimized) |
1348 | return createGreedyVGPRRegisterAllocator(); |
1349 | |
1350 | return createFastVGPRRegisterAllocator(); |
1351 | } |
1352 | |
1353 | FunctionPass *GCNPassConfig::createRegAllocPass(bool Optimized) { |
1354 | llvm_unreachable("should not be used" ); |
1355 | } |
1356 | |
1357 | static const char RegAllocOptNotSupportedMessage[] = |
1358 | "-regalloc not supported with amdgcn. Use -sgpr-regalloc and -vgpr-regalloc" ; |
1359 | |
1360 | bool GCNPassConfig::addRegAssignAndRewriteFast() { |
1361 | if (!usingDefaultRegAlloc()) |
1362 | report_fatal_error(reason: RegAllocOptNotSupportedMessage); |
1363 | |
1364 | addPass(PassID: &GCNPreRALongBranchRegID); |
1365 | |
1366 | addPass(P: createSGPRAllocPass(Optimized: false)); |
1367 | |
1368 | // Equivalent of PEI for SGPRs. |
1369 | addPass(PassID: &SILowerSGPRSpillsID); |
1370 | addPass(PassID: &SIPreAllocateWWMRegsID); |
1371 | |
1372 | addPass(P: createVGPRAllocPass(Optimized: false)); |
1373 | |
1374 | addPass(PassID: &SILowerWWMCopiesID); |
1375 | return true; |
1376 | } |
1377 | |
1378 | bool GCNPassConfig::addRegAssignAndRewriteOptimized() { |
1379 | if (!usingDefaultRegAlloc()) |
1380 | report_fatal_error(reason: RegAllocOptNotSupportedMessage); |
1381 | |
1382 | addPass(PassID: &GCNPreRALongBranchRegID); |
1383 | |
1384 | addPass(P: createSGPRAllocPass(Optimized: true)); |
1385 | |
1386 | // Commit allocated register changes. This is mostly necessary because too |
1387 | // many things rely on the use lists of the physical registers, such as the |
1388 | // verifier. This is only necessary with allocators which use LiveIntervals, |
1389 | // since FastRegAlloc does the replacements itself. |
1390 | addPass(P: createVirtRegRewriter(ClearVirtRegs: false)); |
1391 | |
1392 | // Equivalent of PEI for SGPRs. |
1393 | addPass(PassID: &SILowerSGPRSpillsID); |
1394 | addPass(PassID: &SIPreAllocateWWMRegsID); |
1395 | |
1396 | addPass(P: createVGPRAllocPass(Optimized: true)); |
1397 | |
1398 | addPreRewrite(); |
1399 | addPass(PassID: &VirtRegRewriterID); |
1400 | |
1401 | addPass(PassID: &AMDGPUMarkLastScratchLoadID); |
1402 | |
1403 | return true; |
1404 | } |
1405 | |
1406 | void GCNPassConfig::addPostRegAlloc() { |
1407 | addPass(PassID: &SIFixVGPRCopiesID); |
1408 | if (getOptLevel() > CodeGenOptLevel::None) |
1409 | addPass(PassID: &SIOptimizeExecMaskingID); |
1410 | TargetPassConfig::addPostRegAlloc(); |
1411 | } |
1412 | |
1413 | void GCNPassConfig::addPreSched2() { |
1414 | if (TM->getOptLevel() > CodeGenOptLevel::None) |
1415 | addPass(P: createSIShrinkInstructionsPass()); |
1416 | addPass(PassID: &SIPostRABundlerID); |
1417 | } |
1418 | |
1419 | void GCNPassConfig::addPreEmitPass() { |
1420 | if (isPassEnabled(Opt: EnableVOPD, Level: CodeGenOptLevel::Less)) |
1421 | addPass(PassID: &GCNCreateVOPDID); |
1422 | addPass(P: createSIMemoryLegalizerPass()); |
1423 | addPass(P: createSIInsertWaitcntsPass()); |
1424 | |
1425 | addPass(P: createSIModeRegisterPass()); |
1426 | |
1427 | if (getOptLevel() > CodeGenOptLevel::None) |
1428 | addPass(PassID: &SIInsertHardClausesID); |
1429 | |
1430 | addPass(PassID: &SILateBranchLoweringPassID); |
1431 | if (isPassEnabled(Opt: EnableSetWavePriority, Level: CodeGenOptLevel::Less)) |
1432 | addPass(P: createAMDGPUSetWavePriorityPass()); |
1433 | if (getOptLevel() > CodeGenOptLevel::None) |
1434 | addPass(PassID: &SIPreEmitPeepholeID); |
1435 | // The hazard recognizer that runs as part of the post-ra scheduler does not |
1436 | // guarantee to be able handle all hazards correctly. This is because if there |
1437 | // are multiple scheduling regions in a basic block, the regions are scheduled |
1438 | // bottom up, so when we begin to schedule a region we don't know what |
1439 | // instructions were emitted directly before it. |
1440 | // |
1441 | // Here we add a stand-alone hazard recognizer pass which can handle all |
1442 | // cases. |
1443 | addPass(PassID: &PostRAHazardRecognizerID); |
1444 | |
1445 | if (isPassEnabled(Opt: EnableInsertSingleUseVDST, Level: CodeGenOptLevel::Less)) |
1446 | addPass(PassID: &AMDGPUInsertSingleUseVDSTID); |
1447 | |
1448 | if (isPassEnabled(Opt: EnableInsertDelayAlu, Level: CodeGenOptLevel::Less)) |
1449 | addPass(PassID: &AMDGPUInsertDelayAluID); |
1450 | |
1451 | addPass(PassID: &BranchRelaxationPassID); |
1452 | } |
1453 | |
1454 | TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) { |
1455 | return new GCNPassConfig(*this, PM); |
1456 | } |
1457 | |
1458 | void GCNTargetMachine::registerMachineRegisterInfoCallback( |
1459 | MachineFunction &MF) const { |
1460 | SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); |
1461 | MF.getRegInfo().addDelegate(delegate: MFI); |
1462 | } |
1463 | |
1464 | MachineFunctionInfo *GCNTargetMachine::createMachineFunctionInfo( |
1465 | BumpPtrAllocator &Allocator, const Function &F, |
1466 | const TargetSubtargetInfo *STI) const { |
1467 | return SIMachineFunctionInfo::create<SIMachineFunctionInfo>( |
1468 | Allocator, F, static_cast<const GCNSubtarget *>(STI)); |
1469 | } |
1470 | |
1471 | yaml::MachineFunctionInfo *GCNTargetMachine::createDefaultFuncInfoYAML() const { |
1472 | return new yaml::SIMachineFunctionInfo(); |
1473 | } |
1474 | |
1475 | yaml::MachineFunctionInfo * |
1476 | GCNTargetMachine::convertFuncInfoToYAML(const MachineFunction &MF) const { |
1477 | const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); |
1478 | return new yaml::SIMachineFunctionInfo( |
1479 | *MFI, *MF.getSubtarget<GCNSubtarget>().getRegisterInfo(), MF); |
1480 | } |
1481 | |
1482 | bool GCNTargetMachine::parseMachineFunctionInfo( |
1483 | const yaml::MachineFunctionInfo &MFI_, PerFunctionMIParsingState &PFS, |
1484 | SMDiagnostic &Error, SMRange &SourceRange) const { |
1485 | const yaml::SIMachineFunctionInfo &YamlMFI = |
1486 | static_cast<const yaml::SIMachineFunctionInfo &>(MFI_); |
1487 | MachineFunction &MF = PFS.MF; |
1488 | SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); |
1489 | const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); |
1490 | |
1491 | if (MFI->initializeBaseYamlFields(YamlMFI, MF, PFS, Error, SourceRange)) |
1492 | return true; |
1493 | |
1494 | if (MFI->Occupancy == 0) { |
1495 | // Fixup the subtarget dependent default value. |
1496 | MFI->Occupancy = ST.computeOccupancy(F: MF.getFunction(), LDSSize: MFI->getLDSSize()); |
1497 | } |
1498 | |
1499 | auto parseRegister = [&](const yaml::StringValue &RegName, Register &RegVal) { |
1500 | Register TempReg; |
1501 | if (parseNamedRegisterReference(PFS, Reg&: TempReg, Src: RegName.Value, Error)) { |
1502 | SourceRange = RegName.SourceRange; |
1503 | return true; |
1504 | } |
1505 | RegVal = TempReg; |
1506 | |
1507 | return false; |
1508 | }; |
1509 | |
1510 | auto parseOptionalRegister = [&](const yaml::StringValue &RegName, |
1511 | Register &RegVal) { |
1512 | return !RegName.Value.empty() && parseRegister(RegName, RegVal); |
1513 | }; |
1514 | |
1515 | if (parseOptionalRegister(YamlMFI.VGPRForAGPRCopy, MFI->VGPRForAGPRCopy)) |
1516 | return true; |
1517 | |
1518 | if (parseOptionalRegister(YamlMFI.SGPRForEXECCopy, MFI->SGPRForEXECCopy)) |
1519 | return true; |
1520 | |
1521 | if (parseOptionalRegister(YamlMFI.LongBranchReservedReg, |
1522 | MFI->LongBranchReservedReg)) |
1523 | return true; |
1524 | |
1525 | auto diagnoseRegisterClass = [&](const yaml::StringValue &RegName) { |
1526 | // Create a diagnostic for a the register string literal. |
1527 | const MemoryBuffer &Buffer = |
1528 | *PFS.SM->getMemoryBuffer(i: PFS.SM->getMainFileID()); |
1529 | Error = SMDiagnostic(*PFS.SM, SMLoc(), Buffer.getBufferIdentifier(), 1, |
1530 | RegName.Value.size(), SourceMgr::DK_Error, |
1531 | "incorrect register class for field" , RegName.Value, |
1532 | std::nullopt, std::nullopt); |
1533 | SourceRange = RegName.SourceRange; |
1534 | return true; |
1535 | }; |
1536 | |
1537 | if (parseRegister(YamlMFI.ScratchRSrcReg, MFI->ScratchRSrcReg) || |
1538 | parseRegister(YamlMFI.FrameOffsetReg, MFI->FrameOffsetReg) || |
1539 | parseRegister(YamlMFI.StackPtrOffsetReg, MFI->StackPtrOffsetReg)) |
1540 | return true; |
1541 | |
1542 | if (MFI->ScratchRSrcReg != AMDGPU::PRIVATE_RSRC_REG && |
1543 | !AMDGPU::SGPR_128RegClass.contains(MFI->ScratchRSrcReg)) { |
1544 | return diagnoseRegisterClass(YamlMFI.ScratchRSrcReg); |
1545 | } |
1546 | |
1547 | if (MFI->FrameOffsetReg != AMDGPU::FP_REG && |
1548 | !AMDGPU::SGPR_32RegClass.contains(MFI->FrameOffsetReg)) { |
1549 | return diagnoseRegisterClass(YamlMFI.FrameOffsetReg); |
1550 | } |
1551 | |
1552 | if (MFI->StackPtrOffsetReg != AMDGPU::SP_REG && |
1553 | !AMDGPU::SGPR_32RegClass.contains(MFI->StackPtrOffsetReg)) { |
1554 | return diagnoseRegisterClass(YamlMFI.StackPtrOffsetReg); |
1555 | } |
1556 | |
1557 | for (const auto &YamlReg : YamlMFI.WWMReservedRegs) { |
1558 | Register ParsedReg; |
1559 | if (parseRegister(YamlReg, ParsedReg)) |
1560 | return true; |
1561 | |
1562 | MFI->reserveWWMRegister(Reg: ParsedReg); |
1563 | } |
1564 | |
1565 | auto parseAndCheckArgument = [&](const std::optional<yaml::SIArgument> &A, |
1566 | const TargetRegisterClass &RC, |
1567 | ArgDescriptor &Arg, unsigned UserSGPRs, |
1568 | unsigned SystemSGPRs) { |
1569 | // Skip parsing if it's not present. |
1570 | if (!A) |
1571 | return false; |
1572 | |
1573 | if (A->IsRegister) { |
1574 | Register Reg; |
1575 | if (parseNamedRegisterReference(PFS, Reg, Src: A->RegisterName.Value, Error)) { |
1576 | SourceRange = A->RegisterName.SourceRange; |
1577 | return true; |
1578 | } |
1579 | if (!RC.contains(Reg)) |
1580 | return diagnoseRegisterClass(A->RegisterName); |
1581 | Arg = ArgDescriptor::createRegister(Reg); |
1582 | } else |
1583 | Arg = ArgDescriptor::createStack(Offset: A->StackOffset); |
1584 | // Check and apply the optional mask. |
1585 | if (A->Mask) |
1586 | Arg = ArgDescriptor::createArg(Arg, Mask: *A->Mask); |
1587 | |
1588 | MFI->NumUserSGPRs += UserSGPRs; |
1589 | MFI->NumSystemSGPRs += SystemSGPRs; |
1590 | return false; |
1591 | }; |
1592 | |
1593 | if (YamlMFI.ArgInfo && |
1594 | (parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentBuffer, |
1595 | AMDGPU::SGPR_128RegClass, |
1596 | MFI->ArgInfo.PrivateSegmentBuffer, 4, 0) || |
1597 | parseAndCheckArgument(YamlMFI.ArgInfo->DispatchPtr, |
1598 | AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchPtr, |
1599 | 2, 0) || |
1600 | parseAndCheckArgument(YamlMFI.ArgInfo->QueuePtr, AMDGPU::SReg_64RegClass, |
1601 | MFI->ArgInfo.QueuePtr, 2, 0) || |
1602 | parseAndCheckArgument(YamlMFI.ArgInfo->KernargSegmentPtr, |
1603 | AMDGPU::SReg_64RegClass, |
1604 | MFI->ArgInfo.KernargSegmentPtr, 2, 0) || |
1605 | parseAndCheckArgument(YamlMFI.ArgInfo->DispatchID, |
1606 | AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchID, |
1607 | 2, 0) || |
1608 | parseAndCheckArgument(YamlMFI.ArgInfo->FlatScratchInit, |
1609 | AMDGPU::SReg_64RegClass, |
1610 | MFI->ArgInfo.FlatScratchInit, 2, 0) || |
1611 | parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentSize, |
1612 | AMDGPU::SGPR_32RegClass, |
1613 | MFI->ArgInfo.PrivateSegmentSize, 0, 0) || |
1614 | parseAndCheckArgument(YamlMFI.ArgInfo->LDSKernelId, |
1615 | AMDGPU::SGPR_32RegClass, |
1616 | MFI->ArgInfo.LDSKernelId, 0, 1) || |
1617 | parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDX, |
1618 | AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDX, |
1619 | 0, 1) || |
1620 | parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDY, |
1621 | AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDY, |
1622 | 0, 1) || |
1623 | parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDZ, |
1624 | AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDZ, |
1625 | 0, 1) || |
1626 | parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupInfo, |
1627 | AMDGPU::SGPR_32RegClass, |
1628 | MFI->ArgInfo.WorkGroupInfo, 0, 1) || |
1629 | parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentWaveByteOffset, |
1630 | AMDGPU::SGPR_32RegClass, |
1631 | MFI->ArgInfo.PrivateSegmentWaveByteOffset, 0, 1) || |
1632 | parseAndCheckArgument(YamlMFI.ArgInfo->ImplicitArgPtr, |
1633 | AMDGPU::SReg_64RegClass, |
1634 | MFI->ArgInfo.ImplicitArgPtr, 0, 0) || |
1635 | parseAndCheckArgument(YamlMFI.ArgInfo->ImplicitBufferPtr, |
1636 | AMDGPU::SReg_64RegClass, |
1637 | MFI->ArgInfo.ImplicitBufferPtr, 2, 0) || |
1638 | parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDX, |
1639 | AMDGPU::VGPR_32RegClass, |
1640 | MFI->ArgInfo.WorkItemIDX, 0, 0) || |
1641 | parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDY, |
1642 | AMDGPU::VGPR_32RegClass, |
1643 | MFI->ArgInfo.WorkItemIDY, 0, 0) || |
1644 | parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDZ, |
1645 | AMDGPU::VGPR_32RegClass, |
1646 | MFI->ArgInfo.WorkItemIDZ, 0, 0))) |
1647 | return true; |
1648 | |
1649 | if (ST.hasIEEEMode()) |
1650 | MFI->Mode.IEEE = YamlMFI.Mode.IEEE; |
1651 | if (ST.hasDX10ClampMode()) |
1652 | MFI->Mode.DX10Clamp = YamlMFI.Mode.DX10Clamp; |
1653 | |
1654 | // FIXME: Move proper support for denormal-fp-math into base MachineFunction |
1655 | MFI->Mode.FP32Denormals.Input = YamlMFI.Mode.FP32InputDenormals |
1656 | ? DenormalMode::IEEE |
1657 | : DenormalMode::PreserveSign; |
1658 | MFI->Mode.FP32Denormals.Output = YamlMFI.Mode.FP32OutputDenormals |
1659 | ? DenormalMode::IEEE |
1660 | : DenormalMode::PreserveSign; |
1661 | |
1662 | MFI->Mode.FP64FP16Denormals.Input = YamlMFI.Mode.FP64FP16InputDenormals |
1663 | ? DenormalMode::IEEE |
1664 | : DenormalMode::PreserveSign; |
1665 | MFI->Mode.FP64FP16Denormals.Output = YamlMFI.Mode.FP64FP16OutputDenormals |
1666 | ? DenormalMode::IEEE |
1667 | : DenormalMode::PreserveSign; |
1668 | |
1669 | return false; |
1670 | } |
1671 | |