//===-- AMDGPULowerModuleLDSPass.cpp ------------------------------*- C++ -*-=//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass eliminates local data store, LDS, uses from non-kernel functions.
// LDS is contiguous memory allocated per kernel execution.
//
// Background.
//
// The programming model is global variables, or equivalently function local
// static variables, accessible from kernels or other functions. For uses from
// kernels this is straightforward - assign an integer to the kernel for the
// memory required by all the variables combined, allocate them within that.
// For uses from functions there are performance tradeoffs to choose between.
//
// This model means the GPU runtime can specify the amount of memory allocated.
// If this is more than the kernel assumed, the excess can be made available
// using a language specific feature, which IR represents as a variable with
// no initializer. This feature is referred to here as "Dynamic LDS" and is
// lowered slightly differently to the normal case.
//
// Consequences of this GPU feature:
// - memory is limited and exceeding it halts compilation
// - a global accessed by one kernel exists independent of other kernels
// - a global exists independent of simultaneous execution of the same kernel
// - the address of the global may be different from different kernels as they
//   do not alias, which permits only allocating variables they use
// - if the address is allowed to differ, functions need help to find it
//
// Uses from kernels are implemented here by grouping them in a per-kernel
// struct instance. This duplicates the variables, accurately modelling their
// aliasing properties relative to a single global representation. It also
// permits control over alignment via padding.
//
// Uses from functions are more complicated and the primary purpose of this
// IR pass. Several different lowerings are chosen between to meet requirements
// to avoid allocating any LDS where it is not necessary, as that impacts
// occupancy and may fail the compilation, while not imposing overhead on a
// feature whose primary advantage over global memory is performance. The basic
// design goal is to avoid one kernel imposing overhead on another.
//
// Implementation.
//
// LDS variables with constant annotation or non-undef initializer are passed
// through unchanged for simplification or error diagnostics in later passes.
// Non-undef initializers are not yet implemented for LDS.
//
// LDS variables that are always allocated at the same address can be found
// by lookup at that address. Otherwise runtime information/cost is required.
//
// The simplest strategy possible is to group all LDS variables in a single
// struct and allocate that struct in every kernel such that the original
// variables are always at the same address. LDS is however a limited resource
// so this strategy is unusable in practice. It is not implemented here.
//
// Strategy | Precise allocation | Zero runtime cost | General purpose |
//  --------+--------------------+-------------------+-----------------+
//   Module |                 No |               Yes |             Yes |
//    Table |                Yes |                No |             Yes |
//   Kernel |                Yes |               Yes |              No |
//   Hybrid |                Yes |           Partial |             Yes |
//
// "Module" spends LDS memory to save cycles. "Table" spends cycles and global
// memory to save LDS. "Kernel" is as fast as kernel allocation but only works
// for variables that are known reachable from a single kernel. "Hybrid" picks
// between all three. When forced to choose between LDS and cycles we minimise
// LDS use.

// The "module" lowering implemented here finds LDS variables which are used by
// non-kernel functions and creates a new struct with a field for each of those
// LDS variables. Variables that are only used from kernels are excluded.
//
// The "table" lowering implemented here has three components.
// First kernels are assigned a unique integer identifier which is available in
// functions it calls through the intrinsic amdgcn_lds_kernel_id. The integer
// is passed through a specific SGPR, thus works with indirect calls.
// Second, each kernel allocates LDS variables independent of other kernels and
// writes the addresses it chose for each variable into an array in consistent
// order. If the kernel does not allocate a given variable, it writes undef to
// the corresponding array location. These arrays are written to a constant
// table in the order matching the kernel unique integer identifier.
// Third, uses from non-kernel functions are replaced with a table lookup using
// the intrinsic function to find the address of the variable.
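// As an illustrative sketch only (names and array sizes here are hypothetical,
// not verbatim output of this pass), a non-kernel use of an LDS variable @x
// lowered via table looks roughly like:
//   %id  = call i32 @llvm.amdgcn.lds.kernel.id()
//   %gep = getelementptr inbounds [K x [V x i32]], ptr addrspace(4)
//            @llvm.amdgcn.lds.offset.table, i32 0, i32 %id, i32 <index of @x>
//   %off = load i32, ptr addrspace(4) %gep
//   %ptr = inttoptr i32 %off to ptr addrspace(3)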
//
// "Kernel" lowering is only applicable for variables that are unambiguously
// reachable from exactly one kernel. For those cases, accesses to the variable
// can be lowered to ConstantExpr address of a struct instance specific to that
// one kernel. This is zero cost in space and in compute. It will raise a fatal
// error on any variable that might be reachable from multiple kernels and is
// thus most easily used as part of the hybrid lowering strategy.
//
// Hybrid lowering is a mixture of the above. It uses the zero cost kernel
// lowering where it can. It lowers the variable accessed by the greatest
// number of kernels using the module strategy as that is free for the first
// variable. Any further variables that can be lowered with the module strategy
// without incurring LDS memory overhead are. The remaining ones are lowered
// via table.
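// For example (hypothetical module): given kernels k0 and k1, a variable v0
// reachable indirectly from both and a variable v1 reachable only from k0,
// hybrid places v0 in the module struct (it has the most users and the first
// module variable is free) and lowers v1 with the kernel strategy.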
//
// Consequences
// - No heuristics or user controlled magic numbers, hybrid is the right choice
// - Kernels that don't use functions (or have had them all inlined) are not
//   affected by any lowering for kernels that do.
// - Kernels that don't make indirect function calls are not affected by those
//   that do.
// - Variables which are used by lots of kernels, e.g. those injected by a
//   language runtime in most kernels, are expected to have no overhead
// - Implementations that instantiate templates per-kernel where those templates
//   use LDS are expected to hit the "Kernel" lowering strategy
// - The runtime properties impose a cost in compiler implementation complexity
//
// Dynamic LDS implementation
// Dynamic LDS is lowered similarly to the "table" strategy above and uses the
// same intrinsic to identify which kernel is at the root of the dynamic call
// graph. This relies on the specified behaviour that all dynamic LDS variables
// alias one another, i.e. are at the same address, with respect to a given
// kernel. Therefore this pass creates new dynamic LDS variables for each kernel
// that allocates any dynamic LDS and builds a table of addresses out of those.
// The AMDGPUPromoteAlloca pass skips kernels that use dynamic LDS.
// The corresponding optimisation for "kernel" lowering where the table lookup
// is elided is not implemented.
//
//
// Implementation notes / limitations
// A single LDS global variable represents an instance per kernel that can reach
// said variables. This pass essentially specialises said variables per kernel.
// Handling ConstantExpr during the pass complicated this significantly so now
// all ConstantExpr uses of LDS variables are expanded to instructions. This
// may need amending when implementing non-undef initialisers.
//
// Lowering is split between this IR pass and the back end. This pass chooses
// where given variables should be allocated and marks them with metadata,
// MD_absolute_symbol. The backend places the variables in coincidentally the
// same location and raises a fatal error if something has gone awry. This works
// in practice because the only pass between this one and the backend that
// changes LDS is PromoteAlloca and the changes it makes do not conflict.
//
// Addresses are written to constant global arrays based on the same metadata.
//
// The backend lowers LDS variables in the order of traversal of the function.
// This is at odds with the deterministic layout required. The workaround is to
// allocate the fixed-address variables immediately upon starting the function
// where they can be placed as intended. This requires a means of mapping from
// the function to the variables that it allocates. For the module scope lds,
// this is via metadata indicating whether the variable is not required. If a
// pass deletes that metadata, a fatal error on disagreement with the absolute
// symbol metadata will occur. For kernel scope and dynamic, this is by _name_
// correspondence between the function and the variable. It requires the
// kernel to have a name (which is only a limitation for tests in practice) and
// for nothing to rename the corresponding symbols. This is a hazard if the pass
// is run multiple times during debugging. Alternative schemes considered all
// involve bespoke metadata.
//
// If the name correspondence can be replaced, multiple distinct kernels that
// have the same memory layout can map to the same kernel id (as the address
// itself is handled by the absolute symbol metadata) and that will allow more
// uses of the "kernel" style faster lowering and reduce the size of the lookup
// tables.
//
// There is a test that checks this does not fire for a graphics shader. This
// lowering is expected to work for graphics if the isKernel test is changed.
//
// The current markUsedByKernel is sufficient for PromoteAlloca but is elided
// before codegen. Replacing this with an equivalent intrinsic which lasts until
// shortly after the machine function lowering of LDS would help break the name
// mapping. The other part needed is probably to amend PromoteAlloca to embed
// the LDS variables it creates in the same struct created here. That avoids the
// current hazard where a PromoteAlloca LDS variable might be allocated before
// the kernel scope (and thus error on the address check). Given a new invariant
// that no LDS variables exist outside of the structs managed here, and an
// intrinsic that lasts until after the LDS frame lowering, it should be
// possible to drop the name mapping and fold equivalent memory layouts.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "Utils/AMDGPUMemoryUtils.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetOperations.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/ReplaceConstant.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/OptimizedStructLayout.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/ModuleUtils.h"

#include <vector>

#include <cstdio>

#define DEBUG_TYPE "amdgpu-lower-module-lds"

using namespace llvm;

namespace {

cl::opt<bool> SuperAlignLDSGlobals(
    "amdgpu-super-align-lds-globals",
    cl::desc("Increase alignment of LDS if it is not on align boundary"),
    cl::init(true), cl::Hidden);

enum class LoweringKind { module, table, kernel, hybrid };
cl::opt<LoweringKind> LoweringKindLoc(
    "amdgpu-lower-module-lds-strategy",
    cl::desc("Specify lowering strategy for function LDS access:"), cl::Hidden,
    cl::init(LoweringKind::hybrid),
    cl::values(
        clEnumValN(LoweringKind::table, "table", "Lower via table lookup"),
        clEnumValN(LoweringKind::module, "module", "Lower via module struct"),
        clEnumValN(
            LoweringKind::kernel, "kernel",
            "Lower variables reachable from one kernel, otherwise abort"),
        clEnumValN(LoweringKind::hybrid, "hybrid",
                   "Lower via mixture of above strategies")));

bool isKernelLDS(const Function *F) {
  // Some weirdness here. AMDGPU::isKernelCC does not call into
  // AMDGPU::isKernel with the calling conv, it instead calls into
  // isModuleEntryFunction which returns true for more calling conventions
  // than AMDGPU::isKernel does. There's a FIXME on AMDGPU::isKernel.
  // There's also a test that checks that the LDS lowering does not hit on
  // a graphics shader, denoted amdgpu_ps, so stay with the limited case.
  // Putting LDS in the name of the function to draw attention to this.
  return AMDGPU::isKernel(F->getCallingConv());
}

template <typename T> std::vector<T> sortByName(std::vector<T> &&V) {
  llvm::sort(V.begin(), V.end(), [](const auto *L, const auto *R) {
    return L->getName() < R->getName();
  });
  return {std::move(V)};
}

class AMDGPULowerModuleLDS {
  const AMDGPUTargetMachine &TM;

  static void
  removeLocalVarsFromUsedLists(Module &M,
                               const DenseSet<GlobalVariable *> &LocalVars) {
    // The verifier rejects used lists containing an inttoptr of a constant
    // so remove the variables from these lists before replaceAllUsesWith
    SmallPtrSet<Constant *, 8> LocalVarsSet;
    for (GlobalVariable *LocalVar : LocalVars)
      LocalVarsSet.insert(cast<Constant>(LocalVar->stripPointerCasts()));

    removeFromUsedLists(
        M, [&LocalVarsSet](Constant *C) { return LocalVarsSet.count(C); });

    for (GlobalVariable *LocalVar : LocalVars)
      LocalVar->removeDeadConstantUsers();
  }

  static void markUsedByKernel(Function *Func, GlobalVariable *SGV) {
    // The llvm.amdgcn.module.lds instance is implicitly used by all kernels
    // that might call a function which accesses a field within it. This is
    // presently approximated to 'all kernels' if there are any such functions
    // in the module. This implicit use is redefined as an explicit use here so
    // that later passes, specifically PromoteAlloca, account for the required
    // memory without any knowledge of this transform.

    // An operand bundle on llvm.donothing works because the call instruction
    // survives until after the last pass that needs to account for LDS. It is
    // better than inline asm as the latter survives until the end of codegen. A
    // totally robust solution would be a function with the same semantics as
    // llvm.donothing that takes a pointer to the instance and is lowered to a
    // no-op after LDS is allocated, but that is not presently necessary.

    // This intrinsic is eliminated shortly before instruction selection. It
    // does not suffice to indicate to ISel that a given global which is not
    // immediately used by the kernel must still be allocated by it. An
    // equivalent target specific intrinsic which lasts until immediately after
    // codegen would suffice for that, but one would still need to ensure that
    // the variables are allocated in the anticipated order.
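
    // As a sketch (illustrative IR shape, not verbatim output), the marker
    // emitted below looks roughly like:
    //   %gep = getelementptr inbounds %lds.t, ptr addrspace(3) @SGV, i32 0
    //   call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) %gep) ]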
    BasicBlock *Entry = &Func->getEntryBlock();
    IRBuilder<> Builder(Entry, Entry->getFirstNonPHIIt());

    Function *Decl =
        Intrinsic::getDeclaration(Func->getParent(), Intrinsic::donothing, {});

    Value *UseInstance[1] = {
        Builder.CreateConstInBoundsGEP1_32(SGV->getValueType(), SGV, 0)};

    Builder.CreateCall(
        Decl, {}, {OperandBundleDefT<Value *>("ExplicitUse", UseInstance)});
  }

  static bool eliminateConstantExprUsesOfLDSFromAllInstructions(Module &M) {
    // Constants are uniqued within LLVM. A ConstantExpr referring to a LDS
    // global may have uses from multiple different functions as a result.
    // This pass specialises LDS variables with respect to the kernel that
    // allocates them.

    // This is semantically equivalent to (the unimplemented as slow):
    // for (auto &F : M.functions())
    //   for (auto &BB : F)
    //     for (auto &I : BB)
    //       for (Use &Op : I.operands())
    //         if (constantExprUsesLDS(Op))
    //           replaceConstantExprInFunction(I, Op);

    SmallVector<Constant *> LDSGlobals;
    for (auto &GV : M.globals())
      if (AMDGPU::isLDSVariableToLower(GV))
        LDSGlobals.push_back(&GV);

    return convertUsersOfConstantsToInstructions(LDSGlobals);
  }

public:
  AMDGPULowerModuleLDS(const AMDGPUTargetMachine &TM_) : TM(TM_) {}

  using FunctionVariableMap = DenseMap<Function *, DenseSet<GlobalVariable *>>;

  using VariableFunctionMap = DenseMap<GlobalVariable *, DenseSet<Function *>>;

  static void getUsesOfLDSByFunction(CallGraph const &CG, Module &M,
                                     FunctionVariableMap &kernels,
                                     FunctionVariableMap &functions) {

    // Get uses from the current function, excluding uses by called functions
    // Two output variables to avoid walking the globals list twice
    for (auto &GV : M.globals()) {
      if (!AMDGPU::isLDSVariableToLower(GV)) {
        continue;
      }

      for (User *V : GV.users()) {
        if (auto *I = dyn_cast<Instruction>(V)) {
          Function *F = I->getFunction();
          if (isKernelLDS(F)) {
            kernels[F].insert(&GV);
          } else {
            functions[F].insert(&GV);
          }
        }
      }
    }
  }

  struct LDSUsesInfoTy {
    FunctionVariableMap direct_access;
    FunctionVariableMap indirect_access;
  };

  static LDSUsesInfoTy getTransitiveUsesOfLDS(CallGraph const &CG, Module &M) {

    FunctionVariableMap direct_map_kernel;
    FunctionVariableMap direct_map_function;
    getUsesOfLDSByFunction(CG, M, direct_map_kernel, direct_map_function);

    // Collect variables that are used by functions whose address has escaped
    DenseSet<GlobalVariable *> VariablesReachableThroughFunctionPointer;
    for (Function &F : M.functions()) {
      if (!isKernelLDS(&F))
        if (F.hasAddressTaken(nullptr,
                              /* IgnoreCallbackUses */ false,
                              /* IgnoreAssumeLikeCalls */ false,
                              /* IgnoreLLVMUsed */ true,
                              /* IgnoreArcAttachedCall */ false)) {
          set_union(VariablesReachableThroughFunctionPointer,
                    direct_map_function[&F]);
        }
    }

    auto functionMakesUnknownCall = [&](const Function *F) -> bool {
      assert(!F->isDeclaration());
      for (const CallGraphNode::CallRecord &R : *CG[F]) {
        if (!R.second->getFunction()) {
          return true;
        }
      }
      return false;
    };

    // Work out which variables are reachable through function calls
    FunctionVariableMap transitive_map_function = direct_map_function;

    // If the function makes any unknown call, assume the worst case that it can
    // access all variables accessed by functions whose address escaped
    for (Function &F : M.functions()) {
      if (!F.isDeclaration() && functionMakesUnknownCall(&F)) {
        if (!isKernelLDS(&F)) {
          set_union(transitive_map_function[&F],
                    VariablesReachableThroughFunctionPointer);
        }
      }
    }

    // Direct implementation of collecting all variables reachable from each
    // function
    for (Function &Func : M.functions()) {
      if (Func.isDeclaration() || isKernelLDS(&Func))
        continue;

      DenseSet<Function *> seen; // catches cycles
      SmallVector<Function *, 4> wip{&Func};

      while (!wip.empty()) {
        Function *F = wip.pop_back_val();

        // Can accelerate this by referring to transitive map for functions that
        // have already been computed, with more care than this
        set_union(transitive_map_function[&Func], direct_map_function[F]);

        for (const CallGraphNode::CallRecord &R : *CG[F]) {
          Function *ith = R.second->getFunction();
          if (ith) {
            if (!seen.contains(ith)) {
              seen.insert(ith);
              wip.push_back(ith);
            }
          }
        }
      }
    }

    // direct_map_kernel lists which variables are used by the kernel
    // find the variables which are used through a function call
    FunctionVariableMap indirect_map_kernel;

    for (Function &Func : M.functions()) {
      if (Func.isDeclaration() || !isKernelLDS(&Func))
        continue;

      for (const CallGraphNode::CallRecord &R : *CG[&Func]) {
        Function *ith = R.second->getFunction();
        if (ith) {
          set_union(indirect_map_kernel[&Func], transitive_map_function[ith]);
        } else {
          set_union(indirect_map_kernel[&Func],
                    VariablesReachableThroughFunctionPointer);
        }
      }
    }

    // Verify that we fall into one of 2 cases:
    // - All variables are absolute: this is a re-run of the pass
    //   so we don't have anything to do.
    // - No variables are absolute.
    std::optional<bool> HasAbsoluteGVs;
    for (auto &Map : {direct_map_kernel, indirect_map_kernel}) {
      for (auto &[Fn, GVs] : Map) {
        for (auto *GV : GVs) {
          bool IsAbsolute = GV->isAbsoluteSymbolRef();
          if (HasAbsoluteGVs.has_value()) {
            if (*HasAbsoluteGVs != IsAbsolute) {
              report_fatal_error(
                  "Module cannot mix absolute and non-absolute LDS GVs");
            }
          } else
            HasAbsoluteGVs = IsAbsolute;
        }
      }
    }

    // If we only had absolute GVs, we have nothing to do, return an empty
    // result.
    if (HasAbsoluteGVs && *HasAbsoluteGVs)
      return {FunctionVariableMap(), FunctionVariableMap()};

    return {std::move(direct_map_kernel), std::move(indirect_map_kernel)};
  }

  struct LDSVariableReplacement {
    GlobalVariable *SGV = nullptr;
    DenseMap<GlobalVariable *, Constant *> LDSVarsToConstantGEP;
  };

  // remap from lds global to a constantexpr gep to where it has been moved to
  // for each kernel
  // an array with an element for each kernel containing where the corresponding
  // variable was remapped to

  static Constant *getAddressesOfVariablesInKernel(
      LLVMContext &Ctx, ArrayRef<GlobalVariable *> Variables,
      const DenseMap<GlobalVariable *, Constant *> &LDSVarsToConstantGEP) {
    // Create a ConstantArray containing the address of each Variable within the
    // kernel corresponding to LDSVarsToConstantGEP, or poison if that kernel
    // does not allocate it
    // TODO: Drop the ptrtoint conversion

    Type *I32 = Type::getInt32Ty(Ctx);

    ArrayType *KernelOffsetsType = ArrayType::get(I32, Variables.size());

    SmallVector<Constant *> Elements;
    for (size_t i = 0; i < Variables.size(); i++) {
      GlobalVariable *GV = Variables[i];
      auto ConstantGepIt = LDSVarsToConstantGEP.find(GV);
      if (ConstantGepIt != LDSVarsToConstantGEP.end()) {
        auto elt = ConstantExpr::getPtrToInt(ConstantGepIt->second, I32);
        Elements.push_back(elt);
      } else {
        Elements.push_back(PoisonValue::get(I32));
      }
    }
    return ConstantArray::get(KernelOffsetsType, Elements);
  }

  static GlobalVariable *buildLookupTable(
      Module &M, ArrayRef<GlobalVariable *> Variables,
      ArrayRef<Function *> kernels,
      DenseMap<Function *, LDSVariableReplacement> &KernelToReplacement) {
    if (Variables.empty()) {
      return nullptr;
    }
    LLVMContext &Ctx = M.getContext();

    const size_t NumberVariables = Variables.size();
    const size_t NumberKernels = kernels.size();

    ArrayType *KernelOffsetsType =
        ArrayType::get(Type::getInt32Ty(Ctx), NumberVariables);

    ArrayType *AllKernelsOffsetsType =
        ArrayType::get(KernelOffsetsType, NumberKernels);

    Constant *Missing = PoisonValue::get(KernelOffsetsType);
    std::vector<Constant *> overallConstantExprElts(NumberKernels);
    for (size_t i = 0; i < NumberKernels; i++) {
      auto Replacement = KernelToReplacement.find(kernels[i]);
      overallConstantExprElts[i] =
          (Replacement == KernelToReplacement.end())
              ? Missing
              : getAddressesOfVariablesInKernel(
                    Ctx, Variables, Replacement->second.LDSVarsToConstantGEP);
    }

    Constant *init =
        ConstantArray::get(AllKernelsOffsetsType, overallConstantExprElts);

    return new GlobalVariable(
        M, AllKernelsOffsetsType, true, GlobalValue::InternalLinkage, init,
        "llvm.amdgcn.lds.offset.table", nullptr, GlobalValue::NotThreadLocal,
        AMDGPUAS::CONSTANT_ADDRESS);
  }

  void replaceUseWithTableLookup(Module &M, IRBuilder<> &Builder,
                                 GlobalVariable *LookupTable,
                                 GlobalVariable *GV, Use &U,
                                 Value *OptionalIndex) {
    // Table is a constant array of the same length as OrderedKernels
    LLVMContext &Ctx = M.getContext();
    Type *I32 = Type::getInt32Ty(Ctx);
    auto *I = cast<Instruction>(U.getUser());

    Value *tableKernelIndex = getTableLookupKernelIndex(M, I->getFunction());

    if (auto *Phi = dyn_cast<PHINode>(I)) {
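      // A PHI's operand must be available at the end of its incoming block,
      // so emit the lookup there rather than immediately before the PHI.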
      BasicBlock *BB = Phi->getIncomingBlock(U);
      Builder.SetInsertPoint(&(*(BB->getFirstInsertionPt())));
    } else {
      Builder.SetInsertPoint(I);
    }

    SmallVector<Value *, 3> GEPIdx = {
        ConstantInt::get(I32, 0),
        tableKernelIndex,
    };
    if (OptionalIndex)
      GEPIdx.push_back(OptionalIndex);

    Value *Address = Builder.CreateInBoundsGEP(
        LookupTable->getValueType(), LookupTable, GEPIdx, GV->getName());

    Value *loaded = Builder.CreateLoad(I32, Address);

    Value *replacement =
        Builder.CreateIntToPtr(loaded, GV->getType(), GV->getName());

    U.set(replacement);
  }

  void replaceUsesInInstructionsWithTableLookup(
      Module &M, ArrayRef<GlobalVariable *> ModuleScopeVariables,
      GlobalVariable *LookupTable) {

    LLVMContext &Ctx = M.getContext();
    IRBuilder<> Builder(Ctx);
    Type *I32 = Type::getInt32Ty(Ctx);

    for (size_t Index = 0; Index < ModuleScopeVariables.size(); Index++) {
      auto *GV = ModuleScopeVariables[Index];

      for (Use &U : make_early_inc_range(GV->uses())) {
        auto *I = dyn_cast<Instruction>(U.getUser());
        if (!I)
          continue;

        replaceUseWithTableLookup(M, Builder, LookupTable, GV, U,
                                  ConstantInt::get(I32, Index));
      }
    }
  }

  static DenseSet<Function *> kernelsThatIndirectlyAccessAnyOfPassedVariables(
      Module &M, LDSUsesInfoTy &LDSUsesInfo,
      DenseSet<GlobalVariable *> const &VariableSet) {

    DenseSet<Function *> KernelSet;

    if (VariableSet.empty())
      return KernelSet;

    for (Function &Func : M.functions()) {
      if (Func.isDeclaration() || !isKernelLDS(&Func))
        continue;
      for (GlobalVariable *GV : LDSUsesInfo.indirect_access[&Func]) {
        if (VariableSet.contains(GV)) {
          KernelSet.insert(&Func);
          break;
        }
      }
    }

    return KernelSet;
  }

  static GlobalVariable *
  chooseBestVariableForModuleStrategy(const DataLayout &DL,
                                      VariableFunctionMap &LDSVars) {
    // Find the global variable with the most indirect uses from kernels

    struct CandidateTy {
      GlobalVariable *GV = nullptr;
      size_t UserCount = 0;
      size_t Size = 0;

      CandidateTy() = default;

      CandidateTy(GlobalVariable *GV, uint64_t UserCount, uint64_t AllocSize)
          : GV(GV), UserCount(UserCount), Size(AllocSize) {}

      bool operator<(const CandidateTy &Other) const {
        // Fewer users makes module scope variable less attractive
        if (UserCount < Other.UserCount) {
          return true;
        }
        if (UserCount > Other.UserCount) {
          return false;
        }

        // Bigger makes module scope variable less attractive
        if (Size < Other.Size) {
          return false;
        }

        if (Size > Other.Size) {
          return true;
        }

        // Arbitrary but consistent
        return GV->getName() < Other.GV->getName();
      }
    };

    CandidateTy MostUsed;

    for (auto &K : LDSVars) {
      GlobalVariable *GV = K.first;
      if (K.second.size() <= 1) {
        // A variable reachable by only one kernel is best lowered with kernel
        // strategy
        continue;
      }
      CandidateTy Candidate(
          GV, K.second.size(),
          DL.getTypeAllocSize(GV->getValueType()).getFixedValue());
      if (MostUsed < Candidate)
        MostUsed = Candidate;
    }

    return MostUsed.GV;
  }

  static void recordLDSAbsoluteAddress(Module *M, GlobalVariable *GV,
                                       uint32_t Address) {
    // Write the specified address into metadata where it can be retrieved by
    // the assembler. Format is a half open range, [Address Address+1)
    LLVMContext &Ctx = M->getContext();
    auto *IntTy =
        M->getDataLayout().getIntPtrType(Ctx, AMDGPUAS::LOCAL_ADDRESS);
    auto *MinC = ConstantAsMetadata::get(ConstantInt::get(IntTy, Address));
    auto *MaxC = ConstantAsMetadata::get(ConstantInt::get(IntTy, Address + 1));
    GV->setMetadata(LLVMContext::MD_absolute_symbol,
                    MDNode::get(Ctx, {MinC, MaxC}));
  }
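
  // For example (illustrative IR, not verbatim compiler output), recording
  // address 8 for a variable @v yields roughly:
  //   @v = ... !absolute_symbol !0
  //   !0 = !{i32 8, i32 9}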

  DenseMap<Function *, Value *> tableKernelIndexCache;
  Value *getTableLookupKernelIndex(Module &M, Function *F) {
    // Accesses from a function use the amdgcn_lds_kernel_id intrinsic which
    // lowers to a read from a live in register. Emit it once in the entry
    // block to spare deduplicating it later.
    auto [It, Inserted] = tableKernelIndexCache.try_emplace(F);
    if (Inserted) {
      Function *Decl =
          Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_lds_kernel_id, {});

      auto InsertAt = F->getEntryBlock().getFirstNonPHIOrDbgOrAlloca();
      IRBuilder<> Builder(&*InsertAt);

      It->second = Builder.CreateCall(Decl, {});
    }

    return It->second;
  }

  static std::vector<Function *> assignLDSKernelIDToEachKernel(
      Module *M, DenseSet<Function *> const &KernelsThatAllocateTableLDS,
      DenseSet<Function *> const &KernelsThatIndirectlyAllocateDynamicLDS) {
    // Associate kernels in the set with an arbitrary but reproducible order and
    // annotate them with that order in metadata. This metadata is recognised by
    // the backend and lowered to a SGPR which can be read from using
    // amdgcn_lds_kernel_id.
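    // For example (hypothetical kernel and id), the kernel assigned id 2
    // would carry:
    //   define amdgpu_kernel void @k() !llvm.amdgcn.lds.kernel.id !n { ... }
    //   !n = !{i32 2}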

    std::vector<Function *> OrderedKernels;
    if (!KernelsThatAllocateTableLDS.empty() ||
        !KernelsThatIndirectlyAllocateDynamicLDS.empty()) {

      for (Function &Func : M->functions()) {
        if (Func.isDeclaration())
          continue;
        if (!isKernelLDS(&Func))
          continue;

        if (KernelsThatAllocateTableLDS.contains(&Func) ||
            KernelsThatIndirectlyAllocateDynamicLDS.contains(&Func)) {
          assert(Func.hasName()); // else fatal error earlier
          OrderedKernels.push_back(&Func);
        }
      }

      // Put them in an arbitrary but reproducible order
      OrderedKernels = sortByName(std::move(OrderedKernels));

      // Annotate the kernels with their order in this vector
      LLVMContext &Ctx = M->getContext();
      IRBuilder<> Builder(Ctx);

      if (OrderedKernels.size() > UINT32_MAX) {
        // 32 bit keeps it in one SGPR. > 2**32 kernels won't fit on the GPU
        report_fatal_error("Unimplemented LDS lowering for > 2**32 kernels");
      }

      for (size_t i = 0; i < OrderedKernels.size(); i++) {
        Metadata *AttrMDArgs[1] = {
            ConstantAsMetadata::get(Builder.getInt32(i)),
        };
        OrderedKernels[i]->setMetadata("llvm.amdgcn.lds.kernel.id",
                                       MDNode::get(Ctx, AttrMDArgs));
      }
    }
    return OrderedKernels;
  }

  static void partitionVariablesIntoIndirectStrategies(
      Module &M, LDSUsesInfoTy const &LDSUsesInfo,
      VariableFunctionMap &LDSToKernelsThatNeedToAccessItIndirectly,
      DenseSet<GlobalVariable *> &ModuleScopeVariables,
      DenseSet<GlobalVariable *> &TableLookupVariables,
      DenseSet<GlobalVariable *> &KernelAccessVariables,
      DenseSet<GlobalVariable *> &DynamicVariables) {

    GlobalVariable *HybridModuleRoot =
        LoweringKindLoc != LoweringKind::hybrid
            ? nullptr
            : chooseBestVariableForModuleStrategy(
                  M.getDataLayout(), LDSToKernelsThatNeedToAccessItIndirectly);

    DenseSet<Function *> const EmptySet;
    DenseSet<Function *> const &HybridModuleRootKernels =
        HybridModuleRoot
            ? LDSToKernelsThatNeedToAccessItIndirectly[HybridModuleRoot]
            : EmptySet;

    for (auto &K : LDSToKernelsThatNeedToAccessItIndirectly) {
      // Each iteration of this loop assigns exactly one global variable to
      // exactly one of the implementation strategies.

      GlobalVariable *GV = K.first;
      assert(AMDGPU::isLDSVariableToLower(*GV));
      assert(K.second.size() != 0);

      if (AMDGPU::isDynamicLDS(*GV)) {
        DynamicVariables.insert(GV);
        continue;
      }

      switch (LoweringKindLoc) {
      case LoweringKind::module:
        ModuleScopeVariables.insert(GV);
        break;

      case LoweringKind::table:
        TableLookupVariables.insert(GV);
        break;

      case LoweringKind::kernel:
        if (K.second.size() == 1) {
          KernelAccessVariables.insert(GV);
        } else {
          report_fatal_error(
              "cannot lower LDS '" + GV->getName() +
              "' to kernel access as it is reachable from multiple kernels");
        }
        break;

      case LoweringKind::hybrid: {
        if (GV == HybridModuleRoot) {
          assert(K.second.size() != 1);
          ModuleScopeVariables.insert(GV);
        } else if (K.second.size() == 1) {
          KernelAccessVariables.insert(GV);
        } else if (set_is_subset(K.second, HybridModuleRootKernels)) {
          ModuleScopeVariables.insert(GV);
        } else {
          TableLookupVariables.insert(GV);
        }
        break;
      }
      }
    }

    // All LDS variables accessed indirectly have now been partitioned into
    // the distinct lowering strategies.
    assert(ModuleScopeVariables.size() + TableLookupVariables.size() +
               KernelAccessVariables.size() + DynamicVariables.size() ==
           LDSToKernelsThatNeedToAccessItIndirectly.size());
  }

  static GlobalVariable *lowerModuleScopeStructVariables(
      Module &M, DenseSet<GlobalVariable *> const &ModuleScopeVariables,
      DenseSet<Function *> const &KernelsThatAllocateModuleLDS) {
    // Create a struct to hold the ModuleScopeVariables.
    // Replace all uses of those variables from non-kernel functions with the
    // new struct instance. Replace only the uses from kernel functions that
    // will allocate this instance. That is a space optimisation - kernels that
    // use a subset of the module scope struct and do not need to allocate it
    // for indirect calls will only allocate the subset they use (they do so as
    // part of the per-kernel lowering).
    if (ModuleScopeVariables.empty()) {
      return nullptr;
    }

    LLVMContext &Ctx = M.getContext();

    LDSVariableReplacement ModuleScopeReplacement =
        createLDSVariableReplacement(M, "llvm.amdgcn.module.lds",
                                     ModuleScopeVariables);

    appendToCompilerUsed(M, {static_cast<GlobalValue *>(
                                ConstantExpr::getPointerBitCastOrAddrSpaceCast(
                                    cast<Constant>(ModuleScopeReplacement.SGV),
                                    PointerType::getUnqual(Ctx)))});

    // module.lds will be allocated at zero in any kernel that allocates it
    recordLDSAbsoluteAddress(&M, ModuleScopeReplacement.SGV, 0);

    // historic
    removeLocalVarsFromUsedLists(M, ModuleScopeVariables);

    // Replace all uses of module scope variable from non-kernel functions
    replaceLDSVariablesWithStruct(
        M, ModuleScopeVariables, ModuleScopeReplacement, [&](Use &U) {
          Instruction *I = dyn_cast<Instruction>(U.getUser());
          if (!I) {
            return false;
          }
          Function *F = I->getFunction();
          return !isKernelLDS(F);
        });

    // Replace uses of module scope variable from kernel functions that
    // allocate the module scope variable, otherwise leave them unchanged
    // Record on each kernel whether the module scope global is used by it

    for (Function &Func : M.functions()) {
      if (Func.isDeclaration() || !isKernelLDS(&Func))
        continue;

      if (KernelsThatAllocateModuleLDS.contains(&Func)) {
        replaceLDSVariablesWithStruct(
            M, ModuleScopeVariables, ModuleScopeReplacement, [&](Use &U) {
              Instruction *I = dyn_cast<Instruction>(U.getUser());
              if (!I) {
                return false;
              }
              Function *F = I->getFunction();
              return F == &Func;
            });

        markUsedByKernel(&Func, ModuleScopeReplacement.SGV);
      }
    }

    return ModuleScopeReplacement.SGV;
  }

  static DenseMap<Function *, LDSVariableReplacement>
  lowerKernelScopeStructVariables(
      Module &M, LDSUsesInfoTy &LDSUsesInfo,
      DenseSet<GlobalVariable *> const &ModuleScopeVariables,
      DenseSet<Function *> const &KernelsThatAllocateModuleLDS,
      GlobalVariable *MaybeModuleScopeStruct) {

    // Create a struct for each kernel for the non-module-scope variables.

    DenseMap<Function *, LDSVariableReplacement> KernelToReplacement;
    for (Function &Func : M.functions()) {
      if (Func.isDeclaration() || !isKernelLDS(&Func))
        continue;

      DenseSet<GlobalVariable *> KernelUsedVariables;
      // Allocating variables that are used directly in this struct to get
      // alignment aware allocation and predictable frame size.
      for (auto &v : LDSUsesInfo.direct_access[&Func]) {
        if (!AMDGPU::isDynamicLDS(*v)) {
          KernelUsedVariables.insert(v);
        }
      }

      // Allocating variables that are accessed indirectly so that a lookup of
      // this struct instance can find them from nested functions.
      for (auto &v : LDSUsesInfo.indirect_access[&Func]) {
        if (!AMDGPU::isDynamicLDS(*v)) {
          KernelUsedVariables.insert(v);
        }
      }

      // Variables allocated in module lds must all resolve to that struct,
      // not to the per-kernel instance.
      if (KernelsThatAllocateModuleLDS.contains(&Func)) {
        for (GlobalVariable *v : ModuleScopeVariables) {
          KernelUsedVariables.erase(v);
        }
      }

      if (KernelUsedVariables.empty()) {
        // Either used no LDS, or the LDS it used was all in the module struct
        // or dynamically sized
        continue;
      }

      // The association between kernel function and LDS struct is done by
      // symbol name, which only works if the function in question has a
      // name. This is not expected to be a problem in practice as kernels
      // are called by name making anonymous ones (which are named by the
      // backend) difficult to use. This does mean that llvm test cases need
      // to name the kernels.
      if (!Func.hasName()) {
        report_fatal_error("Anonymous kernels cannot use LDS variables");
      }

      std::string VarName =
          (Twine("llvm.amdgcn.kernel.") + Func.getName() + ".lds").str();

      auto Replacement =
          createLDSVariableReplacement(M, VarName, KernelUsedVariables);

      // If any indirect uses, create a direct use to ensure allocation
      // TODO: Simpler to unconditionally mark used but that regresses
      // codegen in test/CodeGen/AMDGPU/noclobber-barrier.ll
      auto Accesses = LDSUsesInfo.indirect_access.find(&Func);
      if ((Accesses != LDSUsesInfo.indirect_access.end()) &&
          !Accesses->second.empty())
        markUsedByKernel(&Func, Replacement.SGV);

      // remove preserves existing codegen
      removeLocalVarsFromUsedLists(M, KernelUsedVariables);
      KernelToReplacement[&Func] = Replacement;

      // Rewrite uses within kernel to the new struct
      replaceLDSVariablesWithStruct(
          M, KernelUsedVariables, Replacement, [&Func](Use &U) {
            Instruction *I = dyn_cast<Instruction>(U.getUser());
            return I && I->getFunction() == &Func;
          });
    }
    return KernelToReplacement;
  }

  static GlobalVariable *
  buildRepresentativeDynamicLDSInstance(Module &M, LDSUsesInfoTy &LDSUsesInfo,
                                        Function *func) {
    // Create a dynamic lds variable with a name associated with the passed
    // function that has the maximum alignment of any dynamic lds variable
    // reachable from this kernel. Dynamic LDS is allocated after the static LDS
    // allocation, possibly after alignment padding. The representative variable
    // created here has the maximum alignment of any other dynamic variable
    // reachable by that kernel. All dynamic LDS variables are allocated at the
    // same address in each kernel in order to provide the documented aliasing
    // semantics. Setting the alignment here allows this IR pass to accurately
    // predict the exact constant at which it will be allocated.

    assert(isKernelLDS(func));

    LLVMContext &Ctx = M.getContext();
    const DataLayout &DL = M.getDataLayout();
    Align MaxDynamicAlignment(1);

    auto UpdateMaxAlignment = [&MaxDynamicAlignment, &DL](GlobalVariable *GV) {
      if (AMDGPU::isDynamicLDS(*GV)) {
        MaxDynamicAlignment =
            std::max(MaxDynamicAlignment, AMDGPU::getAlign(DL, GV));
      }
    };

    for (GlobalVariable *GV : LDSUsesInfo.indirect_access[func]) {
      UpdateMaxAlignment(GV);
    }

    for (GlobalVariable *GV : LDSUsesInfo.direct_access[func]) {
      UpdateMaxAlignment(GV);
    }

    assert(func->hasName()); // Checked by caller
    auto emptyCharArray = ArrayType::get(Type::getInt8Ty(Ctx), 0);
    GlobalVariable *N = new GlobalVariable(
        M, emptyCharArray, false, GlobalValue::ExternalLinkage, nullptr,
        Twine("llvm.amdgcn." + func->getName() + ".dynlds"), nullptr,
        GlobalValue::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS, false);
    N->setAlignment(MaxDynamicAlignment);

    assert(AMDGPU::isDynamicLDS(*N));
    return N;
  }

  /// Strip "amdgpu-no-lds-kernel-id" from any functions where we may have
  /// introduced its use. If AMDGPUAttributor ran prior to the pass, we inferred
  /// the lack of llvm.amdgcn.lds.kernel.id calls.
  void removeNoLdsKernelIdFromReachable(CallGraph &CG, Function *KernelRoot) {
    KernelRoot->removeFnAttr("amdgpu-no-lds-kernel-id");

    SmallVector<Function *> WorkList({CG[KernelRoot]->getFunction()});
    SmallPtrSet<Function *, 8> Visited;
    bool SeenUnknownCall = false;

    while (!WorkList.empty()) {
      Function *F = WorkList.pop_back_val();

      for (auto &CallRecord : *CG[F]) {
        if (!CallRecord.second)
          continue;

        Function *Callee = CallRecord.second->getFunction();
        if (!Callee) {
          if (!SeenUnknownCall) {
            SeenUnknownCall = true;

            // If we see any indirect calls, assume nothing about potential
            // targets.
            // TODO: This could be refined to possible LDS global users.
            for (auto &ExternalCallRecord : *CG.getExternalCallingNode()) {
              Function *PotentialCallee =
                  ExternalCallRecord.second->getFunction();
              assert(PotentialCallee);
              if (!isKernelLDS(PotentialCallee))
                PotentialCallee->removeFnAttr("amdgpu-no-lds-kernel-id");
            }
          }
        } else {
          Callee->removeFnAttr("amdgpu-no-lds-kernel-id");
          if (Visited.insert(Callee).second)
            WorkList.push_back(Callee);
        }
      }
    }
  }

  DenseMap<Function *, GlobalVariable *> lowerDynamicLDSVariables(
      Module &M, LDSUsesInfoTy &LDSUsesInfo,
      DenseSet<Function *> const &KernelsThatIndirectlyAllocateDynamicLDS,
      DenseSet<GlobalVariable *> const &DynamicVariables,
      std::vector<Function *> const &OrderedKernels) {
    DenseMap<Function *, GlobalVariable *> KernelToCreatedDynamicLDS;
    if (!KernelsThatIndirectlyAllocateDynamicLDS.empty()) {
      LLVMContext &Ctx = M.getContext();
      IRBuilder<> Builder(Ctx);
      Type *I32 = Type::getInt32Ty(Ctx);

      std::vector<Constant *> newDynamicLDS;

      // Table is built in the same order as OrderedKernels
      for (auto &func : OrderedKernels) {

        if (KernelsThatIndirectlyAllocateDynamicLDS.contains(func)) {
          assert(isKernelLDS(func));
          if (!func->hasName()) {
            report_fatal_error("Anonymous kernels cannot use LDS variables");
          }

          GlobalVariable *N =
              buildRepresentativeDynamicLDSInstance(M, LDSUsesInfo, func);

          KernelToCreatedDynamicLDS[func] = N;

          markUsedByKernel(func, N);

          auto emptyCharArray = ArrayType::get(Type::getInt8Ty(Ctx), 0);
          auto GEP = ConstantExpr::getGetElementPtr(
              emptyCharArray, N, ConstantInt::get(I32, 0), true);
          newDynamicLDS.push_back(ConstantExpr::getPtrToInt(GEP, I32));
        } else {
          newDynamicLDS.push_back(PoisonValue::get(I32));
        }
      }
      assert(OrderedKernels.size() == newDynamicLDS.size());

      ArrayType *t = ArrayType::get(I32, newDynamicLDS.size());
      Constant *init = ConstantArray::get(t, newDynamicLDS);
      GlobalVariable *table = new GlobalVariable(
          M, t, true, GlobalValue::InternalLinkage, init,
          "llvm.amdgcn.dynlds.offset.table", nullptr,
          GlobalValue::NotThreadLocal, AMDGPUAS::CONSTANT_ADDRESS);

      for (GlobalVariable *GV : DynamicVariables) {
        for (Use &U : make_early_inc_range(GV->uses())) {
          auto *I = dyn_cast<Instruction>(U.getUser());
          if (!I)
            continue;
          if (isKernelLDS(I->getFunction()))
            continue;

          replaceUseWithTableLookup(M, Builder, table, GV, U, nullptr);
        }
      }
    }
    return KernelToCreatedDynamicLDS;
  }
1151 | |
1152 | bool runOnModule(Module &M) { |
1153 | CallGraph CG = CallGraph(M); |
1154 | bool Changed = superAlignLDSGlobals(M); |
1155 | |
1156 | Changed |= eliminateConstantExprUsesOfLDSFromAllInstructions(M); |
1157 | |
1158 | Changed = true; // todo: narrow this down |
1159 | |
1160 | // For each kernel, what variables does it access directly or through |
1161 | // callees |
1162 | LDSUsesInfoTy LDSUsesInfo = getTransitiveUsesOfLDS(CG, M); |
1163 | |
1164 | // For each variable accessed through callees, which kernels access it |
1165 | VariableFunctionMap LDSToKernelsThatNeedToAccessItIndirectly; |
1166 | for (auto &K : LDSUsesInfo.indirect_access) { |
1167 | Function *F = K.first; |
1168 | assert(isKernelLDS(F)); |
1169 | for (GlobalVariable *GV : K.second) { |
1170 | LDSToKernelsThatNeedToAccessItIndirectly[GV].insert(V: F); |
1171 | } |
1172 | } |
1173 | |
1174 | // Partition variables accessed indirectly into the different strategies |
1175 | DenseSet<GlobalVariable *> ModuleScopeVariables; |
1176 | DenseSet<GlobalVariable *> TableLookupVariables; |
1177 | DenseSet<GlobalVariable *> KernelAccessVariables; |
1178 | DenseSet<GlobalVariable *> DynamicVariables; |
1179 | partitionVariablesIntoIndirectStrategies( |
1180 | M, LDSUsesInfo, LDSToKernelsThatNeedToAccessItIndirectly, |
1181 | ModuleScopeVariables, TableLookupVariables, KernelAccessVariables, |
1182 | DynamicVariables); |
1183 | |
1184 | // If the kernel accesses a variable that is going to be stored in the |
1185 | // module instance through a call then that kernel needs to allocate the |
1186 | // module instance |
1187 | const DenseSet<Function *> KernelsThatAllocateModuleLDS = |
1188 | kernelsThatIndirectlyAccessAnyOfPassedVariables(M, LDSUsesInfo, |
1189 | VariableSet: ModuleScopeVariables); |
1190 | const DenseSet<Function *> KernelsThatAllocateTableLDS = |
1191 | kernelsThatIndirectlyAccessAnyOfPassedVariables(M, LDSUsesInfo, |
1192 | VariableSet: TableLookupVariables); |
1193 | |
1194 | const DenseSet<Function *> KernelsThatIndirectlyAllocateDynamicLDS = |
1195 | kernelsThatIndirectlyAccessAnyOfPassedVariables(M, LDSUsesInfo, |
1196 | VariableSet: DynamicVariables); |
1197 | |
1198 | GlobalVariable *MaybeModuleScopeStruct = lowerModuleScopeStructVariables( |
1199 | M, ModuleScopeVariables, KernelsThatAllocateModuleLDS); |
1200 | |
1201 | DenseMap<Function *, LDSVariableReplacement> KernelToReplacement = |
1202 | lowerKernelScopeStructVariables(M, LDSUsesInfo, ModuleScopeVariables, |
1203 | KernelsThatAllocateModuleLDS, |
1204 | MaybeModuleScopeStruct); |
1205 | |
1206 | // Lower zero cost accesses to the kernel instances just created |
1207 | for (auto &GV : KernelAccessVariables) { |
1208 | auto &funcs = LDSToKernelsThatNeedToAccessItIndirectly[GV]; |
1209 | assert(funcs.size() == 1); // Only one kernel can access it |
1210 | LDSVariableReplacement Replacement = |
1211 | KernelToReplacement[*(funcs.begin())]; |
1212 | |
1213 | DenseSet<GlobalVariable *> Vec; |
1214 | Vec.insert(V: GV); |
1215 | |
1216 | replaceLDSVariablesWithStruct(M, LDSVarsToTransformArg: Vec, Replacement, Predicate: [](Use &U) { |
1217 | return isa<Instruction>(Val: U.getUser()); |
1218 | }); |
1219 | } |
1220 | |
1221 | // The ith element of this vector is kernel id i |
1222 | std::vector<Function *> OrderedKernels = |
1223 | assignLDSKernelIDToEachKernel(M: &M, KernelsThatAllocateTableLDS, |
1224 | KernelsThatIndirectlyAllocateDynamicLDS); |
1225 | |
1226 | if (!KernelsThatAllocateTableLDS.empty()) { |
1227 | LLVMContext &Ctx = M.getContext(); |
1228 | IRBuilder<> Builder(Ctx); |
1229 | |
1230 | // The order must be consistent between lookup table and accesses to |
1231 | // lookup table |
1232 | auto TableLookupVariablesOrdered = |
1233 | sortByName(V: std::vector<GlobalVariable *>(TableLookupVariables.begin(), |
1234 | TableLookupVariables.end())); |
1235 | |
1236 | GlobalVariable *LookupTable = buildLookupTable( |
1237 | M, Variables: TableLookupVariablesOrdered, kernels: OrderedKernels, KernelToReplacement); |
1238 | replaceUsesInInstructionsWithTableLookup(M, ModuleScopeVariables: TableLookupVariablesOrdered, |
1239 | LookupTable); |
1240 | |
1241 | // Strip amdgpu-no-lds-kernel-id from all functions reachable from the |
1242 | // kernel. We may have inferred this wasn't used prior to the pass. |
1243 | // |
1244 | // TODO: We could filter out subgraphs that do not access LDS globals. |
1245 | for (Function *F : KernelsThatAllocateTableLDS) |
1246 | removeNoLdsKernelIdFromReachable(CG, KernelRoot: F); |
1247 | } |
1248 | |
1249 | DenseMap<Function *, GlobalVariable *> KernelToCreatedDynamicLDS = |
1250 | lowerDynamicLDSVariables(M, LDSUsesInfo, |
1251 | KernelsThatIndirectlyAllocateDynamicLDS, |
1252 | DynamicVariables, OrderedKernels); |
1253 | |
1254 | // All kernel frames have been allocated. Calculate and record the |
1255 | // addresses. |
1256 | { |
1257 | const DataLayout &DL = M.getDataLayout(); |
1258 | |
1259 | for (Function &Func : M.functions()) { |
1260 | if (Func.isDeclaration() || !isKernelLDS(F: &Func)) |
1261 | continue; |
1262 | |
1263 | // All three of these are optional. The first variable is allocated at |
1264 | // zero. They are allocated by AMDGPUMachineFunction as one block. |
1265 | // Layout: |
1266 | //{ |
1267 | // module.lds |
1268 | // alignment padding |
1269 | // kernel instance |
1270 | // alignment padding |
1271 | // dynamic lds variables |
1272 | //} |
1273 | |
        const bool AllocateModuleScopeStruct =
            MaybeModuleScopeStruct &&
            KernelsThatAllocateModuleLDS.contains(&Func);
1277 | |
        auto Replacement = KernelToReplacement.find(&Func);
1279 | const bool AllocateKernelScopeStruct = |
1280 | Replacement != KernelToReplacement.end(); |
1281 | |
        const bool AllocateDynamicVariable =
            KernelToCreatedDynamicLDS.contains(&Func);
1284 | |
1285 | uint32_t Offset = 0; |
1286 | |
1287 | if (AllocateModuleScopeStruct) { |
1288 | // Allocated at zero, recorded once on construction, not once per |
1289 | // kernel |
          Offset += DL.getTypeAllocSize(MaybeModuleScopeStruct->getValueType());
1291 | } |
1292 | |
1293 | if (AllocateKernelScopeStruct) { |
1294 | GlobalVariable *KernelStruct = Replacement->second.SGV; |
          Offset = alignTo(Offset, AMDGPU::getAlign(DL, KernelStruct));
          recordLDSAbsoluteAddress(&M, KernelStruct, Offset);
          Offset += DL.getTypeAllocSize(KernelStruct->getValueType());
1298 | } |
1299 | |
1300 | // If there is dynamic allocation, the alignment needed is included in |
1301 | // the static frame size. There may be no reference to the dynamic |
1302 | // variable in the kernel itself, so without including it here, that |
1303 | // alignment padding could be missed. |
1304 | if (AllocateDynamicVariable) { |
1305 | GlobalVariable *DynamicVariable = KernelToCreatedDynamicLDS[&Func]; |
          Offset = alignTo(Offset, AMDGPU::getAlign(DL, DynamicVariable));
          recordLDSAbsoluteAddress(&M, DynamicVariable, Offset);
1308 | } |
1309 | |
1310 | if (Offset != 0) { |
1311 | (void)TM; // TODO: Account for target maximum LDS |
1312 | std::string Buffer; |
1313 | raw_string_ostream SS{Buffer}; |
          SS << format("%u", Offset);
1315 | |
          // Instead of explicitly marking kernels that access dynamic
          // variables using special case metadata, annotate with
          // min-lds == max-lds, i.e. that there is no more space available
          // for allocating more static LDS variables. That is the right
          // condition to prevent allocating more variables which would
          // collide with the addresses assigned to dynamic variables.
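          // Hypothetical example of the resulting attribute: a 400 byte
          // static frame yields "amdgpu-lds-size"="400", or
          // "amdgpu-lds-size"="400,400" when dynamic LDS follows it.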
1322 | if (AllocateDynamicVariable) |
            SS << format(",%u", Offset);
1324 | |
          Func.addFnAttr("amdgpu-lds-size", Buffer);
1326 | } |
1327 | } |
1328 | } |
1329 | |
    for (auto &GV : make_early_inc_range(M.globals()))
      if (AMDGPU::isLDSVariableToLower(GV)) {
        // Probably want to remove these from the used lists as well.
1333 | GV.removeDeadConstantUsers(); |
1334 | if (GV.use_empty()) |
1335 | GV.eraseFromParent(); |
1336 | } |
1337 | |
1338 | return Changed; |
1339 | } |
1340 | |
1341 | private: |
1342 | // Increase the alignment of LDS globals if necessary to maximise the chance |
1343 | // that we can use aligned LDS instructions to access them. |
1344 | static bool superAlignLDSGlobals(Module &M) { |
1345 | const DataLayout &DL = M.getDataLayout(); |
1346 | bool Changed = false; |
1347 | if (!SuperAlignLDSGlobals) { |
1348 | return Changed; |
1349 | } |
1350 | |
1351 | for (auto &GV : M.globals()) { |
1352 | if (GV.getType()->getPointerAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) { |
1353 | // Only changing alignment of LDS variables |
1354 | continue; |
1355 | } |
1356 | if (!GV.hasInitializer()) { |
1357 | // cuda/hip extern __shared__ variable, leave alignment alone |
1358 | continue; |
1359 | } |
1360 | |
      Align Alignment = AMDGPU::getAlign(DL, &GV);
      TypeSize GVSize = DL.getTypeAllocSize(GV.getValueType());
1363 | |
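      // Illustrative effect of the ladder below, with hypothetical sizes: a
      // 6 byte variable is raised to Align(8) so a single b64 access could
      // cover it; a 12 byte one is raised to Align(16) for b96/b128.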
      if (GVSize > 8) {
        // We might want to use a b96 or b128 load/store
        Alignment = std::max(Alignment, Align(16));
      } else if (GVSize > 4) {
        // We might want to use a b64 load/store
        Alignment = std::max(Alignment, Align(8));
      } else if (GVSize > 2) {
        // We might want to use a b32 load/store
        Alignment = std::max(Alignment, Align(4));
      } else if (GVSize > 1) {
        // We might want to use a b16 load/store
        Alignment = std::max(Alignment, Align(2));
      }
1377 | |
      if (Alignment != AMDGPU::getAlign(DL, &GV)) {
1379 | Changed = true; |
1380 | GV.setAlignment(Alignment); |
1381 | } |
1382 | } |
1383 | return Changed; |
1384 | } |
1385 | |
1386 | static LDSVariableReplacement createLDSVariableReplacement( |
1387 | Module &M, std::string VarName, |
1388 | DenseSet<GlobalVariable *> const &LDSVarsToTransform) { |
    // Create a struct instance containing LDSVarsToTransform and a map from
    // those variables to the corresponding ConstantExpr GEPs into the struct.
    // Padding variables may be introduced to meet alignment requirements. No
    // aliasing metadata is useful for these as they have no uses. Erased
    // before return.
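    // A sketch of the output for two hypothetical members, an i64 requiring
    // 8 byte alignment and an i32 requiring 4:
    //   %VarName.t = type { i64, i32 }
    //   @VarName = internal addrspace(3) global %VarName.t poison, align 8
    // with the map sending each original variable to a constant GEP into
    // @VarName.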
1393 | |
1394 | LLVMContext &Ctx = M.getContext(); |
1395 | const DataLayout &DL = M.getDataLayout(); |
1396 | assert(!LDSVarsToTransform.empty()); |
1397 | |
1398 | SmallVector<OptimizedStructLayoutField, 8> LayoutFields; |
    LayoutFields.reserve(LDSVarsToTransform.size());
1400 | { |
      // The order of fields in this struct depends on the order of
      // variables in the argument which varies when changing how they
      // are identified, leading to spurious test breakage.
      auto Sorted = sortByName(std::vector<GlobalVariable *>(
          LDSVarsToTransform.begin(), LDSVarsToTransform.end()));
1406 | |
1407 | for (GlobalVariable *GV : Sorted) { |
        OptimizedStructLayoutField F(GV,
                                     DL.getTypeAllocSize(GV->getValueType()),
                                     AMDGPU::getAlign(DL, GV));
        LayoutFields.emplace_back(F);
1412 | } |
1413 | } |
1414 | |
    performOptimizedStructLayout(LayoutFields);
1416 | |
1417 | std::vector<GlobalVariable *> LocalVars; |
1418 | BitVector IsPaddingField; |
    LocalVars.reserve(LDSVarsToTransform.size()); // will be at least this large
    IsPaddingField.reserve(LDSVarsToTransform.size());
1421 | { |
1422 | uint64_t CurrentOffset = 0; |
1423 | for (size_t I = 0; I < LayoutFields.size(); I++) { |
1424 | GlobalVariable *FGV = static_cast<GlobalVariable *>( |
1425 | const_cast<void *>(LayoutFields[I].Id)); |
1426 | Align DataAlign = LayoutFields[I].Alignment; |
1427 | |
1428 | uint64_t DataAlignV = DataAlign.value(); |
1429 | if (uint64_t Rem = CurrentOffset % DataAlignV) { |
1430 | uint64_t Padding = DataAlignV - Rem; |
1431 | |
1432 | // Append an array of padding bytes to meet alignment requested |
1433 | // Note (o + (a - (o % a)) ) % a == 0 |
1434 | // (offset + Padding ) % align == 0 |
1435 | |
          Type *ATy = ArrayType::get(Type::getInt8Ty(Ctx), Padding);
          LocalVars.push_back(new GlobalVariable(
              M, ATy, false, GlobalValue::InternalLinkage,
              PoisonValue::get(ATy), "", nullptr, GlobalValue::NotThreadLocal,
              AMDGPUAS::LOCAL_ADDRESS, false));
          IsPaddingField.push_back(true);
1442 | CurrentOffset += Padding; |
1443 | } |
1444 | |
        LocalVars.push_back(FGV);
        IsPaddingField.push_back(false);
1447 | CurrentOffset += LayoutFields[I].Size; |
1448 | } |
1449 | } |
1450 | |
1451 | std::vector<Type *> LocalVarTypes; |
    LocalVarTypes.reserve(LocalVars.size());
    std::transform(
        LocalVars.cbegin(), LocalVars.cend(), std::back_inserter(LocalVarTypes),
        [](const GlobalVariable *V) -> Type * { return V->getValueType(); });
1456 | |
    StructType *LDSTy = StructType::create(Ctx, LocalVarTypes, VarName + ".t");
1458 | |
    Align StructAlign = AMDGPU::getAlign(DL, LocalVars[0]);
1460 | |
1461 | GlobalVariable *SGV = new GlobalVariable( |
        M, LDSTy, false, GlobalValue::InternalLinkage, PoisonValue::get(LDSTy),
1463 | VarName, nullptr, GlobalValue::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS, |
1464 | false); |
1465 | SGV->setAlignment(StructAlign); |
1466 | |
1467 | DenseMap<GlobalVariable *, Constant *> Map; |
    Type *I32 = Type::getInt32Ty(Ctx);
1469 | for (size_t I = 0; I < LocalVars.size(); I++) { |
1470 | GlobalVariable *GV = LocalVars[I]; |
      Constant *GEPIdx[] = {ConstantInt::get(I32, 0), ConstantInt::get(I32, I)};
      Constant *GEP = ConstantExpr::getGetElementPtr(LDSTy, SGV, GEPIdx, true);
1473 | if (IsPaddingField[I]) { |
1474 | assert(GV->use_empty()); |
1475 | GV->eraseFromParent(); |
1476 | } else { |
1477 | Map[GV] = GEP; |
1478 | } |
1479 | } |
1480 | assert(Map.size() == LDSVarsToTransform.size()); |
    return {SGV, std::move(Map)};
1482 | } |
1483 | |
1484 | template <typename PredicateTy> |
1485 | static void replaceLDSVariablesWithStruct( |
1486 | Module &M, DenseSet<GlobalVariable *> const &LDSVarsToTransformArg, |
1487 | const LDSVariableReplacement &Replacement, PredicateTy Predicate) { |
1488 | LLVMContext &Ctx = M.getContext(); |
1489 | const DataLayout &DL = M.getDataLayout(); |
1490 | |
    // A hack... we need to insert the aliasing info in a predictable order
    // for lit tests. Would be better to have them in a stable order already,
    // ideally the same order they get allocated, which might mean an ordered
    // set container.
    auto LDSVarsToTransform = sortByName(std::vector<GlobalVariable *>(
        LDSVarsToTransformArg.begin(), LDSVarsToTransformArg.end()));
1496 | |
    // Create alias.scope and their lists. Each field in the new structure
    // does not alias any of the other fields.
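    // Sketch for two hypothetical fields A and B: an access to A is tagged
    // !alias.scope with A's scope and !noalias with B's, so alias analysis
    // can conclude that accesses to A and B do not overlap.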
1499 | SmallVector<MDNode *> AliasScopes; |
1500 | SmallVector<Metadata *> NoAliasList; |
1501 | const size_t NumberVars = LDSVarsToTransform.size(); |
1502 | if (NumberVars > 1) { |
1503 | MDBuilder MDB(Ctx); |
      AliasScopes.reserve(NumberVars);
      MDNode *Domain = MDB.createAnonymousAliasScopeDomain();
      for (size_t I = 0; I < NumberVars; I++) {
        MDNode *Scope = MDB.createAnonymousAliasScope(Domain);
        AliasScopes.push_back(Scope);
      }
      NoAliasList.append(&AliasScopes[1], AliasScopes.end());
1511 | } |
1512 | |
1513 | // Replace uses of ith variable with a constantexpr to the corresponding |
1514 | // field of the instance that will be allocated by AMDGPUMachineFunction |
1515 | for (size_t I = 0; I < NumberVars; I++) { |
1516 | GlobalVariable *GV = LDSVarsToTransform[I]; |
      Constant *GEP = Replacement.LDSVarsToConstantGEP.at(GV);

      GV->replaceUsesWithIf(GEP, Predicate);

      APInt APOff(DL.getIndexTypeSizeInBits(GEP->getType()), 0);
      GEP->stripAndAccumulateInBoundsConstantOffsets(DL, APOff);
      uint64_t Offset = APOff.getZExtValue();

      Align A =
          commonAlignment(Replacement.SGV->getAlign().valueOrOne(), Offset);
1527 | |
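      // Restore the previous iteration's scope so that NoAliasList again
      // holds every scope except the current variable's own.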
1528 | if (I) |
1529 | NoAliasList[I - 1] = AliasScopes[I - 1]; |
      MDNode *NoAlias =
          NoAliasList.empty() ? nullptr : MDNode::get(Ctx, NoAliasList);
      MDNode *AliasScope =
          AliasScopes.empty() ? nullptr : MDNode::get(Ctx, {AliasScopes[I]});

      refineUsesAlignmentAndAA(GEP, A, DL, AliasScope, NoAlias);
1536 | } |
1537 | } |
1538 | |
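  // Propagate the alignment known for Ptr, and the alias scope metadata,
  // through its users, recursing through GEPs and casts up to MaxDepth.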
1539 | static void refineUsesAlignmentAndAA(Value *Ptr, Align A, |
1540 | const DataLayout &DL, MDNode *AliasScope, |
1541 | MDNode *NoAlias, unsigned MaxDepth = 5) { |
1542 | if (!MaxDepth || (A == 1 && !AliasScope)) |
1543 | return; |
1544 | |
1545 | for (User *U : Ptr->users()) { |
      if (auto *I = dyn_cast<Instruction>(U)) {
1547 | if (AliasScope && I->mayReadOrWriteMemory()) { |
          MDNode *AS = I->getMetadata(LLVMContext::MD_alias_scope);
          AS = (AS ? MDNode::getMostGenericAliasScope(AS, AliasScope)
                   : AliasScope);
          I->setMetadata(LLVMContext::MD_alias_scope, AS);

          MDNode *NA = I->getMetadata(LLVMContext::MD_noalias);
          NA = (NA ? MDNode::intersect(NA, NoAlias) : NoAlias);
          I->setMetadata(LLVMContext::MD_noalias, NA);
1556 | } |
1557 | } |
1558 | |
      if (auto *LI = dyn_cast<LoadInst>(U)) {
        LI->setAlignment(std::max(A, LI->getAlign()));
1561 | continue; |
1562 | } |
      if (auto *SI = dyn_cast<StoreInst>(U)) {
        if (SI->getPointerOperand() == Ptr)
          SI->setAlignment(std::max(A, SI->getAlign()));
1566 | continue; |
1567 | } |
      if (auto *AI = dyn_cast<AtomicRMWInst>(U)) {
        // None of the atomicrmw operations currently work on pointers, but
        // check anyway in case that changes or a ConstantExpr is processed.
        if (AI->getPointerOperand() == Ptr)
          AI->setAlignment(std::max(A, AI->getAlign()));
1573 | continue; |
1574 | } |
      if (auto *AI = dyn_cast<AtomicCmpXchgInst>(U)) {
        if (AI->getPointerOperand() == Ptr)
          AI->setAlignment(std::max(A, AI->getAlign()));
1578 | continue; |
1579 | } |
      if (auto *GEP = dyn_cast<GetElementPtrInst>(U)) {
        unsigned BitWidth = DL.getIndexTypeSizeInBits(GEP->getType());
        APInt Off(BitWidth, 0);
        if (GEP->getPointerOperand() == Ptr) {
          Align GA;
          if (GEP->accumulateConstantOffset(DL, Off))
            GA = commonAlignment(A, Off.getLimitedValue());
          refineUsesAlignmentAndAA(GEP, GA, DL, AliasScope, NoAlias,
                                   MaxDepth - 1);
1589 | } |
1590 | continue; |
1591 | } |
      if (auto *I = dyn_cast<Instruction>(U)) {
        if (I->getOpcode() == Instruction::BitCast ||
            I->getOpcode() == Instruction::AddrSpaceCast)
          refineUsesAlignmentAndAA(I, A, DL, AliasScope, NoAlias, MaxDepth - 1);
1596 | } |
1597 | } |
1598 | } |
1599 | }; |
1600 | |
1601 | class AMDGPULowerModuleLDSLegacy : public ModulePass { |
1602 | public: |
1603 | const AMDGPUTargetMachine *TM; |
1604 | static char ID; |
1605 | |
1606 | AMDGPULowerModuleLDSLegacy(const AMDGPUTargetMachine *TM_ = nullptr) |
1607 | : ModulePass(ID), TM(TM_) { |
1608 | initializeAMDGPULowerModuleLDSLegacyPass(*PassRegistry::getPassRegistry()); |
1609 | } |
1610 | |
1611 | void getAnalysisUsage(AnalysisUsage &AU) const override { |
1612 | if (!TM) |
1613 | AU.addRequired<TargetPassConfig>(); |
1614 | } |
1615 | |
1616 | bool runOnModule(Module &M) override { |
1617 | if (!TM) { |
1618 | auto &TPC = getAnalysis<TargetPassConfig>(); |
1619 | TM = &TPC.getTM<AMDGPUTargetMachine>(); |
1620 | } |
1621 | |
1622 | return AMDGPULowerModuleLDS(*TM).runOnModule(M); |
1623 | } |
1624 | }; |
1625 | |
1626 | } // namespace |
1627 | char AMDGPULowerModuleLDSLegacy::ID = 0; |
1628 | |
1629 | char &llvm::AMDGPULowerModuleLDSLegacyPassID = AMDGPULowerModuleLDSLegacy::ID; |
1630 | |
INITIALIZE_PASS_BEGIN(AMDGPULowerModuleLDSLegacy, DEBUG_TYPE,
                      "Lower uses of LDS variables from non-kernel functions",
                      false, false)
1634 | INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) |
INITIALIZE_PASS_END(AMDGPULowerModuleLDSLegacy, DEBUG_TYPE,
                    "Lower uses of LDS variables from non-kernel functions",
                    false, false)
1638 | |
1639 | ModulePass * |
1640 | llvm::createAMDGPULowerModuleLDSLegacyPass(const AMDGPUTargetMachine *TM) { |
1641 | return new AMDGPULowerModuleLDSLegacy(TM); |
1642 | } |
1643 | |
1644 | PreservedAnalyses AMDGPULowerModuleLDSPass::run(Module &M, |
1645 | ModuleAnalysisManager &) { |
1646 | return AMDGPULowerModuleLDS(TM).runOnModule(M) ? PreservedAnalyses::none() |
1647 | : PreservedAnalyses::all(); |
1648 | } |
1649 | |